ngs_server 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
@@ -0,0 +1,495 @@
|
|
1
|
+
/*
|
2
|
+
* vcf_file.cpp
|
3
|
+
*
|
4
|
+
* Created on: Aug 19, 2009
|
5
|
+
* Author: Adam Auton
|
6
|
+
* ($Revision: 230 $)
|
7
|
+
*/
|
8
|
+
|
9
|
+
#include "vcf_file.h"
|
10
|
+
|
11
|
+
// Construct a VCF reader.
//
// Stores the file name and compression mode, resets every has_* state flag,
// opens the input via open() and then calls scan_file() to locate the entries,
// passing through the chromosome to keep (chr), the chromosome to drop
// (exclude_chr) and the force_write_index flag.
vcf_file::vcf_file(const string &filename,
                   bool compressed,
                   const string &chr,
                   const string &exclude_chr,
                   bool force_write_index)
	: filename(filename),
	  compressed(compressed),
	  has_body(false),
	  has_file_format(false),
	  has_genotypes(false),
	  has_header(false),
	  has_meta(false)
{
	open();
	scan_file(chr, exclude_chr, force_write_index);
}
|
23
|
+
|
24
|
+
// Destructor: hands off to close(), which shuts the input stream and, for
// gzipped input, frees the read buffer.
vcf_file::~vcf_file()
{
	this->close();
}
|
28
|
+
|
29
|
+
// Parse VCF meta information
|
30
|
+
void vcf_file::parse_meta(const string &line)
|
31
|
+
{
|
32
|
+
has_meta = true;
|
33
|
+
meta.push_back(line);
|
34
|
+
size_t found=line.find("##fileformat=");
|
35
|
+
if (found!=string::npos)
|
36
|
+
{
|
37
|
+
has_file_format = true;
|
38
|
+
found = line.find_first_of("=");
|
39
|
+
string version = line.substr(found+1);
|
40
|
+
if ((version != "VCFv4.0") && (version != "VCFv4.1"))
|
41
|
+
error("VCF version must be v4.0 or v4.1:\nYou are using version " + version);
|
42
|
+
}
|
43
|
+
|
44
|
+
found=line.find("##INFO=");
|
45
|
+
if (found!=string::npos)
|
46
|
+
{ // Found an INFO descriptor
|
47
|
+
vcf_entry::add_INFO_descriptor(line);
|
48
|
+
}
|
49
|
+
|
50
|
+
found=line.find("##FILTER=");
|
51
|
+
if (found!=string::npos)
|
52
|
+
{ // Found a FILTER descriptor
|
53
|
+
vcf_entry::add_FILTER_descriptor(line);
|
54
|
+
}
|
55
|
+
|
56
|
+
found=line.find("##FORMAT=");
|
57
|
+
if (found!=string::npos)
|
58
|
+
{ // Found a genotype filter descriptor
|
59
|
+
vcf_entry::add_FORMAT_descriptor(line);
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
// Parse VCF header, and extract individuals etc.
|
64
|
+
void vcf_file::parse_header(const string &line)
|
65
|
+
{
|
66
|
+
// #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... )
|
67
|
+
if (has_header == true)
|
68
|
+
warning("Multiple Header lines.");
|
69
|
+
|
70
|
+
has_header = true;
|
71
|
+
istringstream header(line);
|
72
|
+
int count = 0;
|
73
|
+
string tmp_str;
|
74
|
+
unsigned int N_header_indv = 0;
|
75
|
+
has_genotypes = false;
|
76
|
+
while (!header.eof())
|
77
|
+
{
|
78
|
+
header >> tmp_str;
|
79
|
+
switch (count)
|
80
|
+
{
|
81
|
+
case 0: if (tmp_str != "#CHROM") warning("First Header entry should be #CHROM: " + tmp_str); break;
|
82
|
+
case 1: if (tmp_str != "POS") warning("Second Header entry should be POS: " + tmp_str); break;
|
83
|
+
case 2: if (tmp_str != "ID") warning("Third Header entry should be ID: " + tmp_str); break;
|
84
|
+
case 3: if (tmp_str != "REF") warning("Fourth Header entry should be REF: " + tmp_str); break;
|
85
|
+
case 4: if (tmp_str != "ALT") warning("Fifth Header entry should be ALT: " + tmp_str); break;
|
86
|
+
case 5: if (tmp_str != "QUAL") warning("Sixth Header entry should be QUAL: " + tmp_str); break;
|
87
|
+
case 6: if (tmp_str != "FILTER") warning("Seventh Header entry should be FILTER: " + tmp_str); break;
|
88
|
+
case 7: if (tmp_str != "INFO") warning("Eighth Header entry should be INFO: " + tmp_str); break;
|
89
|
+
case 8:
|
90
|
+
if (tmp_str != "FORMAT")
|
91
|
+
warning("Ninth Header entry should be FORMAT: " + tmp_str);
|
92
|
+
else
|
93
|
+
has_genotypes = true;
|
94
|
+
break;
|
95
|
+
default:
|
96
|
+
{
|
97
|
+
if (count <= 8)
|
98
|
+
error("Incorrectly formatted header.");
|
99
|
+
indv.push_back(tmp_str);
|
100
|
+
N_header_indv++;
|
101
|
+
}
|
102
|
+
}
|
103
|
+
count++;
|
104
|
+
}
|
105
|
+
N_indv = N_header_indv;
|
106
|
+
|
107
|
+
if ((has_genotypes == true ) && (N_indv == 0))
|
108
|
+
warning("FORMAT field without genotypes?");
|
109
|
+
}
|
110
|
+
|
111
|
+
|
112
|
+
// Read VCF file
|
113
|
+
void vcf_file::scan_file(const string &chr, const string &exclude_chr, bool force_write_index)
|
114
|
+
{
|
115
|
+
bool filter_by_chr = (chr != "");
|
116
|
+
bool exclude_by_chr = (exclude_chr != "");
|
117
|
+
string index_filename = filename + ".vcfidx";
|
118
|
+
bool could_read_index_file = false;
|
119
|
+
if (force_write_index == false)
|
120
|
+
could_read_index_file = read_index_file(index_filename);
|
121
|
+
string CHROM, last_CHROM="";
|
122
|
+
int POS, last_POS = -1;
|
123
|
+
if (could_read_index_file == false)
|
124
|
+
{
|
125
|
+
printLOG("Building new index file.\n");
|
126
|
+
string line, CHROM, last_CHROM = "";
|
127
|
+
streampos filepos;
|
128
|
+
char c;
|
129
|
+
N_entries=0;
|
130
|
+
N_indv = 0;
|
131
|
+
|
132
|
+
while (!feof())
|
133
|
+
{
|
134
|
+
filepos = get_filepos();
|
135
|
+
c = peek();
|
136
|
+
|
137
|
+
if ((c == '\n') || (c == '\r'))
|
138
|
+
{
|
139
|
+
read_line(line);
|
140
|
+
continue;
|
141
|
+
}
|
142
|
+
else if (c == EOF)
|
143
|
+
break;
|
144
|
+
|
145
|
+
if (c == '#')
|
146
|
+
{
|
147
|
+
read_line(line);
|
148
|
+
if (line[1] == '#')
|
149
|
+
{ // Meta information
|
150
|
+
parse_meta(line);
|
151
|
+
}
|
152
|
+
else
|
153
|
+
{ // Must be header information: #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... )
|
154
|
+
parse_header(line);
|
155
|
+
}
|
156
|
+
}
|
157
|
+
else
|
158
|
+
{ // Must be a data line
|
159
|
+
read_CHROM_and_POS_and_skip_remainder_of_line(CHROM, POS);
|
160
|
+
if (last_CHROM != CHROM)
|
161
|
+
{
|
162
|
+
printLOG("\tScanning Chromosome: " + CHROM + "\n");
|
163
|
+
last_CHROM = CHROM;
|
164
|
+
}
|
165
|
+
if (POS == last_POS)
|
166
|
+
{
|
167
|
+
one_off_warning("\tWarning - file contains entries with the same position. This is not supported by vcftools, and may cause unexpected behaviour.\n");
|
168
|
+
}
|
169
|
+
last_POS = POS;
|
170
|
+
entry_file_locations.push_back(filepos);
|
171
|
+
N_entries++;
|
172
|
+
}
|
173
|
+
}
|
174
|
+
|
175
|
+
write_index_file(index_filename);
|
176
|
+
}
|
177
|
+
|
178
|
+
printLOG("File contains " + int2str(N_entries) + " entries and " + int2str(N_indv) + " individuals.\n");
|
179
|
+
vector<string> meta_lines = meta; meta.resize(0);
|
180
|
+
for (unsigned int ui=0; ui<meta_lines.size(); ui++)
|
181
|
+
parse_meta(meta_lines[ui]);
|
182
|
+
has_genotypes = (N_indv > 0);
|
183
|
+
|
184
|
+
bool already_found_required_chr = false;
|
185
|
+
bool already_filtered_required_chr = false;
|
186
|
+
if ((exclude_by_chr == true) || (filter_by_chr == true))
|
187
|
+
{
|
188
|
+
printLOG("Filtering by chromosome.\n");
|
189
|
+
for (unsigned int ui=0; ui<N_entries; ui++)
|
190
|
+
{
|
191
|
+
if (already_found_required_chr == true)
|
192
|
+
{
|
193
|
+
printLOG("Skipping Remainder.\n");
|
194
|
+
entry_file_locations.erase(entry_file_locations.begin()+ui, entry_file_locations.end());
|
195
|
+
break;
|
196
|
+
}
|
197
|
+
if (already_filtered_required_chr == true)
|
198
|
+
{
|
199
|
+
printLOG("Skipping Remainder.\n");
|
200
|
+
break;
|
201
|
+
}
|
202
|
+
|
203
|
+
set_filepos(entry_file_locations[ui]);
|
204
|
+
read_CHROM_only(CHROM);
|
205
|
+
|
206
|
+
if (last_CHROM != CHROM)
|
207
|
+
{
|
208
|
+
printLOG("\tChromosome: " + CHROM + "\n");
|
209
|
+
if ((filter_by_chr == true) && (last_CHROM == chr))
|
210
|
+
already_found_required_chr = true;
|
211
|
+
|
212
|
+
if ((exclude_by_chr == true) && (last_CHROM == exclude_chr))
|
213
|
+
already_filtered_required_chr = true;
|
214
|
+
|
215
|
+
last_CHROM = CHROM;
|
216
|
+
}
|
217
|
+
if ((exclude_by_chr == true) && (CHROM == exclude_chr))
|
218
|
+
{
|
219
|
+
entry_file_locations[ui] = -1;
|
220
|
+
continue;
|
221
|
+
}
|
222
|
+
if ((filter_by_chr == true) && (CHROM != chr))
|
223
|
+
{
|
224
|
+
entry_file_locations[ui] = -1;
|
225
|
+
continue;
|
226
|
+
}
|
227
|
+
}
|
228
|
+
sort(entry_file_locations.begin(), entry_file_locations.end());
|
229
|
+
while((entry_file_locations.size() > 0) && (entry_file_locations[0] < 0))
|
230
|
+
entry_file_locations.pop_front();
|
231
|
+
|
232
|
+
N_entries = entry_file_locations.size();
|
233
|
+
printLOG("Keeping " + int2str(N_entries) + " entries on specified chromosomes.\n");
|
234
|
+
}
|
235
|
+
|
236
|
+
include_indv.clear();
|
237
|
+
include_indv.resize(N_indv, true);
|
238
|
+
include_entry.clear();
|
239
|
+
include_entry.resize(N_entries, true);
|
240
|
+
include_genotype.clear();
|
241
|
+
include_genotype.resize(N_entries, vector<bool>(N_indv, true));
|
242
|
+
}
|
243
|
+
|
244
|
+
void vcf_file::print(const string &output_file_prefix, const set<string> &INFO_to_keep, bool keep_all_INFO)
|
245
|
+
{
|
246
|
+
printLOG("Outputting VCF file... ");
|
247
|
+
unsigned int ui;
|
248
|
+
|
249
|
+
string output_file = output_file_prefix + ".recode.vcf";
|
250
|
+
ofstream out(output_file.c_str());
|
251
|
+
if (!out.is_open())
|
252
|
+
error("Could not open VCF Output File: " + output_file, 3);
|
253
|
+
|
254
|
+
for (ui=0; ui<meta.size(); ui++)
|
255
|
+
out << meta[ui] << endl;
|
256
|
+
|
257
|
+
out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
|
258
|
+
if (N_indv > 0)
|
259
|
+
out << "\tFORMAT";
|
260
|
+
for (ui=0; ui<N_indv; ui++)
|
261
|
+
if (include_indv[ui])
|
262
|
+
out << "\t" << indv[ui];
|
263
|
+
out << endl;
|
264
|
+
|
265
|
+
string vcf_line;
|
266
|
+
for (unsigned int s=0; s<N_entries; s++)
|
267
|
+
if (include_entry[s] == true)
|
268
|
+
{
|
269
|
+
get_vcf_entry(s, vcf_line);
|
270
|
+
vcf_entry e(N_indv, vcf_line);
|
271
|
+
e.parse_basic_entry(true, true, true);
|
272
|
+
e.parse_full_entry(true);
|
273
|
+
e.parse_genotype_entries(true,true,true,true);
|
274
|
+
e.print(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype[s]);
|
275
|
+
}
|
276
|
+
|
277
|
+
out.close();
|
278
|
+
printLOG("Done\n");
|
279
|
+
}
|
280
|
+
|
281
|
+
// Return the number of individuals that have not been filtered out
|
282
|
+
int vcf_file::N_kept_individuals() const
|
283
|
+
{
|
284
|
+
int N_kept = 0;
|
285
|
+
for (unsigned int ui=0; ui<include_indv.size(); ui++)
|
286
|
+
if (include_indv[ui] == true)
|
287
|
+
N_kept++;
|
288
|
+
return N_kept;
|
289
|
+
}
|
290
|
+
|
291
|
+
// Return the number of sites that have not been filtered out
|
292
|
+
int vcf_file::N_kept_sites() const
|
293
|
+
{
|
294
|
+
int N_kept = 0;
|
295
|
+
for (unsigned int ui=0; ui<include_entry.size(); ui++)
|
296
|
+
if (include_entry[ui] == true)
|
297
|
+
N_kept++;
|
298
|
+
return N_kept;
|
299
|
+
}
|
300
|
+
|
301
|
+
// Count the number of genotypes that have not been filtered out
|
302
|
+
unsigned int vcf_file::N_genotypes_included(unsigned int entry_num) const
|
303
|
+
{
|
304
|
+
unsigned int count = 0, ui;
|
305
|
+
for (ui=0; ui<N_indv; ui++)
|
306
|
+
if ((include_indv[ui] == true) && (include_genotype[entry_num][ui] == true))
|
307
|
+
{
|
308
|
+
count++;
|
309
|
+
}
|
310
|
+
|
311
|
+
return count;
|
312
|
+
}
|
313
|
+
|
314
|
+
void vcf_file::open()
|
315
|
+
{
|
316
|
+
if (!compressed)
|
317
|
+
{
|
318
|
+
if (filename.substr(filename.size()-3) == ".gz")
|
319
|
+
{
|
320
|
+
warning("Filename ends in '.gz'. Shouldn't you be using --gzvcf?\n");
|
321
|
+
}
|
322
|
+
vcf_in.open(filename.c_str(), ios::in);
|
323
|
+
if (!vcf_in.is_open())
|
324
|
+
error("Could not open VCF file: " + filename, 0);
|
325
|
+
}
|
326
|
+
else
|
327
|
+
{
|
328
|
+
gzMAX_LINE_LEN = 1024*1024;
|
329
|
+
gz_readbuffer = new char[gzMAX_LINE_LEN];
|
330
|
+
gzvcf_in = gzopen(filename.c_str(), "rb");
|
331
|
+
if (gzvcf_in == NULL)
|
332
|
+
error("Could not open GZVCF file: " + filename, 0);
|
333
|
+
#ifdef ZLIB_VERNUM
|
334
|
+
string tmp(ZLIB_VERSION);
|
335
|
+
printLOG("Using zlib version: " + tmp + "\n");
|
336
|
+
#if (ZLIB_VERNUM >= 0x1240)
|
337
|
+
gzbuffer(gzvcf_in, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster
|
338
|
+
#else
|
339
|
+
printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading zipped VCF files.\n");
|
340
|
+
#endif
|
341
|
+
#endif
|
342
|
+
}
|
343
|
+
}
|
344
|
+
|
345
|
+
void vcf_file::close()
|
346
|
+
{
|
347
|
+
if (!compressed)
|
348
|
+
vcf_in.close();
|
349
|
+
else
|
350
|
+
{
|
351
|
+
gzclose(gzvcf_in);
|
352
|
+
delete [] gz_readbuffer;
|
353
|
+
}
|
354
|
+
}
|
355
|
+
|
356
|
+
bool vcf_file::feof()
|
357
|
+
{
|
358
|
+
bool out;
|
359
|
+
if (!compressed)
|
360
|
+
out = vcf_in.eof();
|
361
|
+
else
|
362
|
+
{
|
363
|
+
out = gzeof(gzvcf_in); // Returns 1 when EOF has previously been detected reading the given input stream, otherwise zero.
|
364
|
+
}
|
365
|
+
return out;
|
366
|
+
}
|
367
|
+
|
368
|
+
// Return the current read position of the active input: gztell() for
// compressed input, tellg() otherwise.
streampos vcf_file::get_filepos()
{
	if (compressed)
		return gztell(gzvcf_in); // TODO: Type check

	return vcf_in.tellg();
}
|
377
|
+
|
378
|
+
// Move the read position of the active input to `filepos`, measured from the
// start of the file.
void vcf_file::set_filepos(streampos &filepos)
{
	if (compressed)
	{
		gzseek(gzvcf_in, filepos, SEEK_SET);
		return;
	}

	// Clear the stream state flags before seeking
	vcf_in.clear();
	vcf_in.seekg(filepos, ios::beg);
}
|
390
|
+
|
391
|
+
void vcf_file::get_vcf_entry(unsigned int entry_num, string &out)
|
392
|
+
{
|
393
|
+
streampos filepos = entry_file_locations[entry_num];
|
394
|
+
set_filepos(filepos);
|
395
|
+
read_line(out);
|
396
|
+
}
|
397
|
+
|
398
|
+
// Read the next line of the active input into `out`, with trailing spaces,
// tabs and line terminators removed. `out` is empty when nothing could be read.
//
// For compressed input the line is assembled from as many gzgets() chunks as
// needed, so lines longer than the read buffer are supported.
void vcf_file::read_line(string &out)
{
	// Start from an empty string so a failed read cannot hand back stale text
	out.clear();

	if (!compressed)
	{
		getline(vcf_in, out);
	}
	else
	{
		bool again = true;
		while (again == true)
		{
			// gzgets() returns NULL at end-of-file or on error and then leaves
			// the buffer untouched; appending it would repeat the previous chunk.
			if (gzgets(gzvcf_in, gz_readbuffer, gzMAX_LINE_LEN) == NULL)
				break;

			const size_t len = strlen(gz_readbuffer);
			out.append(gz_readbuffer, len);

			// Another chunk is needed only if the buffer was filled completely
			// without reaching the newline. A line that fits exactly ends in
			// '\n' and must not pull in the following line.
			again = (len > 0) &&
					(len == (size_t)(gzMAX_LINE_LEN-1)) &&
					(gz_readbuffer[len-1] != '\n');
		}
	}

	// Trim whitespace at end of line (required in gzipped case!)
	out.erase( out.find_last_not_of(" \t\n\r") + 1);
}
|
419
|
+
|
420
|
+
char vcf_file::peek()
|
421
|
+
{
|
422
|
+
if (!compressed)
|
423
|
+
return vcf_in.peek();
|
424
|
+
else
|
425
|
+
{
|
426
|
+
char c = gzgetc(gzvcf_in);
|
427
|
+
gzungetc(c, gzvcf_in);
|
428
|
+
return c;
|
429
|
+
}
|
430
|
+
}
|
431
|
+
|
432
|
+
|
433
|
+
// Read the first two fields (CHROM and POS) of the current line and advance
// the input to the start of the next line.
void vcf_file::read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS)
{
	if (compressed)
	{
		// Pull in the whole line, then parse the two leading fields from it.
		// The buffers are static so they are reused from call to call.
		static string line_buffer;
		static stringstream field_stream;
		read_line(line_buffer);
		field_stream.clear();
		field_stream.str(line_buffer);
		field_stream >> CHROM >> POS;
		return;
	}

	vcf_in >> CHROM >> POS;
	// Discard everything up to and including the end of the line
	vcf_in.ignore(std::numeric_limits<streamsize>::max(), '\n');
}
|
449
|
+
|
450
|
+
|
451
|
+
|
452
|
+
// Read only the chromosome field at the current input position.
// Note: leaves the stream in a funny state (part-way through the line), but is
// faster than reading the whole line.
void vcf_file::read_CHROM_only(string &CHROM)
{
	if (!compressed)
	{
		vcf_in >> CHROM;
	}
	else
	{
		CHROM = "";
		// Hold the result as int so EOF stays distinguishable, and stop at the
		// end of the line or file as well as at the tab. Waiting for a tab alone
		// never terminates once gzgetc() starts returning EOF.
		int c = gzgetc(gzvcf_in);
		while ((c != '\t') && (c != '\n') && (c != EOF))
		{
			CHROM += (char)c;
			c = gzgetc(gzvcf_in);
		}
	}
}
|
469
|
+
|
470
|
+
// Read only the chromosome and position fields at the current input position.
// Note: leaves the stream in a funny state (part-way through the line), but is
// faster than reading the whole line.
//
// For compressed input POS is converted with atoi(), so a missing or
// non-numeric position field yields 0.
void vcf_file::read_CHROM_and_POS_only(string &CHROM, int &POS)
{
	if (!compressed)
	{
		vcf_in >> CHROM >> POS;
	}
	else
	{
		// Both loops hold the gzgetc() result as int and also stop at end of
		// line or file. Waiting for a tab alone never terminates once gzgetc()
		// starts returning EOF.
		CHROM = "";
		int c = gzgetc(gzvcf_in);
		while ((c != '\t') && (c != '\n') && (c != EOF))
		{
			CHROM += (char)c;
			c = gzgetc(gzvcf_in);
		}

		string tmp;
		if (c == '\t')
		{	// A second field exists only if the first one ended at a tab
			c = gzgetc(gzvcf_in);
			while ((c != '\t') && (c != '\n') && (c != EOF))
			{
				tmp += (char)c;
				c = gzgetc(gzvcf_in);
			}
		}
		POS = atoi(tmp.c_str());
	}
}
|
495
|
+
|
@@ -0,0 +1,184 @@
|
|
1
|
+
/*
|
2
|
+
* vcf_file.h
|
3
|
+
*
|
4
|
+
* Created on: Aug 19, 2009
|
5
|
+
* Author: Adam Auton
|
6
|
+
* ($Revision: 249 $)
|
7
|
+
*/
|
8
|
+
|
9
|
+
#ifndef VCF_FILE_H_
|
10
|
+
#define VCF_FILE_H_
|
11
|
+
|
12
|
+
#include <cmath>
|
13
|
+
#include <cstdio>
|
14
|
+
#include <cstring>
|
15
|
+
#include <deque>
|
16
|
+
#include <iomanip>
|
17
|
+
#include <iostream>
|
18
|
+
#include <fstream>
|
19
|
+
#include <limits>
|
20
|
+
#include <set>
|
21
|
+
#include <sstream>
|
22
|
+
#include <stdint.h>
|
23
|
+
#include <string>
|
24
|
+
#include <sys/stat.h>
|
25
|
+
#include <vector>
|
26
|
+
#include <zlib.h>
|
27
|
+
|
28
|
+
#include "output_log.h"
|
29
|
+
#include "parameters.h"
|
30
|
+
#include "vcf_entry.h"
|
31
|
+
|
32
|
+
#ifdef VCFTOOLS_PCA
|
33
|
+
#include "dgeev.h"
|
34
|
+
#endif
|
35
|
+
|
36
|
+
using namespace std;
|
37
|
+
|
38
|
+
class vcf_file
|
39
|
+
{
|
40
|
+
public:
|
41
|
+
vcf_file(const string &filename, bool compressed=false, const string &chr="", const string &exclude_chr="", bool force_write_index=false);
|
42
|
+
~vcf_file();
|
43
|
+
|
44
|
+
const string filename;
|
45
|
+
bool compressed;
|
46
|
+
vector<string> meta;
|
47
|
+
vector<string> indv;
|
48
|
+
unsigned int N_indv;
|
49
|
+
unsigned int N_entries;
|
50
|
+
|
51
|
+
deque<streampos> entry_file_locations;
|
52
|
+
void get_vcf_entry(unsigned int entry_num, string &out);
|
53
|
+
|
54
|
+
vector<bool> include_indv;
|
55
|
+
deque<bool> include_entry;
|
56
|
+
deque<vector<bool> > include_genotype;
|
57
|
+
|
58
|
+
void apply_filters(const parameters ¶ms);
|
59
|
+
|
60
|
+
void filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude = false);
|
61
|
+
void filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file);
|
62
|
+
void filter_sites_to_exclude(const string &snps_to_exclude_file);
|
63
|
+
void filter_sites_by_position(const string &chr, int start_pos, int end_pos);
|
64
|
+
void filter_sites_by_positions(const string &positions_file);
|
65
|
+
void filter_sites_by_quality(double min_quality);
|
66
|
+
void filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth);
|
67
|
+
void filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate);
|
68
|
+
void filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double max_missing_call_count);
|
69
|
+
void filter_sites_by_number_of_alleles(int min_alleles, int max_alleles);
|
70
|
+
void filter_sites_by_HWE_pvalue(double min_HWE_pvalue);
|
71
|
+
void filter_sites_by_BED_file(const string &bed_file, bool BED_exclude = false);
|
72
|
+
void filter_sites_by_mask(const string &mask_file, bool invert_mask = false, int min_kept_mask_value=0);
|
73
|
+
void filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all = false);
|
74
|
+
void filter_sites_by_phase();
|
75
|
+
void filter_sites_by_thinning(int min_SNP_distance);
|
76
|
+
void filter_sites_by_INFO_flags(const set<string> &flags_to_remove, const set<string> &flags_to_keep);
|
77
|
+
|
78
|
+
void filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude=true);
|
79
|
+
void filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename);
|
80
|
+
void filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename);
|
81
|
+
void filter_individuals_by_call_rate(double min_call_rate);
|
82
|
+
void filter_individuals_by_mean_depth(double min_mean_depth, double max_mean_depth);
|
83
|
+
void filter_individuals_by_phase();
|
84
|
+
void filter_individuals_randomly(int max_N_indv);
|
85
|
+
|
86
|
+
void filter_genotypes_by_quality(double min_genotype_quality);
|
87
|
+
void filter_genotypes_by_depth(int min_depth, int max_depth);
|
88
|
+
void filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all = false);
|
89
|
+
|
90
|
+
void output_frequency(const string &output_file_prefix, bool output_counts=false, bool suppress_allele_output=false);
|
91
|
+
void output_individuals_by_mean_depth(const string &output_file_prefix);
|
92
|
+
void output_site_depth(const string &output_file_prefix, bool output_mean=true);
|
93
|
+
void output_genotype_depth(const string &output_file_prefix);
|
94
|
+
void output_het(const string &output_file_prefix);
|
95
|
+
void output_hwe(const string &output_file_prefix);
|
96
|
+
void output_SNP_density(const string &output_file_prefix, int bin_size);
|
97
|
+
void output_missingness(const string &output_file_prefix);
|
98
|
+
void output_genotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2);
|
99
|
+
void output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2=0.1);
|
100
|
+
void output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2);
|
101
|
+
void output_singletons(const string &output_file_prefix);
|
102
|
+
void output_TsTv(const string &output_file_prefix, int bin_size);
|
103
|
+
void output_TsTv_by_count(const string &output_file_prefix);
|
104
|
+
void output_TsTv_by_quality(const string &output_file_prefix);
|
105
|
+
void output_per_site_nucleotide_diversity(const string &output_file_prefix);
|
106
|
+
void output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size);
|
107
|
+
void output_Tajima_D(const string &output_file_prefix, int window_size);
|
108
|
+
void output_site_quality(const string &output_file_prefix);
|
109
|
+
void output_FILTER_summary(const string &output_file_prefix);
|
110
|
+
void output_kept_and_removed_sites(const string &output_file_prefix);
|
111
|
+
void output_LROH(const string &output_file_prefix);
|
112
|
+
void output_indv_relatedness(const string &output_file_prefix);
|
113
|
+
void output_PCA(const string &output_file_prefix, bool use_normalisation=true, int SNP_loadings_N_PCs=-1);
|
114
|
+
|
115
|
+
void output_as_012_matrix(const string &output_file_prefix);
|
116
|
+
void output_as_plink(const string &output_file_prefix);
|
117
|
+
void output_as_plink_tped(const string &output_file_prefix);
|
118
|
+
void output_BEAGLE_genotype_likelihoods(const string &output_file_prefix);
|
119
|
+
void output_as_IMPUTE(const string &output_file_prefix);
|
120
|
+
void output_as_LDhat_phased(const string &output_file_prefix, const string &chr);
|
121
|
+
void output_as_LDhat_unphased(const string &output_file_prefix, const string &chr);
|
122
|
+
void output_LDhat_locs_file(const string &output_file_prefix, const string &chr, unsigned int &n_sites_out);
|
123
|
+
void output_FORMAT_information(const string &output_file_prefix, const string &FORMAT_id);
|
124
|
+
|
125
|
+
void output_fst(const string &output_file_prefix, vcf_file &vcf_fst);
|
126
|
+
void output_fst_version_2(const string &output_file_prefix, const vector<string> &indv_files);
|
127
|
+
|
128
|
+
void output_sites_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
129
|
+
void output_indv_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
130
|
+
void output_discordance_by_site(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
131
|
+
void output_discordance_matrix(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
132
|
+
void output_discordance_by_indv(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
133
|
+
void output_switch_error(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
134
|
+
|
135
|
+
void output_INFO_for_each_site(const string &output_file_prefix, const vector<string> &INFO_to_extract);
|
136
|
+
|
137
|
+
void print(const string &output_file_prefix, const set<string> &INFO_to_keep, bool keep_all_INFO=false);
|
138
|
+
|
139
|
+
int N_kept_individuals() const;
|
140
|
+
int N_kept_sites() const;
|
141
|
+
unsigned int N_genotypes_included(unsigned int entry_num) const;
|
142
|
+
|
143
|
+
private:
|
144
|
+
ifstream vcf_in;
|
145
|
+
gzFile gzvcf_in;
|
146
|
+
char *gz_readbuffer;
|
147
|
+
unsigned int gzMAX_LINE_LEN;
|
148
|
+
void open();
|
149
|
+
void close();
|
150
|
+
bool feof();
|
151
|
+
inline void read_line(string &out);
|
152
|
+
inline void read_CHROM_only(string &CHROM);
|
153
|
+
void read_CHROM_and_POS_only(string &CHROM, int &POS);
|
154
|
+
inline void read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS);
|
155
|
+
void parse_header(const string &line);
|
156
|
+
void parse_meta(const string &line);
|
157
|
+
streampos get_filepos();
|
158
|
+
void set_filepos(streampos &filepos);
|
159
|
+
|
160
|
+
bool has_body;
|
161
|
+
bool has_file_format;
|
162
|
+
bool has_genotypes;
|
163
|
+
bool has_header;
|
164
|
+
bool has_meta;
|
165
|
+
|
166
|
+
void scan_file(const string &chr="", const string &exclude_chr="", bool force_write_index=false);
|
167
|
+
inline char peek();
|
168
|
+
|
169
|
+
void return_indv_union(vcf_file &file2, map<string, pair< int, int> > &combined_individuals);
|
170
|
+
void return_site_union(vcf_file &file2, map<pair<string, int>, pair<int, int> > &out);
|
171
|
+
|
172
|
+
bool read_index_file(const string &index_filename);
|
173
|
+
void write_index_file(const string &index_filename);
|
174
|
+
|
175
|
+
void ByteSwap(unsigned char *b, int n) const;
|
176
|
+
int idx_read(gzFile &in, void *buffer, unsigned int len, size_t size);
|
177
|
+
void idx_write(gzFile &out, void *buffer, unsigned int len, size_t size);
|
178
|
+
|
179
|
+
bool big_endian_machine;
|
180
|
+
static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); };
|
181
|
+
|
182
|
+
};
|
183
|
+
|
184
|
+
#endif /* VCF_FILE_H_ */
|