ngs_server 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/ngs_server +72 -50
- data/ext/bamtools/extconf.rb +3 -3
- data/ext/vcftools/Makefile +28 -0
- data/ext/vcftools/README.txt +36 -0
- data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
- data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
- data/ext/vcftools/cpp/.svn/entries +708 -0
- data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
- data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
- data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
- data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
- data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
- data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
- data/ext/vcftools/cpp/Makefile +46 -0
- data/ext/vcftools/cpp/dgeev.cpp +146 -0
- data/ext/vcftools/cpp/dgeev.h +43 -0
- data/ext/vcftools/cpp/output_log.cpp +79 -0
- data/ext/vcftools/cpp/output_log.h +34 -0
- data/ext/vcftools/cpp/parameters.cpp +535 -0
- data/ext/vcftools/cpp/parameters.h +154 -0
- data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
- data/ext/vcftools/cpp/vcf_entry.h +190 -0
- data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
- data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
- data/ext/vcftools/cpp/vcf_file.cpp +495 -0
- data/ext/vcftools/cpp/vcf_file.h +184 -0
- data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
- data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
- data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
- data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
- data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
- data/ext/vcftools/cpp/vcftools.cpp +107 -0
- data/ext/vcftools/cpp/vcftools.h +25 -0
- data/ext/vcftools/examples/.svn/all-wcprops +185 -0
- data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
- data/ext/vcftools/examples/.svn/entries +1048 -0
- data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
- data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
- data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
- data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
- data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
- data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
- data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
- data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
- data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
- data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
- data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
- data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
- data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
- data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
- data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
- data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
- data/ext/vcftools/examples/annotate-test.vcf +37 -0
- data/ext/vcftools/examples/annotate.out +23 -0
- data/ext/vcftools/examples/annotate.txt +7 -0
- data/ext/vcftools/examples/annotate2.out +52 -0
- data/ext/vcftools/examples/annotate3.out +23 -0
- data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
- data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
- data/ext/vcftools/examples/cmp-test.out +53 -0
- data/ext/vcftools/examples/concat-a.vcf +21 -0
- data/ext/vcftools/examples/concat-b.vcf +13 -0
- data/ext/vcftools/examples/concat-c.vcf +19 -0
- data/ext/vcftools/examples/concat.out +39 -0
- data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
- data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
- data/ext/vcftools/examples/merge-test-a.vcf +17 -0
- data/ext/vcftools/examples/merge-test-b.vcf +17 -0
- data/ext/vcftools/examples/merge-test-c.vcf +15 -0
- data/ext/vcftools/examples/merge-test.vcf.out +31 -0
- data/ext/vcftools/examples/perl-api-1.pl +46 -0
- data/ext/vcftools/examples/query-test.out +6 -0
- data/ext/vcftools/examples/shuffle-test.vcf +12 -0
- data/ext/vcftools/examples/subset.SNPs.out +10 -0
- data/ext/vcftools/examples/subset.indels.out +18 -0
- data/ext/vcftools/examples/subset.vcf +21 -0
- data/ext/vcftools/examples/valid-3.3.vcf +30 -0
- data/ext/vcftools/examples/valid-4.0.vcf +34 -0
- data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
- data/ext/vcftools/examples/valid-4.1.vcf +37 -0
- data/ext/vcftools/extconf.rb +2 -0
- data/ext/vcftools/perl/.svn/all-wcprops +149 -0
- data/ext/vcftools/perl/.svn/entries +844 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
- data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
- data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
- data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
- data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
- data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
- data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
- data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
- data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
- data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
- data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
- data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
- data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
- data/ext/vcftools/perl/ChangeLog +84 -0
- data/ext/vcftools/perl/FaSlice.pm +214 -0
- data/ext/vcftools/perl/Makefile +12 -0
- data/ext/vcftools/perl/Vcf.pm +2853 -0
- data/ext/vcftools/perl/VcfStats.pm +681 -0
- data/ext/vcftools/perl/fill-aa +103 -0
- data/ext/vcftools/perl/fill-an-ac +56 -0
- data/ext/vcftools/perl/fill-ref-md5 +204 -0
- data/ext/vcftools/perl/tab-to-vcf +92 -0
- data/ext/vcftools/perl/test.t +376 -0
- data/ext/vcftools/perl/vcf-annotate +1099 -0
- data/ext/vcftools/perl/vcf-compare +1193 -0
- data/ext/vcftools/perl/vcf-concat +310 -0
- data/ext/vcftools/perl/vcf-convert +180 -0
- data/ext/vcftools/perl/vcf-fix-newlines +97 -0
- data/ext/vcftools/perl/vcf-isec +660 -0
- data/ext/vcftools/perl/vcf-merge +577 -0
- data/ext/vcftools/perl/vcf-query +286 -0
- data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
- data/ext/vcftools/perl/vcf-sort +79 -0
- data/ext/vcftools/perl/vcf-stats +160 -0
- data/ext/vcftools/perl/vcf-subset +206 -0
- data/ext/vcftools/perl/vcf-to-tab +112 -0
- data/ext/vcftools/perl/vcf-validator +145 -0
- data/ext/vcftools/website/.svn/all-wcprops +41 -0
- data/ext/vcftools/website/.svn/entries +238 -0
- data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
- data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
- data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
- data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
- data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
- data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
- data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
- data/ext/vcftools/website/Makefile +6 -0
- data/ext/vcftools/website/README +2 -0
- data/ext/vcftools/website/VCF-poster.pdf +0 -0
- data/ext/vcftools/website/default.css +250 -0
- data/ext/vcftools/website/favicon.ico +0 -0
- data/ext/vcftools/website/favicon.png +0 -0
- data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/img/.svn/entries +300 -0
- data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
- data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
- data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
- data/ext/vcftools/website/img/bg.gif +0 -0
- data/ext/vcftools/website/img/bgcode.gif +0 -0
- data/ext/vcftools/website/img/bgcontainer.gif +0 -0
- data/ext/vcftools/website/img/bgul.gif +0 -0
- data/ext/vcftools/website/img/header.gif +0 -0
- data/ext/vcftools/website/img/li.gif +0 -0
- data/ext/vcftools/website/img/quote.gif +0 -0
- data/ext/vcftools/website/img/search.gif +0 -0
- data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
- data/ext/vcftools/website/src/.svn/entries +300 -0
- data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
- data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
- data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
- data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
- data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
- data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
- data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
- data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
- data/ext/vcftools/website/src/docs.inc +202 -0
- data/ext/vcftools/website/src/index.inc +52 -0
- data/ext/vcftools/website/src/index.php +80 -0
- data/ext/vcftools/website/src/license.inc +27 -0
- data/ext/vcftools/website/src/links.inc +13 -0
- data/ext/vcftools/website/src/options.inc +654 -0
- data/ext/vcftools/website/src/perl_module.inc +249 -0
- data/ext/vcftools/website/src/specs.inc +18 -0
- data/lib/config.ru +9 -0
- data/lib/ngs_server/add.rb +9 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +55 -3
- data/ngs_server.gemspec +5 -2
- metadata +296 -6
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* vcf_file.cpp
|
|
3
|
+
*
|
|
4
|
+
* Created on: Aug 19, 2009
|
|
5
|
+
* Author: Adam Auton
|
|
6
|
+
* ($Revision: 230 $)
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#include "vcf_file.h"
|
|
10
|
+
|
|
11
|
+
/*
 * Build a vcf_file for 'filename'.
 *
 * Stores the file name and compression mode, resets every has_* flag to
 * false, opens the underlying stream and then scans the file (see
 * scan_file), passing through the chromosome include/exclude names and the
 * force_write_index flag.
 */
vcf_file::vcf_file(const string &filename, bool compressed, const string &chr, const string &exclude_chr, bool force_write_index)
	: filename(filename), compressed(compressed),
	  has_body(false), has_file_format(false), has_genotypes(false),
	  has_header(false), has_meta(false)
{
	// The stream must be open before scan_file() can read from it.
	open();
	scan_file(chr, exclude_chr, force_write_index);
}
|
|
23
|
+
|
|
24
|
+
// Destructor: releases the input stream via close().
vcf_file::~vcf_file()
{
	close();
}
|
|
28
|
+
|
|
29
|
+
// Parse VCF meta information
|
|
30
|
+
void vcf_file::parse_meta(const string &line)
|
|
31
|
+
{
|
|
32
|
+
has_meta = true;
|
|
33
|
+
meta.push_back(line);
|
|
34
|
+
size_t found=line.find("##fileformat=");
|
|
35
|
+
if (found!=string::npos)
|
|
36
|
+
{
|
|
37
|
+
has_file_format = true;
|
|
38
|
+
found = line.find_first_of("=");
|
|
39
|
+
string version = line.substr(found+1);
|
|
40
|
+
if ((version != "VCFv4.0") && (version != "VCFv4.1"))
|
|
41
|
+
error("VCF version must be v4.0 or v4.1:\nYou are using version " + version);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
found=line.find("##INFO=");
|
|
45
|
+
if (found!=string::npos)
|
|
46
|
+
{ // Found an INFO descriptor
|
|
47
|
+
vcf_entry::add_INFO_descriptor(line);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
found=line.find("##FILTER=");
|
|
51
|
+
if (found!=string::npos)
|
|
52
|
+
{ // Found a FILTER descriptor
|
|
53
|
+
vcf_entry::add_FILTER_descriptor(line);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
found=line.find("##FORMAT=");
|
|
57
|
+
if (found!=string::npos)
|
|
58
|
+
{ // Found a genotype filter descriptor
|
|
59
|
+
vcf_entry::add_FORMAT_descriptor(line);
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Parse VCF header, and extract individuals etc.
|
|
64
|
+
void vcf_file::parse_header(const string &line)
|
|
65
|
+
{
|
|
66
|
+
// #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... )
|
|
67
|
+
if (has_header == true)
|
|
68
|
+
warning("Multiple Header lines.");
|
|
69
|
+
|
|
70
|
+
has_header = true;
|
|
71
|
+
istringstream header(line);
|
|
72
|
+
int count = 0;
|
|
73
|
+
string tmp_str;
|
|
74
|
+
unsigned int N_header_indv = 0;
|
|
75
|
+
has_genotypes = false;
|
|
76
|
+
while (!header.eof())
|
|
77
|
+
{
|
|
78
|
+
header >> tmp_str;
|
|
79
|
+
switch (count)
|
|
80
|
+
{
|
|
81
|
+
case 0: if (tmp_str != "#CHROM") warning("First Header entry should be #CHROM: " + tmp_str); break;
|
|
82
|
+
case 1: if (tmp_str != "POS") warning("Second Header entry should be POS: " + tmp_str); break;
|
|
83
|
+
case 2: if (tmp_str != "ID") warning("Third Header entry should be ID: " + tmp_str); break;
|
|
84
|
+
case 3: if (tmp_str != "REF") warning("Fourth Header entry should be REF: " + tmp_str); break;
|
|
85
|
+
case 4: if (tmp_str != "ALT") warning("Fifth Header entry should be ALT: " + tmp_str); break;
|
|
86
|
+
case 5: if (tmp_str != "QUAL") warning("Sixth Header entry should be QUAL: " + tmp_str); break;
|
|
87
|
+
case 6: if (tmp_str != "FILTER") warning("Seventh Header entry should be FILTER: " + tmp_str); break;
|
|
88
|
+
case 7: if (tmp_str != "INFO") warning("Eighth Header entry should be INFO: " + tmp_str); break;
|
|
89
|
+
case 8:
|
|
90
|
+
if (tmp_str != "FORMAT")
|
|
91
|
+
warning("Ninth Header entry should be FORMAT: " + tmp_str);
|
|
92
|
+
else
|
|
93
|
+
has_genotypes = true;
|
|
94
|
+
break;
|
|
95
|
+
default:
|
|
96
|
+
{
|
|
97
|
+
if (count <= 8)
|
|
98
|
+
error("Incorrectly formatted header.");
|
|
99
|
+
indv.push_back(tmp_str);
|
|
100
|
+
N_header_indv++;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
count++;
|
|
104
|
+
}
|
|
105
|
+
N_indv = N_header_indv;
|
|
106
|
+
|
|
107
|
+
if ((has_genotypes == true ) && (N_indv == 0))
|
|
108
|
+
warning("FORMAT field without genotypes?");
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
// Read VCF file
|
|
113
|
+
void vcf_file::scan_file(const string &chr, const string &exclude_chr, bool force_write_index)
|
|
114
|
+
{
|
|
115
|
+
bool filter_by_chr = (chr != "");
|
|
116
|
+
bool exclude_by_chr = (exclude_chr != "");
|
|
117
|
+
string index_filename = filename + ".vcfidx";
|
|
118
|
+
bool could_read_index_file = false;
|
|
119
|
+
if (force_write_index == false)
|
|
120
|
+
could_read_index_file = read_index_file(index_filename);
|
|
121
|
+
string CHROM, last_CHROM="";
|
|
122
|
+
int POS, last_POS = -1;
|
|
123
|
+
if (could_read_index_file == false)
|
|
124
|
+
{
|
|
125
|
+
printLOG("Building new index file.\n");
|
|
126
|
+
string line, CHROM, last_CHROM = "";
|
|
127
|
+
streampos filepos;
|
|
128
|
+
char c;
|
|
129
|
+
N_entries=0;
|
|
130
|
+
N_indv = 0;
|
|
131
|
+
|
|
132
|
+
while (!feof())
|
|
133
|
+
{
|
|
134
|
+
filepos = get_filepos();
|
|
135
|
+
c = peek();
|
|
136
|
+
|
|
137
|
+
if ((c == '\n') || (c == '\r'))
|
|
138
|
+
{
|
|
139
|
+
read_line(line);
|
|
140
|
+
continue;
|
|
141
|
+
}
|
|
142
|
+
else if (c == EOF)
|
|
143
|
+
break;
|
|
144
|
+
|
|
145
|
+
if (c == '#')
|
|
146
|
+
{
|
|
147
|
+
read_line(line);
|
|
148
|
+
if (line[1] == '#')
|
|
149
|
+
{ // Meta information
|
|
150
|
+
parse_meta(line);
|
|
151
|
+
}
|
|
152
|
+
else
|
|
153
|
+
{ // Must be header information: #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... )
|
|
154
|
+
parse_header(line);
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
else
|
|
158
|
+
{ // Must be a data line
|
|
159
|
+
read_CHROM_and_POS_and_skip_remainder_of_line(CHROM, POS);
|
|
160
|
+
if (last_CHROM != CHROM)
|
|
161
|
+
{
|
|
162
|
+
printLOG("\tScanning Chromosome: " + CHROM + "\n");
|
|
163
|
+
last_CHROM = CHROM;
|
|
164
|
+
}
|
|
165
|
+
if (POS == last_POS)
|
|
166
|
+
{
|
|
167
|
+
one_off_warning("\tWarning - file contains entries with the same position. This is not supported by vcftools, and may cause unexpected behaviour.\n");
|
|
168
|
+
}
|
|
169
|
+
last_POS = POS;
|
|
170
|
+
entry_file_locations.push_back(filepos);
|
|
171
|
+
N_entries++;
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
write_index_file(index_filename);
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
printLOG("File contains " + int2str(N_entries) + " entries and " + int2str(N_indv) + " individuals.\n");
|
|
179
|
+
vector<string> meta_lines = meta; meta.resize(0);
|
|
180
|
+
for (unsigned int ui=0; ui<meta_lines.size(); ui++)
|
|
181
|
+
parse_meta(meta_lines[ui]);
|
|
182
|
+
has_genotypes = (N_indv > 0);
|
|
183
|
+
|
|
184
|
+
bool already_found_required_chr = false;
|
|
185
|
+
bool already_filtered_required_chr = false;
|
|
186
|
+
if ((exclude_by_chr == true) || (filter_by_chr == true))
|
|
187
|
+
{
|
|
188
|
+
printLOG("Filtering by chromosome.\n");
|
|
189
|
+
for (unsigned int ui=0; ui<N_entries; ui++)
|
|
190
|
+
{
|
|
191
|
+
if (already_found_required_chr == true)
|
|
192
|
+
{
|
|
193
|
+
printLOG("Skipping Remainder.\n");
|
|
194
|
+
entry_file_locations.erase(entry_file_locations.begin()+ui, entry_file_locations.end());
|
|
195
|
+
break;
|
|
196
|
+
}
|
|
197
|
+
if (already_filtered_required_chr == true)
|
|
198
|
+
{
|
|
199
|
+
printLOG("Skipping Remainder.\n");
|
|
200
|
+
break;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
set_filepos(entry_file_locations[ui]);
|
|
204
|
+
read_CHROM_only(CHROM);
|
|
205
|
+
|
|
206
|
+
if (last_CHROM != CHROM)
|
|
207
|
+
{
|
|
208
|
+
printLOG("\tChromosome: " + CHROM + "\n");
|
|
209
|
+
if ((filter_by_chr == true) && (last_CHROM == chr))
|
|
210
|
+
already_found_required_chr = true;
|
|
211
|
+
|
|
212
|
+
if ((exclude_by_chr == true) && (last_CHROM == exclude_chr))
|
|
213
|
+
already_filtered_required_chr = true;
|
|
214
|
+
|
|
215
|
+
last_CHROM = CHROM;
|
|
216
|
+
}
|
|
217
|
+
if ((exclude_by_chr == true) && (CHROM == exclude_chr))
|
|
218
|
+
{
|
|
219
|
+
entry_file_locations[ui] = -1;
|
|
220
|
+
continue;
|
|
221
|
+
}
|
|
222
|
+
if ((filter_by_chr == true) && (CHROM != chr))
|
|
223
|
+
{
|
|
224
|
+
entry_file_locations[ui] = -1;
|
|
225
|
+
continue;
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
sort(entry_file_locations.begin(), entry_file_locations.end());
|
|
229
|
+
while((entry_file_locations.size() > 0) && (entry_file_locations[0] < 0))
|
|
230
|
+
entry_file_locations.pop_front();
|
|
231
|
+
|
|
232
|
+
N_entries = entry_file_locations.size();
|
|
233
|
+
printLOG("Keeping " + int2str(N_entries) + " entries on specified chromosomes.\n");
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
include_indv.clear();
|
|
237
|
+
include_indv.resize(N_indv, true);
|
|
238
|
+
include_entry.clear();
|
|
239
|
+
include_entry.resize(N_entries, true);
|
|
240
|
+
include_genotype.clear();
|
|
241
|
+
include_genotype.resize(N_entries, vector<bool>(N_indv, true));
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
void vcf_file::print(const string &output_file_prefix, const set<string> &INFO_to_keep, bool keep_all_INFO)
|
|
245
|
+
{
|
|
246
|
+
printLOG("Outputting VCF file... ");
|
|
247
|
+
unsigned int ui;
|
|
248
|
+
|
|
249
|
+
string output_file = output_file_prefix + ".recode.vcf";
|
|
250
|
+
ofstream out(output_file.c_str());
|
|
251
|
+
if (!out.is_open())
|
|
252
|
+
error("Could not open VCF Output File: " + output_file, 3);
|
|
253
|
+
|
|
254
|
+
for (ui=0; ui<meta.size(); ui++)
|
|
255
|
+
out << meta[ui] << endl;
|
|
256
|
+
|
|
257
|
+
out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
|
|
258
|
+
if (N_indv > 0)
|
|
259
|
+
out << "\tFORMAT";
|
|
260
|
+
for (ui=0; ui<N_indv; ui++)
|
|
261
|
+
if (include_indv[ui])
|
|
262
|
+
out << "\t" << indv[ui];
|
|
263
|
+
out << endl;
|
|
264
|
+
|
|
265
|
+
string vcf_line;
|
|
266
|
+
for (unsigned int s=0; s<N_entries; s++)
|
|
267
|
+
if (include_entry[s] == true)
|
|
268
|
+
{
|
|
269
|
+
get_vcf_entry(s, vcf_line);
|
|
270
|
+
vcf_entry e(N_indv, vcf_line);
|
|
271
|
+
e.parse_basic_entry(true, true, true);
|
|
272
|
+
e.parse_full_entry(true);
|
|
273
|
+
e.parse_genotype_entries(true,true,true,true);
|
|
274
|
+
e.print(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype[s]);
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
out.close();
|
|
278
|
+
printLOG("Done\n");
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
// Return the number of individuals that have not been filtered out
|
|
282
|
+
int vcf_file::N_kept_individuals() const
|
|
283
|
+
{
|
|
284
|
+
int N_kept = 0;
|
|
285
|
+
for (unsigned int ui=0; ui<include_indv.size(); ui++)
|
|
286
|
+
if (include_indv[ui] == true)
|
|
287
|
+
N_kept++;
|
|
288
|
+
return N_kept;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
// Return the number of sites that have not been filtered out
|
|
292
|
+
int vcf_file::N_kept_sites() const
|
|
293
|
+
{
|
|
294
|
+
int N_kept = 0;
|
|
295
|
+
for (unsigned int ui=0; ui<include_entry.size(); ui++)
|
|
296
|
+
if (include_entry[ui] == true)
|
|
297
|
+
N_kept++;
|
|
298
|
+
return N_kept;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
// Count the number of genotypes that have not been filtered out
|
|
302
|
+
unsigned int vcf_file::N_genotypes_included(unsigned int entry_num) const
|
|
303
|
+
{
|
|
304
|
+
unsigned int count = 0, ui;
|
|
305
|
+
for (ui=0; ui<N_indv; ui++)
|
|
306
|
+
if ((include_indv[ui] == true) && (include_genotype[entry_num][ui] == true))
|
|
307
|
+
{
|
|
308
|
+
count++;
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
return count;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
void vcf_file::open()
|
|
315
|
+
{
|
|
316
|
+
if (!compressed)
|
|
317
|
+
{
|
|
318
|
+
if (filename.substr(filename.size()-3) == ".gz")
|
|
319
|
+
{
|
|
320
|
+
warning("Filename ends in '.gz'. Shouldn't you be using --gzvcf?\n");
|
|
321
|
+
}
|
|
322
|
+
vcf_in.open(filename.c_str(), ios::in);
|
|
323
|
+
if (!vcf_in.is_open())
|
|
324
|
+
error("Could not open VCF file: " + filename, 0);
|
|
325
|
+
}
|
|
326
|
+
else
|
|
327
|
+
{
|
|
328
|
+
gzMAX_LINE_LEN = 1024*1024;
|
|
329
|
+
gz_readbuffer = new char[gzMAX_LINE_LEN];
|
|
330
|
+
gzvcf_in = gzopen(filename.c_str(), "rb");
|
|
331
|
+
if (gzvcf_in == NULL)
|
|
332
|
+
error("Could not open GZVCF file: " + filename, 0);
|
|
333
|
+
#ifdef ZLIB_VERNUM
|
|
334
|
+
string tmp(ZLIB_VERSION);
|
|
335
|
+
printLOG("Using zlib version: " + tmp + "\n");
|
|
336
|
+
#if (ZLIB_VERNUM >= 0x1240)
|
|
337
|
+
gzbuffer(gzvcf_in, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster
|
|
338
|
+
#else
|
|
339
|
+
printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading zipped VCF files.\n");
|
|
340
|
+
#endif
|
|
341
|
+
#endif
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
void vcf_file::close()
|
|
346
|
+
{
|
|
347
|
+
if (!compressed)
|
|
348
|
+
vcf_in.close();
|
|
349
|
+
else
|
|
350
|
+
{
|
|
351
|
+
gzclose(gzvcf_in);
|
|
352
|
+
delete [] gz_readbuffer;
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
bool vcf_file::feof()
|
|
357
|
+
{
|
|
358
|
+
bool out;
|
|
359
|
+
if (!compressed)
|
|
360
|
+
out = vcf_in.eof();
|
|
361
|
+
else
|
|
362
|
+
{
|
|
363
|
+
out = gzeof(gzvcf_in); // Returns 1 when EOF has previously been detected reading the given input stream, otherwise zero.
|
|
364
|
+
}
|
|
365
|
+
return out;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
// Return the current read position of the active input as a streampos.
streampos vcf_file::get_filepos()
{
	if (compressed)
		return gztell(gzvcf_in);	// TODO: Type check
	return vcf_in.tellg();
}
|
|
377
|
+
|
|
378
|
+
// Move the read position of the active input to 'filepos', measured from
// the start of the file.
void vcf_file::set_filepos(streampos &filepos)
{
	if (compressed)
	{
		gzseek(gzvcf_in, filepos, SEEK_SET);
		return;
	}
	// Reset the stream state flags before seeking.
	vcf_in.clear();
	vcf_in.seekg(filepos, ios::beg);
}
|
|
390
|
+
|
|
391
|
+
void vcf_file::get_vcf_entry(unsigned int entry_num, string &out)
|
|
392
|
+
{
|
|
393
|
+
streampos filepos = entry_file_locations[entry_num];
|
|
394
|
+
set_filepos(filepos);
|
|
395
|
+
read_line(out);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// Read the next line of the active input into 'out', with trailing
// whitespace (space, tab, CR, LF) removed. 'out' is empty when nothing could
// be read.
//
// In compressed mode the line is assembled from gz_readbuffer-sized pieces,
// so lines longer than the buffer are supported.
void vcf_file::read_line(string &out)
{
	// Start from an empty string in both modes: getline leaves its argument
	// untouched when the stream is already in a failed state.
	out.clear();

	if (!compressed)
	{
		getline(vcf_in, out);
	}
	else
	{
		bool again = true;
		while (again == true)
		{
			// gzgets returns NULL at end-of-file or on error. The buffer
			// then still holds the previous chunk and must not be appended.
			if (gzgets(gzvcf_in, gz_readbuffer, gzMAX_LINE_LEN) == NULL)
				break;

			size_t len = strlen(gz_readbuffer);
			out.append(gz_readbuffer, len);

			// Only a completely filled buffer that does not end in a newline
			// means the line continues in the next chunk.
			again = (len > 0) &&
					(len == (size_t)(gzMAX_LINE_LEN - 1)) &&
					(gz_readbuffer[len - 1] != '\n');
		}
	}

	out.erase( out.find_last_not_of(" \t\n\r") + 1);	// Trim whitespace at end of line (required in gzipped case!)
}
|
|
419
|
+
|
|
420
|
+
char vcf_file::peek()
|
|
421
|
+
{
|
|
422
|
+
if (!compressed)
|
|
423
|
+
return vcf_in.peek();
|
|
424
|
+
else
|
|
425
|
+
{
|
|
426
|
+
char c = gzgetc(gzvcf_in);
|
|
427
|
+
gzungetc(c, gzvcf_in);
|
|
428
|
+
return c;
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
// Read the first two fields (chromosome name and position) of the current
// data line into CHROM and POS, and leave the input positioned at the start
// of the next line.
void vcf_file::read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS)
{
	if (compressed)
	{
		// Function-level statics: the same string and stream objects are
		// reused on every call.
		static string line;
		static stringstream ss;
		read_line(line);
		ss.clear();
		ss.str(line);
		ss >> CHROM >> POS;
		return;
	}

	vcf_in >> CHROM;
	vcf_in >> POS;
	// Discard everything up to and including the end of this line.
	vcf_in.ignore(std::numeric_limits<streamsize>::max(), '\n');
}
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
// Just read in the chromosome. Note: leaves the stream in a funny state
// (positioned inside the line), but is faster than reading the whole line.
//
// In compressed mode characters are collected up to the first tab; reading
// also stops if gzgetc reports end-of-file or an error.
void vcf_file::read_CHROM_only(string &CHROM)
{
	if (!compressed)
	{
		vcf_in >> CHROM;
	}
	else
	{
		CHROM = "";
		// gzgetc returns an int: a byte value, or -1 on end-of-file/error.
		// Without the -1 test the loop would never terminate on a truncated
		// file and would grow CHROM without bound.
		int c = gzgetc(gzvcf_in);
		while ((c != '\t') && (c != -1))
		{
			CHROM += (char)c;
			c = gzgetc(gzvcf_in);
		}
	}
}
|
|
469
|
+
|
|
470
|
+
// Just read in the chromosome and position. Note: leaves the stream in a
// funny state (positioned inside the line), but is faster than reading the
// whole line.
//
// In compressed mode each of the two fields is collected up to the next tab;
// reading also stops if gzgetc reports end-of-file or an error. POS is the
// atoi() conversion of the second field.
void vcf_file::read_CHROM_and_POS_only(string &CHROM, int &POS)
{
	if (!compressed)
	{
		vcf_in >> CHROM >> POS;
	}
	else
	{
		// gzgetc returns an int: a byte value, or -1 on end-of-file/error.
		// Both loops test for -1 so that a truncated file cannot make them
		// spin forever.
		CHROM = "";
		int c = gzgetc(gzvcf_in);
		while ((c != '\t') && (c != -1))
		{
			CHROM += (char)c;
			c = gzgetc(gzvcf_in);
		}

		string tmp;
		c = gzgetc(gzvcf_in);
		while ((c != '\t') && (c != -1))
		{
			tmp += (char)c;
			c = gzgetc(gzvcf_in);
		}
		POS = atoi(tmp.c_str());
	}
}
|
|
495
|
+
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* vcf_file.h
|
|
3
|
+
*
|
|
4
|
+
* Created on: Aug 19, 2009
|
|
5
|
+
* Author: Adam Auton
|
|
6
|
+
* ($Revision: 249 $)
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#ifndef VCF_FILE_H_
|
|
10
|
+
#define VCF_FILE_H_
|
|
11
|
+
|
|
12
|
+
#include <cmath>
|
|
13
|
+
#include <cstdio>
|
|
14
|
+
#include <cstring>
|
|
15
|
+
#include <deque>
|
|
16
|
+
#include <iomanip>
|
|
17
|
+
#include <iostream>
|
|
18
|
+
#include <fstream>
|
|
19
|
+
#include <limits>
|
|
20
|
+
#include <set>
|
|
21
|
+
#include <sstream>
|
|
22
|
+
#include <stdint.h>
|
|
23
|
+
#include <string>
|
|
24
|
+
#include <sys/stat.h>
|
|
25
|
+
#include <vector>
|
|
26
|
+
#include <zlib.h>
|
|
27
|
+
|
|
28
|
+
#include "output_log.h"
|
|
29
|
+
#include "parameters.h"
|
|
30
|
+
#include "vcf_entry.h"
|
|
31
|
+
|
|
32
|
+
#ifdef VCFTOOLS_PCA
|
|
33
|
+
#include "dgeev.h"
|
|
34
|
+
#endif
|
|
35
|
+
|
|
36
|
+
using namespace std;
|
|
37
|
+
|
|
38
|
+
class vcf_file
|
|
39
|
+
{
|
|
40
|
+
public:
|
|
41
|
+
vcf_file(const string &filename, bool compressed=false, const string &chr="", const string &exclude_chr="", bool force_write_index=false);
|
|
42
|
+
~vcf_file();
|
|
43
|
+
|
|
44
|
+
const string filename;
|
|
45
|
+
bool compressed;
|
|
46
|
+
vector<string> meta;
|
|
47
|
+
vector<string> indv;
|
|
48
|
+
unsigned int N_indv;
|
|
49
|
+
unsigned int N_entries;
|
|
50
|
+
|
|
51
|
+
deque<streampos> entry_file_locations;
|
|
52
|
+
void get_vcf_entry(unsigned int entry_num, string &out);
|
|
53
|
+
|
|
54
|
+
vector<bool> include_indv;
|
|
55
|
+
deque<bool> include_entry;
|
|
56
|
+
deque<vector<bool> > include_genotype;
|
|
57
|
+
|
|
58
|
+
void apply_filters(const parameters ¶ms);
|
|
59
|
+
|
|
60
|
+
void filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude = false);
|
|
61
|
+
void filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file);
|
|
62
|
+
void filter_sites_to_exclude(const string &snps_to_exclude_file);
|
|
63
|
+
void filter_sites_by_position(const string &chr, int start_pos, int end_pos);
|
|
64
|
+
void filter_sites_by_positions(const string &positions_file);
|
|
65
|
+
void filter_sites_by_quality(double min_quality);
|
|
66
|
+
void filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth);
|
|
67
|
+
void filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate);
|
|
68
|
+
void filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double max_missing_call_count);
|
|
69
|
+
void filter_sites_by_number_of_alleles(int min_alleles, int max_alleles);
|
|
70
|
+
void filter_sites_by_HWE_pvalue(double min_HWE_pvalue);
|
|
71
|
+
void filter_sites_by_BED_file(const string &bed_file, bool BED_exclude = false);
|
|
72
|
+
void filter_sites_by_mask(const string &mask_file, bool invert_mask = false, int min_kept_mask_value=0);
|
|
73
|
+
void filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all = false);
|
|
74
|
+
void filter_sites_by_phase();
|
|
75
|
+
void filter_sites_by_thinning(int min_SNP_distance);
|
|
76
|
+
void filter_sites_by_INFO_flags(const set<string> &flags_to_remove, const set<string> &flags_to_keep);
|
|
77
|
+
|
|
78
|
+
void filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude=true);
|
|
79
|
+
void filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename);
|
|
80
|
+
void filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename);
|
|
81
|
+
void filter_individuals_by_call_rate(double min_call_rate);
|
|
82
|
+
void filter_individuals_by_mean_depth(double min_mean_depth, double max_mean_depth);
|
|
83
|
+
void filter_individuals_by_phase();
|
|
84
|
+
void filter_individuals_randomly(int max_N_indv);
|
|
85
|
+
|
|
86
|
+
void filter_genotypes_by_quality(double min_genotype_quality);
|
|
87
|
+
void filter_genotypes_by_depth(int min_depth, int max_depth);
|
|
88
|
+
void filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all = false);
|
|
89
|
+
|
|
90
|
+
void output_frequency(const string &output_file_prefix, bool output_counts=false, bool suppress_allele_output=false);
|
|
91
|
+
void output_individuals_by_mean_depth(const string &output_file_prefix);
|
|
92
|
+
void output_site_depth(const string &output_file_prefix, bool output_mean=true);
|
|
93
|
+
void output_genotype_depth(const string &output_file_prefix);
|
|
94
|
+
void output_het(const string &output_file_prefix);
|
|
95
|
+
void output_hwe(const string &output_file_prefix);
|
|
96
|
+
void output_SNP_density(const string &output_file_prefix, int bin_size);
|
|
97
|
+
void output_missingness(const string &output_file_prefix);
|
|
98
|
+
void output_genotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2);
|
|
99
|
+
void output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2=0.1);
|
|
100
|
+
void output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2);
|
|
101
|
+
void output_singletons(const string &output_file_prefix);
|
|
102
|
+
void output_TsTv(const string &output_file_prefix, int bin_size);
|
|
103
|
+
void output_TsTv_by_count(const string &output_file_prefix);
|
|
104
|
+
void output_TsTv_by_quality(const string &output_file_prefix);
|
|
105
|
+
void output_per_site_nucleotide_diversity(const string &output_file_prefix);
|
|
106
|
+
void output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size);
|
|
107
|
+
void output_Tajima_D(const string &output_file_prefix, int window_size);
|
|
108
|
+
void output_site_quality(const string &output_file_prefix);
|
|
109
|
+
void output_FILTER_summary(const string &output_file_prefix);
|
|
110
|
+
void output_kept_and_removed_sites(const string &output_file_prefix);
|
|
111
|
+
void output_LROH(const string &output_file_prefix);
|
|
112
|
+
void output_indv_relatedness(const string &output_file_prefix);
|
|
113
|
+
void output_PCA(const string &output_file_prefix, bool use_normalisation=true, int SNP_loadings_N_PCs=-1);
|
|
114
|
+
|
|
115
|
+
void output_as_012_matrix(const string &output_file_prefix);
|
|
116
|
+
void output_as_plink(const string &output_file_prefix);
|
|
117
|
+
void output_as_plink_tped(const string &output_file_prefix);
|
|
118
|
+
void output_BEAGLE_genotype_likelihoods(const string &output_file_prefix);
|
|
119
|
+
void output_as_IMPUTE(const string &output_file_prefix);
|
|
120
|
+
void output_as_LDhat_phased(const string &output_file_prefix, const string &chr);
|
|
121
|
+
void output_as_LDhat_unphased(const string &output_file_prefix, const string &chr);
|
|
122
|
+
void output_LDhat_locs_file(const string &output_file_prefix, const string &chr, unsigned int &n_sites_out);
|
|
123
|
+
void output_FORMAT_information(const string &output_file_prefix, const string &FORMAT_id);
|
|
124
|
+
|
|
125
|
+
void output_fst(const string &output_file_prefix, vcf_file &vcf_fst);
|
|
126
|
+
void output_fst_version_2(const string &output_file_prefix, const vector<string> &indv_files);
|
|
127
|
+
|
|
128
|
+
void output_sites_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
|
129
|
+
void output_indv_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
|
130
|
+
void output_discordance_by_site(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
|
131
|
+
void output_discordance_matrix(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
|
132
|
+
void output_discordance_by_indv(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
|
133
|
+
void output_switch_error(const string &output_file_prefix, vcf_file &diff_vcf_file);
|
|
134
|
+
|
|
135
|
+
void output_INFO_for_each_site(const string &output_file_prefix, const vector<string> &INFO_to_extract);
|
|
136
|
+
|
|
137
|
+
void print(const string &output_file_prefix, const set<string> &INFO_to_keep, bool keep_all_INFO=false);
|
|
138
|
+
|
|
139
|
+
int N_kept_individuals() const;
|
|
140
|
+
int N_kept_sites() const;
|
|
141
|
+
unsigned int N_genotypes_included(unsigned int entry_num) const;
|
|
142
|
+
|
|
143
|
+
private:
|
|
144
|
+
ifstream vcf_in;
|
|
145
|
+
gzFile gzvcf_in;
|
|
146
|
+
char *gz_readbuffer;
|
|
147
|
+
unsigned int gzMAX_LINE_LEN;
|
|
148
|
+
void open();
|
|
149
|
+
void close();
|
|
150
|
+
bool feof();
|
|
151
|
+
inline void read_line(string &out);
|
|
152
|
+
inline void read_CHROM_only(string &CHROM);
|
|
153
|
+
void read_CHROM_and_POS_only(string &CHROM, int &POS);
|
|
154
|
+
inline void read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS);
|
|
155
|
+
void parse_header(const string &line);
|
|
156
|
+
void parse_meta(const string &line);
|
|
157
|
+
streampos get_filepos();
|
|
158
|
+
void set_filepos(streampos &filepos);
|
|
159
|
+
|
|
160
|
+
bool has_body;
|
|
161
|
+
bool has_file_format;
|
|
162
|
+
bool has_genotypes;
|
|
163
|
+
bool has_header;
|
|
164
|
+
bool has_meta;
|
|
165
|
+
|
|
166
|
+
void scan_file(const string &chr="", const string &exclude_chr="", bool force_write_index=false);
|
|
167
|
+
inline char peek();
|
|
168
|
+
|
|
169
|
+
void return_indv_union(vcf_file &file2, map<string, pair< int, int> > &combined_individuals);
|
|
170
|
+
void return_site_union(vcf_file &file2, map<pair<string, int>, pair<int, int> > &out);
|
|
171
|
+
|
|
172
|
+
bool read_index_file(const string &index_filename);
|
|
173
|
+
void write_index_file(const string &index_filename);
|
|
174
|
+
|
|
175
|
+
void ByteSwap(unsigned char *b, int n) const;
|
|
176
|
+
int idx_read(gzFile &in, void *buffer, unsigned int len, size_t size);
|
|
177
|
+
void idx_write(gzFile &out, void *buffer, unsigned int len, size_t size);
|
|
178
|
+
|
|
179
|
+
bool big_endian_machine;
|
|
180
|
+
static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); };
|
|
181
|
+
|
|
182
|
+
};
|
|
183
|
+
|
|
184
|
+
#endif /* VCF_FILE_H_ */
|