ngs_server 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,3012 @@
1
+ /*
2
+ * vcf_file_output.cpp
3
+ *
4
+ * Created on: Aug 28, 2009
5
+ * Author: Adam Auton
6
+ * ($Revision: 249 $)
7
+ */
8
+ #include "vcf_file.h"
9
+
10
+ void vcf_file::output_frequency(const string &output_file_prefix, bool output_counts, bool suppress_allele_output)
11
+ {
12
+ // Output statistics of frequency at each site
13
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
14
+ error("Require Genotypes in VCF file in order to output Frequency Statistics.");
15
+
16
+ printLOG("Outputting Frequency Statistics...\n");
17
+ string output_file = output_file_prefix + ".frq";
18
+ if (output_counts)
19
+ output_file += ".count";
20
+
21
+ ofstream out(output_file.c_str());
22
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
23
+ if (suppress_allele_output == false)
24
+ {
25
+ out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{ALLELE:";
26
+ if (output_counts)
27
+ out << "COUNT}" << endl;
28
+ else
29
+ out << "FREQ}" << endl;
30
+ }
31
+ else
32
+ {
33
+ if (output_counts)
34
+ out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{COUNT}" << endl;
35
+ else
36
+ out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{FREQ}" << endl;
37
+ }
38
+
39
+ vector<int> allele_counts;
40
+ unsigned int N_non_missing_chr;
41
+ unsigned int N_alleles;
42
+ string vcf_line;
43
+ vcf_entry e(N_indv);
44
+ for (unsigned int s=0; s<N_entries; s++)
45
+ {
46
+ if (include_entry[s] == false)
47
+ continue;
48
+
49
+ get_vcf_entry(s, vcf_line);
50
+ e.reset(vcf_line);
51
+ e.parse_basic_entry(true);
52
+ e.parse_genotype_entries(true);
53
+ N_alleles = e.get_N_alleles();
54
+
55
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
56
+
57
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << N_alleles << "\t" << N_non_missing_chr;
58
+ if (output_counts)
59
+ {
60
+ if (suppress_allele_output == false)
61
+ {
62
+ out << "\t" << e.get_REF() << ":" << allele_counts[0];
63
+ for (unsigned int ui=1; ui<N_alleles; ui++)
64
+ {
65
+ out << "\t" << e.get_ALT_allele(ui-1) << ":" << allele_counts[ui];
66
+ }
67
+ out << endl;
68
+ }
69
+ else
70
+ {
71
+ for (unsigned ui=0; ui<N_alleles; ui++)
72
+ {
73
+ out << "\t" << allele_counts[ui];
74
+ }
75
+ out << endl;
76
+ }
77
+ }
78
+ else
79
+ {
80
+ double freq;
81
+ if (suppress_allele_output == false)
82
+ {
83
+ freq = allele_counts[0] / (double)N_non_missing_chr;
84
+ out << "\t" << e.get_REF() << ":" << freq;
85
+ for (unsigned int ui=1; ui<N_alleles; ui++)
86
+ {
87
+ freq = allele_counts[ui] / (double)N_non_missing_chr;
88
+ out << "\t" << e.get_ALT_allele(ui-1) << ":" << freq;
89
+ }
90
+ out << endl;
91
+ }
92
+ else
93
+ {
94
+ for (unsigned int ui=0; ui<N_alleles; ui++)
95
+ {
96
+ freq = allele_counts[ui] / (double)N_non_missing_chr;
97
+ out << "\t" << freq;
98
+ }
99
+ out << endl;
100
+ }
101
+ }
102
+ }
103
+ out.close();
104
+ }
105
+
106
+ void vcf_file::output_het(const string &output_file_prefix)
107
+ {
108
+ // Output statistics on Heterozygosity for each individual
109
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
110
+ error("Require Genotypes in VCF file in order to output Heterozygosity Statistics.");
111
+ // Following the calculations in PLINK....
112
+ // Note this assumes Biallelic SNPs.
113
+
114
+ printLOG("Outputting Individual Heterozygosity\n");
115
+
116
+ string output_file = output_file_prefix + ".het";
117
+ ofstream out(output_file.c_str());
118
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
119
+ out << "INDV\tO(HOM)\tE(HOM)\tN_SITES\tF" << endl;
120
+
121
+ // P(Homo) = F + (1-F)P(Homo by chance)
122
+ // P(Homo by chance) = p^2+q^2 for a biallelic locus.
123
+ // For an individual with N genotyped loci, we
124
+ // 1. count the total observed number of loci which are homozygous (O),
125
+ // 2. calculate the total expected number of loci homozygous by chance (E)
126
+ // Then, using the method of moments, we have
127
+ // O = NF + (1-F)E
128
+ // Which rearranges to give
129
+ // F = (O-E)/(N-E)
130
+
131
+ // First, calc frequency of each site (should really move this to a subroutine)
132
+ vector<double> freq(N_entries, 0.0);
133
+ vector<int> allele_counts;
134
+ vector<unsigned int> N_non_missing_chr(N_entries,0);
135
+ string vcf_line;
136
+ vcf_entry e(N_indv);
137
+ for (unsigned int s=0; s<N_entries; s++)
138
+ {
139
+ if (include_entry[s] == false)
140
+ continue;
141
+
142
+ get_vcf_entry(s, vcf_line);
143
+ e.reset(vcf_line);
144
+ e.parse_basic_entry(true);
145
+
146
+ if (e.get_N_alleles() != 2)
147
+ {
148
+ one_off_warning("\tIndividual Heterozygosity: Only using biallelic SNPs.");
149
+ continue;
150
+ }
151
+
152
+ e.parse_genotype_entries(true);
153
+
154
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
155
+ {
156
+ one_off_warning("\tIndividual Heterozygosity: Only using fully diploid SNPs.");
157
+ continue;
158
+ }
159
+
160
+ // Frequency of non-reference allele
161
+ e.get_allele_counts(allele_counts, N_non_missing_chr[s], include_indv, include_genotype[s]);
162
+
163
+ if (N_non_missing_chr[s] > 0)
164
+ freq[s] = allele_counts[1] / double(N_non_missing_chr[s]);
165
+ else
166
+ freq[s] = -1;
167
+ }
168
+
169
+ vector<int> N_sites_included(N_indv, 0);
170
+ vector<int> N_obs_hom(N_indv, 0);
171
+ vector<double> N_expected_hom(N_indv, 0.0);
172
+ pair<int, int> alleles;
173
+
174
+ for (unsigned int s=0; s<N_entries; s++)
175
+ {
176
+ if (include_entry[s] == false)
177
+ continue;
178
+
179
+ get_vcf_entry(s, vcf_line);
180
+ e.reset(vcf_line);
181
+ e.parse_basic_entry(true);
182
+
183
+ if (e.get_N_alleles() != 2)
184
+ continue;
185
+
186
+ e.parse_genotype_entries(true);
187
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
188
+ continue;
189
+
190
+ if ((freq[s] <= numeric_limits<double>::epsilon()) || (1.0 - freq[s] <= numeric_limits<double>::epsilon()))
191
+ continue;
192
+
193
+ for (unsigned int ui=0; ui<N_indv; ui++)
194
+ {
195
+ if (include_indv[ui] == false)
196
+ continue;
197
+
198
+ if (include_genotype[s][ui] == true)
199
+ {
200
+ e.get_indv_GENOTYPE_ids(ui, alleles);
201
+ if ((alleles.first != -1) && (alleles.second != -1))
202
+ {
203
+ N_sites_included[ui]++;
204
+ if (alleles.first == alleles.second)
205
+ N_obs_hom[ui]++;
206
+ }
207
+
208
+ /////////////////////////
209
+ // Expected homozygosity
210
+ // E = 1 - (2pq . 2N/(2N-1))
211
+ // (Using Nei's unbiased estimator)
212
+ N_expected_hom[ui] += 1.0 - (2.0 * freq[s] * (1.0 - freq[s]) * (N_non_missing_chr[s] / (N_non_missing_chr[s] - 1.0)));
213
+ }
214
+ }
215
+ }
216
+
217
+ out.setf(ios::fixed,ios::floatfield);
218
+ for (unsigned int ui=0; ui<N_indv; ui++)
219
+ {
220
+ if (include_indv[ui] == false)
221
+ continue;
222
+ if (N_sites_included[ui] > 0)
223
+ {
224
+ double F = (N_obs_hom[ui] - N_expected_hom[ui]) / double(N_sites_included[ui] - N_expected_hom[ui]);
225
+ out << indv[ui] << "\t" << N_obs_hom[ui] << "\t";
226
+ out.precision(1);
227
+ out << N_expected_hom[ui] << "\t";
228
+ out.precision(5);
229
+ out << N_sites_included[ui] << "\t" << F << endl;
230
+ }
231
+ }
232
+
233
+ out.close();
234
+ }
235
+
236
+ void vcf_file::output_hwe(const string &output_file_prefix)
237
+ {
238
+ // Output HWE statistics for each site as described in Wigginton, Cutler, and Abecasis (2005)
239
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
240
+ error("Require Genotypes in VCF file in order to output HWE Statistics.");
241
+ // Note this assumes Biallelic SNPs.
242
+ printLOG("Outputting HWE statistics (but only for biallelic loci)\n");
243
+
244
+ string output_file = output_file_prefix + ".hwe";
245
+ ofstream out(output_file.c_str());
246
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
247
+ out << "CHR\tPOS\tOBS(HOM1/HET/HOM2)\tE(HOM1/HET/HOM2)\tChiSq\tP" << endl;
248
+
249
+ /* PLINK code:
250
+ // b11 = Nhom1, b12 = Nhet, b22 = Nhom2
251
+ double tot = b11 + b12 + b22;
252
+ double exp_11 = freq * freq * tot;
253
+ double exp_12 = 2 * freq * (1-freq) * tot;
254
+ double exp_22 = (1-freq) * (1-freq) * tot;
255
+
256
+ double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11
257
+ + ( (b12-exp_12)*(b12-exp_12) ) / exp_12
258
+ + ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ;
259
+
260
+ p = chiprobP(chisq,1);
261
+ */
262
+
263
+ double freq;
264
+ unsigned int b11, b12, b22;
265
+ double exp_11, exp_12, exp_22;
266
+ double chisq;
267
+ double tot;
268
+ double p;
269
+ unsigned int precision = out.precision();
270
+ vector<int> allele_counts;
271
+ unsigned int N_non_missing_chr;
272
+ string vcf_line;
273
+ vcf_entry e(N_indv);
274
+ for (unsigned int s=0; s<N_entries; s++)
275
+ {
276
+ if (include_entry[s] == false)
277
+ continue;
278
+
279
+ get_vcf_entry(s, vcf_line);
280
+ e.reset(vcf_line);
281
+ e.parse_basic_entry(true);
282
+
283
+ if (e.get_N_alleles() != 2)
284
+ {
285
+ one_off_warning("\tHWE: Only using biallelic SNPs.");
286
+ continue; // Isn't biallelic
287
+ }
288
+
289
+ e.parse_genotype_entries(true);
290
+
291
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
292
+ {
293
+ one_off_warning("\tHWE: Only using fully diploid SNPs.");
294
+ continue; // Isn't diploid
295
+ }
296
+
297
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
298
+ freq = allele_counts[0] / (double)N_non_missing_chr;
299
+ e.get_genotype_counts(include_indv, include_genotype[s], b11, b12, b22);
300
+ tot = b11 + b12 + b22;
301
+ exp_11 = freq * freq * tot;
302
+ exp_12 = 2.0 * freq * (1.0-freq) * tot;
303
+ exp_22 = (1.0-freq) * (1.0-freq) * tot;
304
+
305
+ chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11
306
+ + ( (b12-exp_12)*(b12-exp_12) ) / exp_12
307
+ + ( (b22-exp_22)*(b22-exp_22) ) / exp_22;
308
+
309
+ p = vcf_entry::SNPHWE(b12, b11, b22);
310
+ out << e.get_CHROM() << "\t" << e.get_POS();
311
+ out << "\t" << b11 << "/" << b12 << "/" << b22;
312
+ out.precision(2);
313
+ out << fixed << "\t" << exp_11 << "/" << exp_12 << "/" << exp_22;
314
+ out.precision(precision);
315
+ out << "\t" << chisq << "\t" << p << endl;
316
+ }
317
+ }
318
+
319
+ void vcf_file::output_individuals_by_mean_depth(const string &output_file_prefix)
320
+ {
321
+ // Output information regarding the mean depth for each individual
322
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
323
+ error("Require Genotypes in VCF file in order to output Individuals by Mean Depth Statistics.");
324
+
325
+ printLOG("Outputting Mean Depth by Individual\n");
326
+ string output = output_file_prefix + ".idepth";
327
+ ofstream out(output.c_str());
328
+ if (!out.is_open())
329
+ error("Could not open Individual Depth Output File: " + output, 2);
330
+ out << "INDV\tN_SITES\tMEAN_DEPTH" << endl;
331
+ vector<double> depth_sum(N_indv, 0.0);
332
+ vector<int> count(N_indv, 0);
333
+ int depth;
334
+ string vcf_line;
335
+ vcf_entry e(N_indv);
336
+ for (unsigned int s=0; s<N_entries; s++)
337
+ {
338
+ if (include_entry[s] == false)
339
+ continue;
340
+
341
+ get_vcf_entry(s, vcf_line);
342
+ e.reset(vcf_line);
343
+
344
+ for (unsigned int ui=0; ui<N_indv; ui++)
345
+ {
346
+ if (include_indv[ui] == false)
347
+ continue;
348
+
349
+ if (include_genotype[s][ui] == true)
350
+ {
351
+ e.parse_genotype_entry(ui, false, false, true);
352
+ depth = e.get_indv_DEPTH(ui);
353
+ if (depth >= 0)
354
+ {
355
+ depth_sum[ui] += depth;
356
+ count[ui]++;
357
+ }
358
+ }
359
+ }
360
+ }
361
+
362
+ for (unsigned int ui=0; ui<N_indv; ui++)
363
+ {
364
+ if (include_indv[ui] == false)
365
+ continue;
366
+
367
+ double mean_depth = depth_sum[ui] / count[ui];
368
+ out << indv[ui] << "\t" << count[ui] << "\t" << mean_depth << endl;
369
+ }
370
+
371
+ out.close();
372
+ }
373
+
374
+ void vcf_file::output_SNP_density(const string &output_file_prefix, int bin_size)
375
+ {
376
+ // Output SNP density (technically variant density)
377
+ if (bin_size <= 0)
378
+ return;
379
+ printLOG("Outputting SNP density\n");
380
+
381
+ string output = output_file_prefix + ".snpden";
382
+ ofstream out(output.c_str());
383
+ if (!out.is_open())
384
+ error("Could not open SNP Density Output File: " + output, 2);
385
+
386
+ // Find maximum position
387
+ unsigned int s;
388
+ map<string, int> max_pos;
389
+ string vcf_line;
390
+ string CHROM; int POS;
391
+ vcf_entry e(N_indv);
392
+ for (s=0; s<N_entries; s++)
393
+ {
394
+ if (include_entry[s] == true)
395
+ {
396
+ //get_vcf_entry(s, vcf_line);
397
+ //e.reset(vcf_line);
398
+ //e.parse_basic_entry();
399
+
400
+ //CHROM = e.get_CHROM();
401
+ //POS = e.get_POS();
402
+
403
+ set_filepos(entry_file_locations[s]);
404
+ read_CHROM_and_POS_only(CHROM, POS);
405
+ if (max_pos.find(CHROM) != max_pos.end())
406
+ {
407
+ if (POS > max_pos[CHROM])
408
+ max_pos[CHROM] = POS;
409
+ }
410
+ else
411
+ max_pos[CHROM] = POS;
412
+ }
413
+ }
414
+
415
+ map<string, int>::iterator it;
416
+
417
+ unsigned int N_bins;
418
+ map<string, vector<int> > bins;
419
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
420
+ {
421
+ CHROM = (*it).first;
422
+ N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size));
423
+ bins[CHROM].resize(N_bins, 0);
424
+ }
425
+
426
+
427
+ unsigned int idx;
428
+ double C = 1.0 / double(bin_size);
429
+ for (s=0; s<N_entries; s++)
430
+ {
431
+ if (include_entry[s] == true)
432
+ {
433
+ //get_vcf_entry(s, vcf_line);
434
+ //e.reset(vcf_line);
435
+ //e.parse_basic_entry();
436
+
437
+ //CHROM = e.get_CHROM();
438
+ //POS = e.get_POS();
439
+ set_filepos(entry_file_locations[s]);
440
+ read_CHROM_and_POS_only(CHROM, POS);
441
+ idx = (unsigned int)(POS * C);
442
+ bins[CHROM][idx]++;
443
+ }
444
+ }
445
+
446
+ out << "CHROM\tBIN_START\tSNP_COUNT\tSNPS/KB" << endl;
447
+ double sum1=0.0, sum2=0.0;
448
+ int bin_tot;
449
+ C = 1000.0 / bin_size;
450
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
451
+ {
452
+ bool output = false;
453
+ CHROM = (*it).first;
454
+ sum2 += max_pos[CHROM];
455
+ for (s=0; s<bins[CHROM].size(); s++)
456
+ {
457
+ bin_tot = bins[CHROM][s];
458
+ sum1 += bin_tot;
459
+ if (bin_tot > 0)
460
+ output = true;
461
+ if (output == true)
462
+ out << CHROM << "\t" << s*bin_size << "\t" << bin_tot << "\t" << bin_tot * C << endl;
463
+ }
464
+ }
465
+ out.close();
466
+
467
+ double mean_SNP_density = sum1 / sum2 * 1000;
468
+ printLOG("Mean SNP density: " + dbl2str(mean_SNP_density, 5) + " SNPs / kb\n");
469
+ }
470
+
471
+ void vcf_file::output_missingness(const string &output_file_prefix)
472
+ {
473
+ // Output missingness by individual and site
474
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
475
+ error("Require Genotypes in VCF file in order to output Missingness Statistics.");
476
+
477
+ printLOG("Outputting Site and Individual Missingness\n");
478
+ string output1 = output_file_prefix + ".imiss";
479
+ ofstream out1(output1.c_str());
480
+ if (!out1.is_open())
481
+ error("Could not open Individual Missingness Output File: " + output1, 3);
482
+
483
+ string output2 = output_file_prefix + ".lmiss";
484
+ ofstream out2(output2.c_str());
485
+ if (!out2.is_open())
486
+ error("Could not open Site Missingness Output File: " + output2, 4);
487
+
488
+ out1 << "INDV\tN_DATA\tN_GENOTYPES_FILTERED\tN_MISS\tF_MISS" << endl;
489
+ unsigned int ui, s;
490
+ vector<unsigned int> indv_N_missing(N_indv, 0), indv_N_tot(N_indv, 0);
491
+ vector<unsigned int> indv_N_geno_filtered(N_indv, 0);
492
+ unsigned int site_N_missing, site_N_tot, site_N_geno_filtered;
493
+ pair<int, int> alleles;
494
+ string vcf_line;
495
+ vcf_entry e(N_indv);
496
+
497
+ out2 << "CHR\tPOS\tN_DATA\tN_GENOTYPE_FILTERED\tN_MISS\tF_MISS" << endl;
498
+ for (s=0; s<N_entries; s++)
499
+ {
500
+ if (include_entry[s] == false)
501
+ continue;
502
+
503
+ get_vcf_entry(s, vcf_line);
504
+ e.reset(vcf_line);
505
+ e.parse_basic_entry();
506
+
507
+ site_N_missing = 0;
508
+ site_N_tot = 0;
509
+ site_N_geno_filtered = 0;
510
+ for (ui=0; ui<N_indv; ui++)
511
+ {
512
+ if (include_indv[ui] == false)
513
+ continue;
514
+ if (include_genotype[s][ui] == false)
515
+ {
516
+ site_N_geno_filtered++;
517
+ indv_N_geno_filtered[ui]++;
518
+ continue;
519
+ }
520
+
521
+ e.parse_genotype_entry(ui, true);
522
+ e.get_indv_GENOTYPE_ids(ui, alleles);
523
+ if (alleles.first == -1)
524
+ {
525
+ site_N_missing++;
526
+ indv_N_missing[ui]++;
527
+ }
528
+ indv_N_tot[ui]++;
529
+
530
+ if (alleles.second == -1)
531
+ {
532
+ site_N_missing++;
533
+ }
534
+ site_N_tot+=2;
535
+
536
+ if ((alleles.second == -1) && (e.get_indv_PHASE(ui) == '|'))
537
+ { // Phased missing genotypes indicate haploid genome
538
+ site_N_tot--;
539
+ }
540
+ }
541
+ out2 << e.get_CHROM() << "\t" << e.get_POS() << "\t" << site_N_tot << "\t" << site_N_geno_filtered << "\t";
542
+ out2 << site_N_missing << "\t" << double(site_N_missing) / double(site_N_tot) << endl;
543
+ }
544
+
545
+ for (ui=0; ui<N_indv; ui++)
546
+ {
547
+ if (include_indv[ui] == false)
548
+ continue;
549
+ out1 << indv[ui] << "\t" << indv_N_tot[ui] << "\t";
550
+ out1 << indv_N_geno_filtered[ui] << "\t" << indv_N_missing[ui] << "\t";
551
+ out1 << indv_N_missing[ui] / double(indv_N_tot[ui]) << endl;
552
+ }
553
+
554
+ out2.close();
555
+ out1.close();
556
+ }
557
+
558
+ void vcf_file::output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2)
559
+ {
560
+ // Output pairwise LD statistics, using traditional r^2. Requires phased haplotypes.
561
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
562
+ error("Require Genotypes in VCF file in order to output LD Statistics.");
563
+
564
+ unsigned int s, s2;
565
+ unsigned int ui;
566
+
567
+ printLOG("Outputting Pairwise LD (phased bi-allelic only)\n");
568
+ string output = output_file_prefix + ".hap.ld";
569
+ ofstream out(output.c_str());
570
+ if (!out.is_open())
571
+ error("Could not open LD Output File: " + output, 3);
572
+
573
+ out << "CHR\tPOS1\tPOS2\tN_CHR\tR^2\tD\tDprime" << endl;
574
+
575
+ //For D, D' computations
576
+ double D, Dmax, Dprime;
577
+ int x11, x12, x21, x22;
578
+ double p1, p2, q1, q2;
579
+ double rel_x11, rel_x12, rel_x21, rel_x22;
580
+
581
+ unsigned int chr_count;
582
+ double r2;
583
+ int sx, sy;
584
+ double X, X2, Y, Y2, XY;
585
+ double var1, var2, cov12;
586
+ pair<int,int> geno1, geno2;
587
+ string vcf_line, vcf_line2;
588
+ vcf_entry e(N_indv), e2(N_indv);
589
+ for (s=0; s<(N_entries-1); s++)
590
+ {
591
+ if (include_entry[s] == false)
592
+ continue;
593
+
594
+ get_vcf_entry(s, vcf_line);
595
+ e.reset(vcf_line);
596
+ e.parse_basic_entry(true);
597
+
598
+ if (e.get_N_alleles() != 2)
599
+ {
600
+ one_off_warning("\tLD: Only using biallelic SNPs.");
601
+ continue; // Isn't biallelic
602
+ }
603
+
604
+ for (s2 = s+1; s2<N_entries; s2++)
605
+ {
606
+ if (include_entry[s2] == false)
607
+ continue;
608
+
609
+ if (int(s2 - s) > snp_window_size)
610
+ {
611
+ s2 = N_entries; // SNPs sorted, so no need to go any further
612
+ continue;
613
+ }
614
+
615
+ get_vcf_entry(s2, vcf_line2);
616
+ e2.reset(vcf_line2);
617
+ e2.parse_basic_entry(true);
618
+
619
+ if (e.get_CHROM() != e2.get_CHROM())
620
+ {
621
+ s2 = N_entries; // No need to go any further (assuming SNPs are sorted)
622
+ continue;
623
+ }
624
+
625
+ if ((e2.get_POS() - e.get_POS()) > bp_window_size)
626
+ {
627
+ s2 = N_entries; // No need to go any further (assuming SNPs are sorted)
628
+ continue;
629
+ }
630
+
631
+ if (e2.get_N_alleles() != 2)
632
+ {
633
+ one_off_warning("\tLD: Only using biallelic SNPs.");
634
+ continue;
635
+ }
636
+
637
+ x11=0; x12=0; x21=0; x22=0;
638
+
639
+ X=0, X2=0; Y=0; Y2=0; XY=0;
640
+ chr_count = 0;
641
+ for (ui=0; ui<N_indv; ui++)
642
+ {
643
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
644
+ continue;
645
+
646
+ e.parse_genotype_entry(ui, true);
647
+ e.get_indv_GENOTYPE_ids(ui, geno1);
648
+
649
+ e2.parse_genotype_entry(ui, true);
650
+ e2.get_indv_GENOTYPE_ids(ui, geno2);
651
+
652
+ if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
653
+ {
654
+ one_off_warning("\tLD: Only using diploid individuals.");
655
+ continue;
656
+ }
657
+
658
+ if ((e.get_indv_PHASE(ui) != '|') || (e2.get_indv_PHASE(ui) != '|'))
659
+ error("Require phased haplotypes for r^2 calculation (use --phased)\n");
660
+
661
+ for (unsigned int c=0; c<2; c++)
662
+ {
663
+ int allele1, allele2;
664
+ if (c==0)
665
+ {
666
+ allele1 = geno1.first;
667
+ allele2 = geno2.first;
668
+ }
669
+ else
670
+ {
671
+ allele1 = geno1.second;
672
+ allele2 = geno2.second;
673
+ }
674
+
675
+ if ((allele1 == -1) || (allele2 == -1))
676
+ continue;
677
+
678
+ if (allele1 == 0 && allele2 == 0){
679
+ x11++;
680
+ } else if (allele1 == 0 && allele2 != 0){
681
+ x12++;
682
+ } else if (allele1 != 0 && allele2 == 0){
683
+ x21++;
684
+ } else { // (allele1 !=0 && allele2 != 0)
685
+ x22++;
686
+ }
687
+
688
+ sx=0, sy=0;
689
+ if (allele1 == 0)
690
+ sx += 1;
691
+
692
+ if (allele2 == 0)
693
+ sy += 1;
694
+
695
+ X += sx; Y += sy;
696
+ XY += sx*sy;
697
+ sx *= sx; sy *= sy;
698
+ X2 += sx;
699
+ Y2 += sy;
700
+
701
+ chr_count++;
702
+ }
703
+ }
704
+
705
+ rel_x11 = 1.0*x11/chr_count;
706
+ rel_x12 = 1.0*x12/chr_count;
707
+ rel_x21 = 1.0*x21/chr_count;
708
+ rel_x22 = 1.0*x22/chr_count;
709
+ p1 = rel_x11 + rel_x12;
710
+ p2 = rel_x21 + rel_x22;
711
+ q1 = rel_x11 + rel_x21;
712
+ q2 = rel_x12 + rel_x22;
713
+ D = rel_x11 - p1*q1;
714
+ if (D < 0){
715
+ Dmax = min(p1*q1,p2*q2);
716
+ } else {
717
+ Dmax = min(p1*q2,p2*q1);
718
+ };
719
+ Dprime = D/Dmax;
720
+
721
+ X /= chr_count; X2 /= chr_count;
722
+ Y /= chr_count; Y2 /= chr_count;
723
+ XY /= chr_count;
724
+
725
+ var1 = X2 - X*X;
726
+ var2 = Y2 - Y*Y;
727
+ cov12 = XY - X*Y;
728
+
729
+ r2 = cov12 * cov12 / (var1 * var2);
730
+
731
+ if (min_r2 > 0)
732
+ if ((r2 < min_r2) | (r2 != r2))
733
+ continue;
734
+
735
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << chr_count << "\t" << r2 << "\t" << D << "\t" << Dprime << "\t" << endl;
736
+ }
737
+ }
738
+ out.close();
739
+ }
740
+
741
+ void vcf_file::output_genotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2)
742
+ {
743
+ // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared
744
+ // correlation coefficient between genotypes numbered as 0, 1, 2.
745
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
746
+ error("Require Genotypes in VCF file in order to output LD Statistics.");
747
+
748
+ unsigned int s, s2;
749
+ unsigned int ui;
750
+
751
+ printLOG("Outputting Pairwise LD (bi-allelic only)\n");
752
+ string output = output_file_prefix + ".geno.ld";
753
+ ofstream out(output.c_str());
754
+ if (!out.is_open())
755
+ error("Could not open LD Output File: " + output, 3);
756
+
757
+ out << "CHR\tPOS1\tPOS2\tN_INDV\tR^2" << endl;
758
+
759
+ unsigned int indv_count;
760
+ double r2;
761
+ int sx, sy;
762
+ double X, X2, Y, Y2, XY;
763
+ double var1, var2, cov12;
764
+ pair<int,int> geno1, geno2;
765
+ string vcf_line, vcf_line2;
766
+ vcf_entry e(N_indv), e2(N_indv);
767
+ for (s=0; s<(N_entries-1); s++)
768
+ {
769
+ if (include_entry[s] == false)
770
+ continue;
771
+
772
+ get_vcf_entry(s, vcf_line);
773
+ e.reset(vcf_line);
774
+ e.parse_basic_entry(true);
775
+
776
+ if (e.get_N_alleles() != 2)
777
+ {
778
+ one_off_warning("\tgenoLD: Only using biallelic SNPs.");
779
+ continue; // Isn't biallelic
780
+ }
781
+
782
+ for (s2 = s+1; s2<N_entries; s2++)
783
+ {
784
+ if (include_entry[s2] == false)
785
+ continue;
786
+
787
+ if (int(s2 - s) > snp_window_size)
788
+ {
789
+ s2 = N_entries; // SNPs sorted, so no need to go any further
790
+ continue;
791
+ }
792
+
793
+ get_vcf_entry(s2, vcf_line2);
794
+ e2.reset(vcf_line2);
795
+ e2.parse_basic_entry(true);
796
+
797
+ if (e2.get_N_alleles() != 2)
798
+ {
799
+ one_off_warning("\tgenoLD: Only using biallelic SNPs.");
800
+ continue; // Isn't biallelic
801
+ }
802
+
803
+ if (e.get_CHROM() != e2.get_CHROM())
804
+ {
805
+ s2 = N_entries; // SNPs sorted, so no need to go any further
806
+ continue;
807
+ }
808
+
809
+ if ((e2.get_POS() - e.get_POS()) > bp_window_size)
810
+ {
811
+ s2 = N_entries; // SNPs sorted, so no need to go any further
812
+ continue;
813
+ }
814
+
815
+ X=0, X2=0; Y=0; Y2=0; XY=0;
816
+ indv_count = 0;
817
+ for (ui=0; ui<N_indv; ui++)
818
+ {
819
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
820
+ continue;
821
+
822
+ e.parse_genotype_entry(ui, true);
823
+ e.get_indv_GENOTYPE_ids(ui, geno1);
824
+
825
+ e2.parse_genotype_entry(ui, true);
826
+ e2.get_indv_GENOTYPE_ids(ui, geno2);
827
+
828
+ if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
829
+ {
830
+ one_off_warning("\tgenoLD: Only using diploid individuals.");
831
+ continue;
832
+ }
833
+
834
+ if ((geno1.first == -1) || (geno1.second == -1))
835
+ continue;
836
+
837
+ if ((geno2.first == -1) || (geno2.second == -1))
838
+ continue;
839
+
840
+ sx=0, sy=0;
841
+ if (geno1.first == geno1.second)
842
+ {
843
+ if (geno1.first == 0)
844
+ {
845
+ sx = 2;
846
+ }
847
+ }
848
+ else
849
+ sx = 1;
850
+
851
+ if (geno2.first == geno2.second)
852
+ {
853
+ if (geno2.first == 0)
854
+ {
855
+ sy = 2;
856
+ }
857
+ }
858
+ else
859
+ sy = 1;
860
+
861
+ X += sx; Y += sy;
862
+ XY += sx*sy;
863
+ sx *= sx; sy *= sy;
864
+ X2 += sx; Y2 += sy;
865
+
866
+ indv_count++;
867
+ }
868
+
869
+ X /= indv_count; X2 /= indv_count;
870
+ Y /= indv_count; Y2 /= indv_count;
871
+ XY /= indv_count;
872
+
873
+ var1 = X2 - X*X;
874
+ var2 = Y2 - Y*Y;
875
+ cov12 = XY - X*Y;
876
+
877
+ r2 = cov12 * cov12 / (var1 * var2);
878
+
879
+ if (min_r2 > 0)
880
+ if ((r2 < min_r2) | (r2 != r2))
881
+ continue;
882
+
883
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl;
884
+ }
885
+ }
886
+ out.close();
887
+ }
888
+
889
+ // TODO - provide similar function for haplotype r2.
890
+ void vcf_file::output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2)
891
+ {
892
+ // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared
893
+ // correlation coefficient between genotypes numbered as 0, 1, 2.
894
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
895
+ error("Require Genotypes in VCF file in order to output LD Statistics.");
896
+
897
+ unsigned int s, s2;
898
+ unsigned int ui;
899
+
900
+ printLOG("Outputting Interchromosomal Pairwise LD (bi-allelic only)\n");
901
+ string output = output_file_prefix + ".interchrom.geno.ld";
902
+ ofstream out(output.c_str());
903
+ if (!out.is_open())
904
+ error("Could not open LD Output File: " + output, 3);
905
+
906
+ out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl;
907
+
908
+ unsigned int indv_count;
909
+ double r2;
910
+ int sx, sy;
911
+ double X, X2, Y, Y2, XY;
912
+ double var1, var2, cov12;
913
+ pair<int,int> geno1, geno2;
914
+ string vcf_line, vcf_line2;
915
+ vcf_entry e(N_indv), e2(N_indv);
916
+ for (s=0; s<(N_entries-1); s++)
917
+ {
918
+ if (include_entry[s] == false)
919
+ continue;
920
+
921
+ get_vcf_entry(s, vcf_line);
922
+ e.reset(vcf_line);
923
+ e.parse_basic_entry(true);
924
+
925
+ if (e.get_N_alleles() != 2)
926
+ {
927
+ one_off_warning("\tinterchromLD: Only using biallelic SNPs.");
928
+ continue; // Isn't biallelic
929
+ }
930
+
931
+ for (s2 = s+1; s2<N_entries; s2++)
932
+ {
933
+ if (include_entry[s2] == false)
934
+ continue;
935
+
936
+ get_vcf_entry(s2, vcf_line2);
937
+ e2.reset(vcf_line2);
938
+ e2.parse_basic_entry(true);
939
+
940
+ if (e2.get_N_alleles() != 2)
941
+ {
942
+ one_off_warning("\tinterchromLD: Only using biallelic SNPs.");
943
+ continue; // Isn't biallelic
944
+ }
945
+
946
+ if (e.get_CHROM() == e2.get_CHROM())
947
+ {
948
+ continue;
949
+ }
950
+
951
+ X=0, X2=0; Y=0; Y2=0; XY=0;
952
+ indv_count = 0;
953
+ for (ui=0; ui<N_indv; ui++)
954
+ {
955
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
956
+ continue;
957
+
958
+ e.parse_genotype_entry(ui, true);
959
+ e.get_indv_GENOTYPE_ids(ui, geno1);
960
+
961
+ e2.parse_genotype_entry(ui, true);
962
+ e2.get_indv_GENOTYPE_ids(ui, geno2);
963
+
964
+ if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
965
+ {
966
+ one_off_warning("\tinterchromLD: Only using diploid individuals.");
967
+ continue;
968
+ }
969
+
970
+ if ((geno1.first == -1) || (geno1.second == -1))
971
+ continue;
972
+
973
+ if ((geno2.first == -1) || (geno2.second == -1))
974
+ continue;
975
+
976
+ sx=0, sy=0;
977
+ if (geno1.first == geno1.second)
978
+ {
979
+ if (geno1.first == 0)
980
+ {
981
+ sx = 2;
982
+ }
983
+ }
984
+ else
985
+ sx = 1;
986
+
987
+ if (geno2.first == geno2.second)
988
+ {
989
+ if (geno2.first == 0)
990
+ {
991
+ sy = 2;
992
+ }
993
+ }
994
+ else
995
+ sy = 1;
996
+
997
+ X += sx; Y += sy;
998
+ XY += sx*sy;
999
+ sx *= sx; sy *= sy;
1000
+ X2 += sx; Y2 += sy;
1001
+
1002
+ indv_count++;
1003
+ }
1004
+
1005
+ X /= indv_count; X2 /= indv_count;
1006
+ Y /= indv_count; Y2 /= indv_count;
1007
+ XY /= indv_count;
1008
+
1009
+ var1 = X2 - X*X;
1010
+ var2 = Y2 - Y*Y;
1011
+ cov12 = XY - X*Y;
1012
+
1013
+ r2 = cov12 * cov12 / (var1 * var2);
1014
+
1015
+ if (min_r2 > 0)
1016
+ if ((r2 < min_r2) | (r2 != r2))
1017
+ continue;
1018
+
1019
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_CHROM() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl;
1020
+ }
1021
+ }
1022
+ out.close();
1023
+ }
1024
+
1025
+ void vcf_file::output_singletons(const string &output_file_prefix)
1026
+ {
1027
+ // Locate and output singletons (and private doubletons)
1028
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1029
+ error("Require Genotypes in VCF file in order to output Singletons.");
1030
+
1031
+ printLOG("Outputting Singleton Locations\n");
1032
+ string output = output_file_prefix + ".singletons";
1033
+ ofstream out(output.c_str());
1034
+ if (!out.is_open())
1035
+ error("Could not open Singleton Output File: " + output, 3);
1036
+
1037
+ out << "CHROM\tPOS\tSINGLETON/DOUBLETON\tALLELE\tINDV" << endl;
1038
+
1039
+ unsigned int ui;
1040
+ int a;
1041
+ vector<int> allele_counts;
1042
+ unsigned int N_non_missing_chr;
1043
+ unsigned int N_alleles;
1044
+ pair<int, int> geno;
1045
+ string allele;
1046
+ string vcf_line;
1047
+ vcf_entry e(N_indv);
1048
+ for (unsigned int s=0; s<N_entries; s++)
1049
+ {
1050
+ if (include_entry[s] == false)
1051
+ continue;
1052
+
1053
+ get_vcf_entry(s, vcf_line);
1054
+ e.reset(vcf_line);
1055
+ e.parse_basic_entry(true);
1056
+ e.parse_genotype_entries(true);
1057
+
1058
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
1059
+ N_alleles = e.get_N_alleles();
1060
+
1061
+ for (a=0; a<(signed)N_alleles; a++)
1062
+ {
1063
+ if (allele_counts[a] == 1)
1064
+ { // Singleton
1065
+ for (ui=0; ui<N_indv; ui++)
1066
+ {
1067
+ if (include_indv[ui] == false)
1068
+ continue;
1069
+ e.get_indv_GENOTYPE_ids(ui, geno);
1070
+ if ((geno.first == a) || (geno.second == a))
1071
+ {
1072
+ e.get_allele(a, allele);
1073
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\tS\t" << allele << "\t" << indv[ui] << endl;
1074
+ ui=N_indv;
1075
+ break;
1076
+ }
1077
+ }
1078
+ }
1079
+ else if (allele_counts[a] == 2)
1080
+ { // Possible doubleton
1081
+ for (ui=0; ui<N_indv; ui++)
1082
+ {
1083
+ if (include_indv[ui] == false)
1084
+ continue;
1085
+ e.get_indv_GENOTYPE_ids(ui, geno);
1086
+ if ((geno.first == a) && (geno.second == a))
1087
+ {
1088
+ e.get_allele(a, allele);
1089
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\tD\t" << allele << "\t" << indv[ui] << endl;
1090
+ ui=N_indv;
1091
+ break;
1092
+ }
1093
+ }
1094
+ }
1095
+ }
1096
+ }
1097
+
1098
+ out.close();
1099
+ }
1100
+
1101
+ void vcf_file::output_genotype_depth(const string &output_file_prefix)
1102
+ {
1103
+ // Output genotype depth in tab-delimited format.
1104
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1105
+ error("Require Genotypes in VCF file in order to output Genotype Depth Statistics.");
1106
+
1107
+ printLOG("Outputting Depth for Each Genotype\n");
1108
+ string output = output_file_prefix + ".gdepth";
1109
+ ofstream out(output.c_str());
1110
+ if (!out.is_open())
1111
+ error("Could not open Genotype Depth Output File: " + output, 7);
1112
+
1113
+ out << "CHROM\tPOS";
1114
+ for (unsigned int ui=0; ui<N_indv; ui++)
1115
+ {
1116
+ if (include_indv[ui] == false)
1117
+ continue;
1118
+
1119
+ out << "\t" << indv[ui];
1120
+ }
1121
+ out << endl;
1122
+
1123
+ string vcf_line;
1124
+ vcf_entry e(N_indv);
1125
+ for (unsigned int s=0; s<N_entries; s++)
1126
+ {
1127
+ if (include_entry[s] == false)
1128
+ continue;
1129
+
1130
+ get_vcf_entry(s, vcf_line);
1131
+ e.reset(vcf_line);
1132
+ e.parse_basic_entry();
1133
+
1134
+ out << e.get_CHROM() << "\t" << e.get_POS();
1135
+
1136
+ for (unsigned int ui=0; ui<N_indv; ui++)
1137
+ {
1138
+ if (include_indv[ui] == false)
1139
+ continue;
1140
+
1141
+ if (include_genotype[s][ui] == true)
1142
+ {
1143
+ e.parse_genotype_entry(ui, false, false, true);
1144
+ out << "\t" << e.get_indv_DEPTH(ui);
1145
+ }
1146
+ else
1147
+ out << "\t-1";
1148
+ }
1149
+ out << endl;
1150
+ }
1151
+ out.close();
1152
+ }
1153
+
1154
+ void vcf_file::output_FILTER_summary(const string &output_file_prefix)
1155
+ {
1156
+ // Output a summary of sites in various FILTER categories.
1157
+ printLOG("Outputting Filter Summary (for bi-allelic loci only)\n");
1158
+
1159
+ map<string, unsigned int> model_to_idx;
1160
+ model_to_idx["AC"] = 0;
1161
+ model_to_idx["AG"] = 1;
1162
+ model_to_idx["AT"] = 2;
1163
+ model_to_idx["CG"] = 3;
1164
+ model_to_idx["CT"] = 4;
1165
+ model_to_idx["GT"] = 5;
1166
+ string FILTER;
1167
+ string vcf_line;
1168
+ vcf_entry e(N_indv);
1169
+
1170
+ map<string, pair<int, int> > FILTER_to_TsTv;
1171
+ map<string, int > FILTER_to_Nsites;
1172
+ map<string, int >::iterator FILTER_to_Nsites_it;
1173
+ for (unsigned int s=0; s<N_entries; s++)
1174
+ {
1175
+ if (include_entry[s] == false)
1176
+ continue;
1177
+
1178
+ get_vcf_entry(s, vcf_line);
1179
+ e.reset(vcf_line);
1180
+ e.parse_basic_entry(true, true);
1181
+
1182
+ string model = e.get_REF() + e.get_ALT_allele(0);
1183
+ sort(model.begin(), model.end());
1184
+
1185
+ FILTER = e.get_FILTER();
1186
+ FILTER_to_Nsites[FILTER]++;
1187
+ if (model_to_idx.find(model) != model_to_idx.end())
1188
+ {
1189
+ switch (model_to_idx[model])
1190
+ {
1191
+ case 1:
1192
+ case 4:
1193
+ FILTER_to_TsTv[FILTER].first++;
1194
+ break;
1195
+ case 0:
1196
+ case 2:
1197
+ case 3:
1198
+ case 5:
1199
+ FILTER_to_TsTv[FILTER].second++;
1200
+ break;
1201
+ default:
1202
+ // Don't count this snp towards Ts/Tv
1203
+ break;
1204
+ }
1205
+ }
1206
+ }
1207
+
1208
+ vector<pair<int, string > > count_to_FILTER;
1209
+ for ( FILTER_to_Nsites_it=FILTER_to_Nsites.begin() ; FILTER_to_Nsites_it != FILTER_to_Nsites.end(); ++FILTER_to_Nsites_it )
1210
+ {
1211
+ FILTER = (*FILTER_to_Nsites_it).first;
1212
+ int Nsites = (*FILTER_to_Nsites_it).second;
1213
+
1214
+ count_to_FILTER.push_back(make_pair(Nsites, FILTER));
1215
+ }
1216
+
1217
+ sort(count_to_FILTER.begin(), count_to_FILTER.end());
1218
+
1219
+ string output = output_file_prefix + ".FILTER.summary";
1220
+ ofstream out(output.c_str());
1221
+ if (!out.is_open())
1222
+ error("Could not open Filter Summary Output File: " + output, 7);
1223
+
1224
+ out << "FILTER\tN_SNPs\tN_Ts\tN_Tv\tTs/Tv" << endl;
1225
+
1226
+ for (int i=count_to_FILTER.size()-1; i > -1; i--)
1227
+ {
1228
+ FILTER = count_to_FILTER[i].second;
1229
+ int Ts = FILTER_to_TsTv[FILTER].first;
1230
+ int Tv = FILTER_to_TsTv[FILTER].second;
1231
+ int Nsites = FILTER_to_Nsites[FILTER];
1232
+ out << FILTER << "\t" << Nsites << "\t";
1233
+ out << Ts << "\t" << Tv << "\t" << double(Ts)/Tv << endl;
1234
+ }
1235
+
1236
+ out.close();
1237
+ }
1238
+
1239
+ void vcf_file::output_TsTv(const string &output_file_prefix, int bin_size)
1240
+ {
1241
+ // Output Ts/Tv ratios in bins of a given size.
1242
+ printLOG("Outputting Ts/Tv in bins of " + int2str(bin_size) + "bp\n");
1243
+
1244
+ map<string, unsigned int> model_to_idx;
1245
+ model_to_idx["AC"] = 0;
1246
+ model_to_idx["AG"] = 1;
1247
+ model_to_idx["AT"] = 2;
1248
+ model_to_idx["CG"] = 3;
1249
+ model_to_idx["CT"] = 4;
1250
+ model_to_idx["GT"] = 5;
1251
+
1252
+ map<string, int> max_pos;
1253
+ string vcf_line, CHROM;
1254
+ vcf_entry e(N_indv);
1255
+ for (unsigned int s=0; s<N_entries; s++)
1256
+ {
1257
+ if (include_entry[s] == true)
1258
+ {
1259
+ get_vcf_entry(s, vcf_line);
1260
+ e.reset(vcf_line);
1261
+ e.parse_basic_entry();
1262
+
1263
+ CHROM = e.get_CHROM();
1264
+
1265
+ if (max_pos.find(CHROM) != max_pos.end())
1266
+ {
1267
+ if (e.get_POS() > max_pos[CHROM])
1268
+ max_pos[CHROM] = e.get_POS();
1269
+ }
1270
+ else
1271
+ max_pos[CHROM] = e.get_POS();
1272
+ }
1273
+ }
1274
+
1275
+ map<string, int>::iterator it;
1276
+
1277
+ unsigned int N_bins;
1278
+ map<string, vector<int> > Ts_counts;
1279
+ map<string, vector<int> > Tv_counts;
1280
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
1281
+ {
1282
+ CHROM = (*it).first;
1283
+ N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size));
1284
+ Ts_counts[CHROM].resize(N_bins, 0);
1285
+ Tv_counts[CHROM].resize(N_bins, 0);
1286
+ }
1287
+
1288
+ vector<unsigned int> model_counts(6,0);
1289
+ double C = 1.0 / double(bin_size);
1290
+ unsigned int idx;
1291
+
1292
+ string model;
1293
+ for (unsigned int s=0; s<N_entries; s++)
1294
+ {
1295
+ if (include_entry[s] == false)
1296
+ continue;
1297
+
1298
+ get_vcf_entry(s, vcf_line);
1299
+ e.reset(vcf_line);
1300
+ e.parse_basic_entry(true);
1301
+
1302
+ if (!e.is_biallelic_SNP())
1303
+ continue;
1304
+
1305
+ model = e.get_REF() + e.get_ALT_allele(0);
1306
+ sort(model.begin(), model.end());
1307
+
1308
+ CHROM = e.get_CHROM();
1309
+ idx = (unsigned int)(e.get_POS() * C);
1310
+
1311
+ if (model_to_idx.find(model) != model_to_idx.end())
1312
+ {
1313
+ model_counts[model_to_idx[model]]++;
1314
+ switch (model_to_idx[model])
1315
+ {
1316
+ case 1:
1317
+ case 4:
1318
+ Ts_counts[CHROM][idx]++;
1319
+ break;
1320
+ case 0:
1321
+ case 2:
1322
+ case 3:
1323
+ case 5:
1324
+ Tv_counts[CHROM][idx]++;
1325
+ break;
1326
+ default:
1327
+ error("Unknown idx\n");
1328
+ }
1329
+ }
1330
+ else
1331
+ warning("Unknown model type. Not a SNP? " + CHROM + ":" + int2str(e.get_POS()) +"\n");
1332
+ }
1333
+
1334
+ string output = output_file_prefix + ".TsTv";
1335
+ ofstream out(output.c_str());
1336
+ if (!out.is_open())
1337
+ error("Could not open TsTv Output File: " + output, 7);
1338
+
1339
+ out << "CHROM\tBinStart\tSNP_count\tTs/Tv" << endl;
1340
+ double ratio;
1341
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
1342
+ {
1343
+ CHROM = (*it).first;
1344
+ for (unsigned int s=0; s<Ts_counts[CHROM].size(); s++)
1345
+ {
1346
+ ratio = 0.0;
1347
+ if (Tv_counts[CHROM][s] != 0)
1348
+ ratio = double(Ts_counts[CHROM][s]) / Tv_counts[CHROM][s];
1349
+ out << CHROM << "\t" << s*bin_size << "\t" << Ts_counts[CHROM][s]+Tv_counts[CHROM][s] << "\t" << ratio << endl;
1350
+ }
1351
+ }
1352
+ out.close();
1353
+
1354
+ output = output_file_prefix + ".TsTv.summary";
1355
+ out.open(output.c_str());
1356
+ if (!out.is_open())
1357
+ error("Could not open TsTv Summary Output File: " + output, 7);
1358
+
1359
+ out << "MODEL\tCOUNT" << endl;
1360
+ out << "AC\t" << model_counts[0] << endl;
1361
+ out << "AG\t" << model_counts[1] << endl;
1362
+ out << "AT\t" << model_counts[2] << endl;
1363
+ out << "CG\t" << model_counts[3] << endl;
1364
+ out << "CT\t" << model_counts[4] << endl;
1365
+ out << "GT\t" << model_counts[5] << endl;
1366
+ unsigned int Ts = model_counts[1] + model_counts[4];
1367
+ unsigned int Tv = model_counts[0] + model_counts[2] + model_counts[3] + model_counts[5];
1368
+ out << "Ts\t" << Ts << endl;
1369
+ out << "Tv\t" << Tv << endl;
1370
+
1371
+ printLOG("Ts/Tv ratio: " + dbl2str(double(Ts)/Tv, 4) + "\n");
1372
+
1373
+ out.close();
1374
+ }
1375
+
1376
+ void vcf_file::output_TsTv_by_count(const string &output_file_prefix)
1377
+ {
1378
+ // Output Ts/Tv ratios in bins of a given size.
1379
+ printLOG("Outputting Ts/Tv by Alternative Allele Count\n");
1380
+ vector<unsigned int> Ts_counts, Tv_counts;
1381
+ unsigned int N_kept_indv = N_kept_individuals();
1382
+ Ts_counts.resize(2*N_kept_indv);
1383
+ Tv_counts.resize(2*N_kept_indv);
1384
+
1385
+ string vcf_line, model;
1386
+ vcf_entry e(N_indv);
1387
+ map<string, unsigned int> model_to_Ts_or_Tv;
1388
+ model_to_Ts_or_Tv["AC"] = 1;
1389
+ model_to_Ts_or_Tv["CA"] = 1;
1390
+ model_to_Ts_or_Tv["AG"] = 0; // Ts
1391
+ model_to_Ts_or_Tv["GA"] = 0; // Ts
1392
+ model_to_Ts_or_Tv["AT"] = 1;
1393
+ model_to_Ts_or_Tv["TA"] = 1;
1394
+ model_to_Ts_or_Tv["CG"] = 1;
1395
+ model_to_Ts_or_Tv["GC"] = 1;
1396
+ model_to_Ts_or_Tv["CT"] = 0; // Ts
1397
+ model_to_Ts_or_Tv["TC"] = 0; // Ts
1398
+ model_to_Ts_or_Tv["GT"] = 1;
1399
+ model_to_Ts_or_Tv["TG"] = 1;
1400
+ unsigned int idx;
1401
+ vector<int> allele_counts;
1402
+ unsigned int allele_count;
1403
+ unsigned int N_included_indv;
1404
+ for (unsigned int s=0; s<N_entries; s++)
1405
+ {
1406
+ if (include_entry[s] == true)
1407
+ {
1408
+ get_vcf_entry(s, vcf_line);
1409
+ e.reset(vcf_line);
1410
+ e.parse_basic_entry(true);
1411
+
1412
+ if (!e.is_biallelic_SNP())
1413
+ continue;
1414
+
1415
+ e.parse_genotype_entries(true);
1416
+ e.get_allele_counts(allele_counts, N_included_indv, include_indv, include_genotype[s]);
1417
+ allele_count = allele_counts[1];
1418
+
1419
+ model = e.get_REF() + e.get_ALT_allele(0);
1420
+ if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end())
1421
+ {
1422
+ idx = model_to_Ts_or_Tv[model];
1423
+ if (idx == 0) // Ts
1424
+ Ts_counts[allele_count]++;
1425
+ else if (idx == 1) // Tv;
1426
+ Tv_counts[allele_count]++;
1427
+ else
1428
+ error("Unknown model type\n");
1429
+ }
1430
+ else
1431
+ warning("Unknown model type. Not a SNP? " + e.get_CHROM() + ":" + int2str(e.get_POS()) +"\n");
1432
+ }
1433
+ }
1434
+
1435
+ string output = output_file_prefix + ".TsTv.count";
1436
+ ofstream out(output.c_str());
1437
+ if (!out.is_open())
1438
+ error("Could not open TsTv by Count Output File: " + output, 7);
1439
+
1440
+ double ratio;
1441
+ out << "ALT_ALLELE_COUNT\tN_Ts\tN_Tv\tTs/Tv" << endl;
1442
+ for (unsigned int ui=0; ui<2*N_kept_indv; ui++)
1443
+ {
1444
+ ratio = double(Ts_counts[ui]) / Tv_counts[ui];
1445
+ out << ui << "\t" << Ts_counts[ui] << "\t" << Tv_counts[ui] << "\t" << ratio << endl;
1446
+ }
1447
+ out.close();
1448
+ }
1449
+
1450
+ void vcf_file::output_TsTv_by_quality(const string &output_file_prefix)
1451
+ {
1452
+ // Output Ts/Tv ratios in bins of a given size.
1453
+ printLOG("Outputting Ts/Tv By Quality\n");
1454
+ map<double, pair<unsigned int, unsigned int> > TsTv_counts;
1455
+ double max_qual = -numeric_limits<double>::max(), min_qual=numeric_limits<double>::max();
1456
+
1457
+ string vcf_line, model;
1458
+ vcf_entry e(N_indv);
1459
+ map<string, unsigned int> model_to_Ts_or_Tv;
1460
+ model_to_Ts_or_Tv["AC"] = 1;
1461
+ model_to_Ts_or_Tv["CA"] = 1;
1462
+ model_to_Ts_or_Tv["AG"] = 0; // Ts
1463
+ model_to_Ts_or_Tv["GA"] = 0; // Ts
1464
+ model_to_Ts_or_Tv["AT"] = 1;
1465
+ model_to_Ts_or_Tv["TA"] = 1;
1466
+ model_to_Ts_or_Tv["CG"] = 1;
1467
+ model_to_Ts_or_Tv["GC"] = 1;
1468
+ model_to_Ts_or_Tv["CT"] = 0; // Ts
1469
+ model_to_Ts_or_Tv["TC"] = 0; // Ts
1470
+ model_to_Ts_or_Tv["GT"] = 1;
1471
+ model_to_Ts_or_Tv["TG"] = 1;
1472
+ unsigned int idx;
1473
+ double QUAL;
1474
+ for (unsigned int s=0; s<N_entries; s++)
1475
+ {
1476
+ if (include_entry[s] == true)
1477
+ {
1478
+ get_vcf_entry(s, vcf_line);
1479
+ e.reset(vcf_line);
1480
+ e.parse_basic_entry(true);
1481
+
1482
+ if (!e.is_biallelic_SNP())
1483
+ continue;
1484
+
1485
+ QUAL = e.get_QUAL();
1486
+ if (QUAL > max_qual)
1487
+ max_qual = QUAL;
1488
+ if (QUAL < min_qual)
1489
+ min_qual = QUAL;
1490
+
1491
+ model = e.get_REF() + e.get_ALT_allele(0);
1492
+ if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end())
1493
+ {
1494
+ idx = model_to_Ts_or_Tv[model];
1495
+ if (idx == 0) // Ts
1496
+ {
1497
+ TsTv_counts[QUAL].first++;
1498
+ }
1499
+ else if (idx == 1) // Tv;
1500
+ TsTv_counts[QUAL].second++;
1501
+ else
1502
+ error("Unknown model type\n");
1503
+ }
1504
+ else
1505
+ warning("Unknown model type. Not a SNP? " + e.get_CHROM() + ":" + int2str(e.get_POS()) +"\n");
1506
+ }
1507
+ }
1508
+
1509
+ string output = output_file_prefix + ".TsTv.qual";
1510
+ ofstream out(output.c_str());
1511
+ if (!out.is_open())
1512
+ error("Could not open TsTv by Count Output File: " + output, 7);
1513
+
1514
+ out << "QUAL_THRESHOLD";
1515
+ out << "\tN_Ts_LT_QUAL_THRESHOLD\tN_Tv_LT_QUAL_THRESHOLD\tTs/Tv_LT_QUAL_THRESHOLD";
1516
+ out << "\tN_Ts_GT_QUAL_THRESHOLD\tN_Tv_GT_QUAL_THRESHOLD\tTs/Tv_GT_QUAL_THRESHOLD" << endl;
1517
+
1518
+ unsigned int N_TsTv = TsTv_counts.size();
1519
+
1520
+ vector<double> Ts_sum_below(N_TsTv+1, 0.0), Tv_sum_below(N_TsTv+1, 0.0);
1521
+ vector<double> QUAL_vector(N_TsTv+1, 0.0);
1522
+ QUAL_vector[0] = min_qual;
1523
+ QUAL_vector[N_TsTv] = max_qual;
1524
+ idx = 1;
1525
+ for (map<double, pair<unsigned int, unsigned int> >::iterator it=TsTv_counts.begin(); it != TsTv_counts.end(); ++it)
1526
+ {
1527
+ QUAL = (it->first);
1528
+ double Ts = (it->second).first;
1529
+ double Tv = (it->second).second;
1530
+ Ts_sum_below[idx] = Ts_sum_below[idx-1]+Ts;
1531
+ Tv_sum_below[idx] = Tv_sum_below[idx-1]+Tv;
1532
+ QUAL_vector[idx-1] = QUAL;
1533
+ idx++;
1534
+ }
1535
+ QUAL_vector[N_TsTv] = max_qual;
1536
+
1537
+ vector<double> Ts_sum_above(N_TsTv+1, 0.0), Tv_sum_above(N_TsTv+1, 0.0);
1538
+ idx = N_TsTv;
1539
+ for (map<double, pair<unsigned int, unsigned int> >::reverse_iterator it=TsTv_counts.rbegin(); it != TsTv_counts.rend(); ++it)
1540
+ {
1541
+ QUAL = (it->first);
1542
+ double Ts = (it->second).first;
1543
+ double Tv = (it->second).second;
1544
+ Ts_sum_above[idx] = Ts_sum_above[idx+1]+Ts;
1545
+ Tv_sum_above[idx] = Tv_sum_above[idx+1]+Tv;
1546
+ idx--;
1547
+ }
1548
+
1549
+ double Ts_sum, Tv_sum, ratio;
1550
+ for (unsigned int ui=1; ui<(N_TsTv+1); ui++)
1551
+ {
1552
+ QUAL = QUAL_vector[ui-1];
1553
+ out << QUAL;
1554
+ Ts_sum = Ts_sum_below[ui-1]; Tv_sum = Tv_sum_below[ui-1];
1555
+ ratio = Ts_sum / Tv_sum;
1556
+ out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio;
1557
+ Ts_sum = Ts_sum_above[ui+1]; Tv_sum = Tv_sum_above[ui+1];
1558
+ ratio = Ts_sum / Tv_sum;
1559
+ out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio;
1560
+ out << endl;
1561
+ }
1562
+ out.close();
1563
+ }
1564
+
1565
+ void vcf_file::output_site_quality(const string &output_file_prefix)
1566
+ {
1567
+ // Output per-site quality information.
1568
+ printLOG("Outputting Quality for Each Site\n");
1569
+ string output = output_file_prefix + ".lqual";
1570
+
1571
+ ofstream out(output.c_str());
1572
+ if (!out.is_open())
1573
+ error("Could not open Site Depth Output File: " + output, 7);
1574
+
1575
+ out << "CHROM\tPOS\tQUAL" << endl;
1576
+
1577
+ string vcf_line;
1578
+ vcf_entry e(N_indv);
1579
+ for (unsigned int s=0; s<N_entries; s++)
1580
+ {
1581
+ if (include_entry[s] == false)
1582
+ continue;
1583
+
1584
+ get_vcf_entry(s, vcf_line);
1585
+ e.reset(vcf_line);
1586
+ e.parse_basic_entry();
1587
+
1588
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e.get_QUAL() << endl;
1589
+ }
1590
+ out.close();
1591
+ }
1592
+
1593
+ void vcf_file::output_site_depth(const string &output_file_prefix, bool output_mean)
1594
+ {
1595
+ // Output per-site depth information
1596
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1597
+ error("Require Genotypes in VCF file in order to output Site Depth Statistics.");
1598
+
1599
+ printLOG("Outputting Depth for Each Site\n");
1600
+ string output = output_file_prefix + ".ldepth";
1601
+ if (output_mean)
1602
+ output += ".mean";
1603
+ ofstream out(output.c_str());
1604
+ if (!out.is_open())
1605
+ error("Could not open Site Depth Output File: " + output, 7);
1606
+
1607
+ out << "CHROM\tPOS\t";
1608
+ if (output_mean)
1609
+ out << "MEAN_DEPTH\tVAR_DEPTH" << endl;
1610
+ else
1611
+ out << "SUM_DEPTH\tSUMSQ_DEPTH" << endl;
1612
+
1613
+ int depth;
1614
+ string vcf_line;
1615
+ vcf_entry e(N_indv);
1616
+ for (unsigned int s=0; s<N_entries; s++)
1617
+ {
1618
+ if (include_entry[s] == false)
1619
+ continue;
1620
+
1621
+ get_vcf_entry(s, vcf_line);
1622
+ e.reset(vcf_line);
1623
+ e.parse_basic_entry();
1624
+
1625
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t";
1626
+
1627
+ unsigned int sum=0;
1628
+ unsigned int sumsq=0;
1629
+ unsigned int n=0;
1630
+ for (unsigned int ui=0; ui<N_indv; ui++)
1631
+ {
1632
+ if (include_indv[ui] == false)
1633
+ continue;
1634
+ if (include_genotype[s][ui] == false)
1635
+ continue;
1636
+
1637
+ e.parse_genotype_entry(ui, false, false, true);
1638
+ depth = e.get_indv_DEPTH(ui);
1639
+ if (depth >= 0)
1640
+ {
1641
+ sum += depth;
1642
+ sumsq += (depth*depth);
1643
+ n++;
1644
+ }
1645
+ }
1646
+
1647
+ if (output_mean)
1648
+ {
1649
+ double mean = double(sum) / n;
1650
+ double var = ((double(sumsq) / n) - (mean*mean)) * double(n) / double(n-1);
1651
+ out << mean << "\t" << var << endl;
1652
+ }
1653
+ else
1654
+ out << sum << "\t" << sumsq << endl;
1655
+ }
1656
+ out.close();
1657
+ }
1658
+
1659
+ void vcf_file::output_fst(const string &output_file_prefix, vcf_file &vcf_fst)
1660
+ {
1661
+ // Calculate, and output, Fst using the formula outlined in HapMap I
1662
+ // Namely:
1663
+ // Fst = 1 - (Pi_within / Pi_combined)
1664
+ // where
1665
+ // Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2))
1666
+ // and
1667
+ // Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1))
1668
+ // where j is the population index, and i is the SNP index
1669
+ printLOG("Outputting Fst estimates (for bi-allelic only)\n");
1670
+
1671
+ string output = output_file_prefix + ".fst";
1672
+ ofstream out(output.c_str());
1673
+ if (!out.is_open())
1674
+ error("Could not open Fst Output File: " + output, 7);
1675
+
1676
+ out << "CHROM\tPOS\tFST" << endl;
1677
+
1678
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
1679
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
1680
+
1681
+ return_site_union(vcf_fst, CHROMPOS_to_filepos_pair);
1682
+
1683
+ string vcf_line;
1684
+
1685
+ int n_1, n_2, n_1_choose_2 = 0, n_2_choose_2=0;
1686
+ int last_n_1=-1, last_n_2=-1;
1687
+
1688
+ unsigned int n_i1, n_i2, n_iT;
1689
+ int N_alleles1, N_alleles2;
1690
+ vector<int> allele_counts1, allele_counts2;
1691
+ double x_i1, x_i2, x_iT;
1692
+ int POS;
1693
+ int s1, s2;
1694
+
1695
+ double tmp1, tmp2, tmpT;
1696
+ double sum1=0.0, sum2=0.0, sumT=0.0;
1697
+ double Fst;
1698
+ string CHROM;
1699
+
1700
+ unsigned int N_intersecting_sites = 0;
1701
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
1702
+ {
1703
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
1704
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
1705
+
1706
+ if ((s1 == -1) || (s2 == -1))
1707
+ continue;
1708
+
1709
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
1710
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
1711
+
1712
+ get_vcf_entry(s1, vcf_line);
1713
+ vcf_entry e1(N_indv, vcf_line);
1714
+ vcf_fst.get_vcf_entry(s2, vcf_line);
1715
+ vcf_entry e2(vcf_fst.N_indv, vcf_line);
1716
+
1717
+ e1.parse_basic_entry(true);
1718
+ e2.parse_basic_entry(true);
1719
+
1720
+ // Check sites have same alternative alleles
1721
+ N_alleles1 = e1.get_N_alleles();
1722
+ N_alleles2 = e2.get_N_alleles();
1723
+
1724
+ if ((N_alleles1 != 2) || (N_alleles2 != 2))
1725
+ {
1726
+ one_off_warning("\tFst: Only using biallelic SNPs.");
1727
+ continue;
1728
+ }
1729
+
1730
+ if ((N_alleles1 == 2) && (N_alleles2 == 2))
1731
+ if (e1.get_ALT_allele(0) != e2.get_ALT_allele(0))
1732
+ {
1733
+ one_off_warning("\tFst: Only using sites with matching reference alleles.");
1734
+ continue;
1735
+ }
1736
+
1737
+ e1.parse_genotype_entries(true);
1738
+ e2.parse_genotype_entries(true);
1739
+
1740
+ // Calculate allele frequencies
1741
+ e1.get_allele_counts(allele_counts1, n_i1, include_indv, include_genotype[s1]);
1742
+ e2.get_allele_counts(allele_counts2, n_i2, vcf_fst.include_indv, vcf_fst.include_genotype[s2]);
1743
+
1744
+ if ((n_i1 == 0) || (n_i2 == 0))
1745
+ continue;
1746
+
1747
+ n_1 = e1.get_N_chr(include_indv, include_genotype[s1]);
1748
+ n_2 = e2.get_N_chr(vcf_fst.include_indv, vcf_fst.include_genotype[s2]);
1749
+
1750
+ if (last_n_1 != -1)
1751
+ {
1752
+ if ((n_1 != last_n_1) || (n_2 != last_n_2))
1753
+ {
1754
+ error("Cannot mix sites with different ploidy. Are you including sex-chromosomes?\n"+CHROM+":"+int2str(POS)+"\n");
1755
+ }
1756
+ }
1757
+ else
1758
+ {
1759
+ last_n_1 = n_1;
1760
+ last_n_2 = n_2;
1761
+ }
1762
+
1763
+ n_1_choose_2 = n_1 * (n_1 - 1) / 2;
1764
+ n_2_choose_2 = n_2 * (n_2 - 1) / 2;
1765
+
1766
+ N_intersecting_sites++;
1767
+
1768
+ x_i1 = allele_counts1[0] / double(n_i1);
1769
+ x_i2 = allele_counts2[0] / double(n_i2);
1770
+ n_iT = (n_i1 + n_i2);
1771
+ x_iT = (allele_counts1[0] + allele_counts2[0]) / double(n_iT);
1772
+
1773
+ tmp1 = 2 * (n_i1 / (n_i1 - 1.0)) * x_i1 * (1-x_i1);
1774
+ tmp2 = 2 * (n_i2 / (n_i2 - 1.0)) * x_i2 * (1-x_i2);
1775
+ tmpT = 2 * (n_iT / (n_iT - 1.0)) * x_iT * (1-x_iT);
1776
+
1777
+ Fst = 1.0 - (((n_1_choose_2 * tmp1) + (n_2_choose_2 * tmp2)) / (n_1_choose_2 + n_2_choose_2) / tmpT);
1778
+
1779
+ out << CHROM << "\t" << POS << "\t" << Fst << endl;
1780
+
1781
+ sum1 += tmp1;
1782
+ sum2 += tmp2;
1783
+ sumT += tmpT;
1784
+
1785
+ last_n_1 = n_1; last_n_2 = n_2;
1786
+ }
1787
+
1788
+ Fst = 1.0 - (((n_1_choose_2 * sum1) + (n_2_choose_2 * sum2)) / (n_1_choose_2 + n_2_choose_2) / sumT);
1789
+
1790
+ printLOG("Found " + int2str(N_intersecting_sites) + " intersecting sites\n");
1791
+ printLOG("Fst = " + dbl2str(Fst, 6) + "\n");
1792
+
1793
+ out.close();
1794
+ }
1795
+
1796
+
1797
+ void vcf_file::output_fst_version_2(const string &output_file_prefix, const vector<string> &indv_files)
1798
+ {
1799
+ // Calculate Fst using individuals in one (rather than two VCF files)
1800
+ // Calculate, and output, Fst using the formula outlined in HapMap I
1801
+ // Namely:
1802
+ // Fst = 1 - (Pi_within / Pi_combined)
1803
+ // where
1804
+ // Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2))
1805
+ // and
1806
+ // Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1))
1807
+ // where j is the population index, and i is the SNP index
1808
+
1809
+ if (indv_files.size() == 1)
1810
+ {
1811
+ printLOG("Require at least two populations to estimate Fst. Skipping\n");
1812
+ return;
1813
+ }
1814
+
1815
+ printLOG("Outputting Fst estimates.\n");
1816
+
1817
+ // First, read in the relevant files.
1818
+ vector< vector<bool> > indvs_in_pops;
1819
+ unsigned int N_pops = indv_files.size();
1820
+ indvs_in_pops.resize(N_pops, vector<bool>(N_indv, false));
1821
+ vector<bool> all_indv(N_indv,false);
1822
+ map<string, int> indv_to_idx;
1823
+ for (unsigned int ui=0; ui<N_indv; ui++)
1824
+ if (include_indv[ui] == true)
1825
+ indv_to_idx[indv[ui]] = ui;
1826
+ for (unsigned int ui=0; ui<N_pops; ui++)
1827
+ {
1828
+ ifstream indv_file(indv_files[ui].c_str());
1829
+ if (!indv_file.is_open())
1830
+ error("Could not open Individual file: " + indv_files[ui]);
1831
+ string line;
1832
+ string tmp_indv;
1833
+ stringstream ss;
1834
+ while (!indv_file.eof())
1835
+ {
1836
+ getline(indv_file, line);
1837
+ ss.str(line);
1838
+ ss >> tmp_indv;
1839
+ if (indv_to_idx.find(tmp_indv) != indv_to_idx.end())
1840
+ {
1841
+ indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true;
1842
+ all_indv[indv_to_idx[tmp_indv]]=true;
1843
+ }
1844
+ ss.clear();
1845
+ }
1846
+ indv_file.close();
1847
+ }
1848
+
1849
+ string output = output_file_prefix + ".fst";
1850
+ ofstream out(output.c_str());
1851
+ if (!out.is_open())
1852
+ error("Could not open Fst Output File: " + output, 7);
1853
+
1854
+ out << "CHROM\tPOS\tFST" << endl;
1855
+
1856
+ vcf_entry e(N_indv);
1857
+ string vcf_line;
1858
+ vector<int> allele_counts1;
1859
+ double Fst_tot_num=0.0, Fst_tot_denom=0.0;
1860
+ for (unsigned int s=0; s<N_entries; s++)
1861
+ {
1862
+ if (include_entry[s] == false)
1863
+ continue;
1864
+
1865
+ get_vcf_entry(s, vcf_line);
1866
+ e.reset(vcf_line);
1867
+ e.parse_basic_entry(true);
1868
+
1869
+ if (e.get_N_alleles() != 2)
1870
+ {
1871
+ one_off_warning("\tFst: Only using biallelic sites.");
1872
+ continue;
1873
+ }
1874
+
1875
+ e.parse_full_entry(true);
1876
+ e.parse_genotype_entries(true);
1877
+
1878
+ unsigned int N_chr;
1879
+ e.get_allele_counts(allele_counts1, N_chr, all_indv, include_genotype[s]);
1880
+ double count_all = allele_counts1[1];
1881
+ double N_chr_all = N_chr;
1882
+
1883
+ if ((count_all == 0) || (count_all == N_chr_all))
1884
+ continue; // No polymorphism
1885
+
1886
+ vector<double> counts(N_pops, 0);
1887
+ vector<double> pop_N_chr(N_pops, 0);
1888
+ vector<double> pop_N_choose_2(N_pops, 0);
1889
+ for (unsigned int p=0; p<N_pops; p++)
1890
+ {
1891
+ e.get_allele_counts(allele_counts1, N_chr, indvs_in_pops[p], include_genotype[s]);
1892
+ counts[p] = allele_counts1[1];
1893
+ pop_N_chr[p] = N_chr;
1894
+ pop_N_choose_2[p] = N_chr * (N_chr-1.0) / 2.0;
1895
+ }
1896
+
1897
+ double Fst_SNP = 0;
1898
+ double f;
1899
+ double sum1=0.0;
1900
+ for (unsigned int p=0; p<N_pops; p++)
1901
+ {
1902
+ f = counts[p] / pop_N_chr[p];
1903
+ Fst_SNP += 2.0*pop_N_choose_2[p]*(pop_N_chr[p]/(pop_N_chr[p]-1.0))*f*(1.0-f);
1904
+ sum1 += pop_N_choose_2[p];
1905
+ }
1906
+ Fst_SNP /= sum1;
1907
+ Fst_tot_num += Fst_SNP;
1908
+ f = count_all / N_chr_all;
1909
+ double tmp = (2.0*(N_chr_all / (N_chr_all-1.0))*f*(1.0-f));
1910
+ Fst_SNP /= tmp;
1911
+ Fst_tot_denom += tmp;
1912
+ Fst_SNP = 1.0 - Fst_SNP;
1913
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << Fst_SNP << endl;
1914
+
1915
+ // TODO add other methods of calculating Fst (such as Weir-Cockerham)
1916
+ }
1917
+ double Fst_tot = 1.0 - (Fst_tot_num / Fst_tot_denom);
1918
+ printLOG("Fst = " + dbl2str(Fst_tot, 6) + "\n");
1919
+
1920
+ out.close();
1921
+ }
1922
+
1923
+ void vcf_file::output_per_site_nucleotide_diversity(const string &output_file_prefix)
1924
+ {
1925
+ // Output nucleotide diversity, calculated on a per-site basis.
1926
+ // Pi = average number of pairwise differences
1927
+ // Assumes a constant distance of 1 between all possible mutations
1928
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1929
+ error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
1930
+
1931
+ printLOG("Outputting Per-Site Nucleotide Diversity Statistics...\n");
1932
+ string output_file = output_file_prefix + ".sites.pi";
1933
+
1934
+ ofstream out(output_file.c_str());
1935
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
1936
+ out << "CHROM\tPOS\tPI" << endl;
1937
+
1938
+ string vcf_line, FORMAT_out;
1939
+ vcf_entry e(N_indv);
1940
+ pair<int, int> genotype1, genotype2;
1941
+ for (unsigned int s=0; s<N_entries; s++)
1942
+ {
1943
+ if (include_entry[s] == false)
1944
+ continue;
1945
+
1946
+ get_vcf_entry(s, vcf_line);
1947
+ e.reset(vcf_line);
1948
+ e.parse_basic_entry(true);
1949
+
1950
+ if (e.get_N_alleles() != 2)
1951
+ {
1952
+ one_off_warning("\tsitePi: Only using biallelic sites.");
1953
+ continue;
1954
+ }
1955
+
1956
+ e.parse_full_entry(true);
1957
+ e.parse_genotype_entries(true);
1958
+
1959
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
1960
+ {
1961
+ one_off_warning("\tsitePi: Only using fully diploid sites.");
1962
+ continue;
1963
+ }
1964
+
1965
+ int total_alleles_count = 0;
1966
+ int first_allele_count = 0;
1967
+ int first_allele = -1;
1968
+ for (unsigned int ui=0; ui < N_indv; ui++)
1969
+ {
1970
+ if (include_indv[ui] == false)
1971
+ continue;
1972
+ if (include_genotype[s][ui] == false)
1973
+ continue;
1974
+ e.get_indv_GENOTYPE_ids(ui, genotype1);
1975
+ if ((genotype1.first != -1) && (genotype1.second != -1))
1976
+ {
1977
+ total_alleles_count += 2;
1978
+ if (first_allele == -1)
1979
+ first_allele = genotype1.first; //initialize to the first allele found
1980
+ if (genotype1.first == first_allele)
1981
+ first_allele_count++;
1982
+ if (genotype1.second == first_allele)
1983
+ first_allele_count++;
1984
+ }
1985
+ }
1986
+ int n = total_alleles_count;
1987
+ int k = first_allele_count;
1988
+ double pi= (2.0*k*(n-k))/(n*(n-1));
1989
+
1990
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << pi << endl;
1991
+ }
1992
+ }
1993
+
1994
+ // Output Tajima's D
1995
+ // Carlson et al. Genome Res (2005)
1996
+ void vcf_file::output_Tajima_D(const string &output_file_prefix, int window_size)
1997
+ {
1998
+ if (window_size <= 0)
1999
+ return;
2000
+
2001
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2002
+ error("Require Genotypes in VCF file in order to output Tajima's D Statistic.");
2003
+
2004
+ printLOG("Outputting Tajima's D Statistic...\n");
2005
+ string output_file = output_file_prefix + ".Tajima.D";
2006
+
2007
+ double a1=0.0, a2=0.0, b1, b2, c1, c2, e1, e2;
2008
+ unsigned int n = N_kept_individuals()*2;
2009
+ if (n < 2)
2010
+ error("Require at least two chromosomes!");
2011
+
2012
+ for (unsigned int ui=1; ui<n; ui++)
2013
+ {
2014
+ a1 += 1.0 / double(ui);
2015
+ a2 += 1.0 / double(ui * ui);
2016
+ }
2017
+ b1 = double(n+1) / 3.0 / double(n-1);
2018
+ b2 = 2.0 * double(n*n + n + 3) / 9.0 / double(n) / double(n-1);
2019
+ c1 = b1 - (1.0 / a1);
2020
+ c2 = b2 - (double(n+2)/double(a1*n)) + (a2/a1/a1);
2021
+ e1 = c1 / a1;
2022
+ e2 = c2 / ((a1*a1) + a2);
2023
+
2024
+ // Find maximum position
2025
+ map<string, int> max_pos;
2026
+ string vcf_line, CHROM;
2027
+ vcf_entry e(N_indv);
2028
+ for (unsigned int s=0; s<N_entries; s++)
2029
+ {
2030
+ if (include_entry[s] == true)
2031
+ {
2032
+ get_vcf_entry(s, vcf_line);
2033
+ e.reset(vcf_line);
2034
+ e.parse_basic_entry();
2035
+
2036
+ CHROM = e.get_CHROM();
2037
+
2038
+ if (max_pos.find(CHROM) != max_pos.end())
2039
+ {
2040
+ if (e.get_POS() > max_pos[CHROM])
2041
+ max_pos[CHROM] = e.get_POS();
2042
+ }
2043
+ else
2044
+ max_pos[CHROM] = e.get_POS();
2045
+ }
2046
+ }
2047
+
2048
+ map<string, int>::iterator it;
2049
+ unsigned int N_bins;
2050
+ map<string, vector< pair<int, double> > > bins;
2051
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2052
+ {
2053
+ CHROM = (*it).first;
2054
+ N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
2055
+ bins[CHROM].resize(N_bins, make_pair(0,0));
2056
+ }
2057
+
2058
+ unsigned int idx;
2059
+ double C = 1.0 / double(window_size);
2060
+ vector<int> allele_counts;
2061
+ unsigned int N_non_missing_chr;
2062
+ unsigned int N_alleles;
2063
+ for (unsigned int s=0; s<N_entries; s++)
2064
+ {
2065
+ if (include_entry[s] == false)
2066
+ continue;
2067
+
2068
+ get_vcf_entry(s, vcf_line);
2069
+ e.reset(vcf_line);
2070
+ e.parse_basic_entry(true);
2071
+ N_alleles = e.get_N_alleles();
2072
+
2073
+ if (N_alleles != 2)
2074
+ {
2075
+ one_off_warning("\tTajimaD: Only using bialleleic sites.");
2076
+ continue;
2077
+ }
2078
+
2079
+ CHROM = e.get_CHROM();
2080
+ idx = (unsigned int)(e.get_POS() * C);
2081
+ e.parse_genotype_entries(true);
2082
+
2083
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2084
+ {
2085
+ one_off_warning("\tTajimaD: Only using fully diploid sites.");
2086
+ continue;
2087
+ }
2088
+
2089
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2090
+
2091
+ double p = double(allele_counts[0]) / N_non_missing_chr;
2092
+ if ((p > 0.0) && (p < 1.0))
2093
+ {
2094
+ bins[CHROM][idx].first++;
2095
+ bins[CHROM][idx].second += p * (1.0-p);
2096
+ }
2097
+ }
2098
+
2099
+ ofstream out(output_file.c_str());
2100
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2101
+ out << "CHROM\tBIN_START\tN_SNPS\tTajimaD" << endl;
2102
+
2103
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2104
+ {
2105
+ CHROM = (*it).first;
2106
+ bool output = false;
2107
+ for (unsigned int s=0; s<bins[CHROM].size(); s++)
2108
+ {
2109
+ int S = bins[CHROM][s].first;
2110
+ double D = 0;
2111
+ if (S > 1)
2112
+ {
2113
+ double pi = 2.0*bins[CHROM][s].second*n/double(n-1);
2114
+ double tw = double(S) / a1;
2115
+ double var = (e1*S) + e2*S*(S-1);
2116
+ D = (pi - tw) / sqrt(var);
2117
+ output = true;
2118
+ }
2119
+ if (S > 0)
2120
+ output = true;
2121
+ if (output == true)
2122
+ out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << D << endl;
2123
+ }
2124
+ }
2125
+
2126
+ out.close();
2127
+ }
2128
+
2129
+ void vcf_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size)
2130
+ {
2131
+ // Output nucleotide diversity, as calculated in windows.
2132
+ // Average number of pairwise differences in windows.
2133
+ if (window_size <= 0)
2134
+ return;
2135
+
2136
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2137
+ error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
2138
+
2139
+ printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n");
2140
+ string output_file = output_file_prefix + ".windowed.pi";
2141
+
2142
+ // Find maximum position
2143
+ map<string, int> max_pos;
2144
+ map<string, int>::iterator it;
2145
+ string vcf_line, CHROM;
2146
+ vcf_entry e(N_indv);
2147
+ for (unsigned int s=0; s<N_entries; s++)
2148
+ {
2149
+ if (include_entry[s] == true)
2150
+ {
2151
+ get_vcf_entry(s, vcf_line);
2152
+ e.reset(vcf_line);
2153
+ e.parse_basic_entry();
2154
+
2155
+ CHROM = e.get_CHROM();
2156
+
2157
+ if (max_pos.find(CHROM) != max_pos.end())
2158
+ {
2159
+ if (e.get_POS() > max_pos[CHROM])
2160
+ max_pos[CHROM] = e.get_POS();
2161
+ }
2162
+ else
2163
+ max_pos[CHROM] = e.get_POS();
2164
+ }
2165
+ }
2166
+
2167
+ unsigned int N_bins;
2168
+ map<string, vector<pair<int, double> > > bins;
2169
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2170
+ {
2171
+ CHROM = (*it).first;
2172
+ N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
2173
+ bins[CHROM].resize(N_bins, make_pair(0,0));
2174
+ }
2175
+
2176
+ unsigned int idx;
2177
+ double C = 1.0 / double(window_size);
2178
+ vector<int> allele_counts;
2179
+ unsigned int N_non_missing_chr;
2180
+ unsigned int N_alleles;
2181
+ for (unsigned int s=0; s<N_entries; s++)
2182
+ {
2183
+ if (include_entry[s] == false)
2184
+ continue;
2185
+
2186
+ get_vcf_entry(s, vcf_line);
2187
+ e.reset(vcf_line);
2188
+ e.parse_basic_entry(true);
2189
+ N_alleles = e.get_N_alleles();
2190
+
2191
+ if (N_alleles != 2)
2192
+ {
2193
+ one_off_warning("\twindowPi: Only using bialleleic sites.");
2194
+ continue;
2195
+ }
2196
+
2197
+ CHROM = e.get_CHROM();
2198
+ idx = (unsigned int)(e.get_POS() * C);
2199
+ e.parse_genotype_entries(true);
2200
+
2201
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2202
+ {
2203
+ one_off_warning("\twindowPi: Only using fully diploid sites.");
2204
+ continue;
2205
+ }
2206
+
2207
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2208
+
2209
+ double p = double(allele_counts[0]) / N_non_missing_chr;
2210
+ if ((p>0.0) && (p<1.0))
2211
+ {
2212
+ bins[CHROM][idx].first++;
2213
+ bins[CHROM][idx].second += (double(N_non_missing_chr) / (N_non_missing_chr - 1.0)) * 2.0 * p * (1.0 - p);
2214
+ }
2215
+ }
2216
+
2217
+ ofstream out(output_file.c_str());
2218
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2219
+ out << "CHROM\tBIN_START\tN_SNPS\tPI" << endl;
2220
+
2221
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2222
+ {
2223
+ CHROM = (*it).first;
2224
+ bool output = false;
2225
+ for (unsigned int s=0; s<bins[CHROM].size(); s++)
2226
+ {
2227
+ if (bins[CHROM][s].first > 0)
2228
+ output = true;
2229
+ if (output == true)
2230
+ out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << bins[CHROM][s].second << endl;
2231
+ }
2232
+ }
2233
+
2234
+ out.close();
2235
+ }
2236
+
2237
+ /*
2238
+ void vcf_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size)
2239
+ {
2240
+ // Output nucleotide diversity, as calculated in windows.
2241
+ // Average number of pairwise differences in windows.
2242
+ // Requires phased data.
2243
+ if (window_size <= 0)
2244
+ return;
2245
+
2246
+ if (has_genotypes == false)
2247
+ error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
2248
+
2249
+ printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n");
2250
+ string output_file = output_file_prefix + ".windowed.pi";
2251
+
2252
+ map<string, int>::iterator it;
2253
+
2254
+ // Find maximum position
2255
+ map<string, int> max_pos;
2256
+ string vcf_line, CHROM;
2257
+ vcf_entry e(N_indv);
2258
+ for (unsigned int s=0; s<N_entries; s++)
2259
+ {
2260
+ if (include_entry[s] == true)
2261
+ {
2262
+ get_vcf_entry(s, vcf_line);
2263
+ e.reset(vcf_line);
2264
+ e.parse_basic_entry();
2265
+
2266
+ CHROM = e.get_CHROM();
2267
+
2268
+ if (max_pos.find(CHROM) != max_pos.end())
2269
+ {
2270
+ if (e.get_POS() > max_pos[CHROM])
2271
+ max_pos[CHROM] = e.get_POS();
2272
+ }
2273
+ else
2274
+ max_pos[CHROM] = e.get_POS();
2275
+ }
2276
+ }
2277
+
2278
+ unsigned int N_bins;
2279
+ map<string, vector<pair<int, double> > > bins;
2280
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2281
+ {
2282
+ CHROM = (*it).first;
2283
+ N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
2284
+ bins[CHROM].resize(N_bins, make_pair(0,0));
2285
+ }
2286
+
2287
+ unsigned int last_idx = (unsigned)(-1);
2288
+ unsigned int idx;
2289
+ string last_CHROM;
2290
+ vector<vector<int> > haplotypes(2*N_indv);
2291
+ pair<int, int> genotype1;
2292
+ unsigned int N_SNPs=0;;
2293
+ double C = 1.0 / double(window_size);
2294
+ for (unsigned int s=0; s<N_entries; s++)
2295
+ {
2296
+ if (include_entry[s] == false)
2297
+ continue;
2298
+
2299
+ get_vcf_entry(s, vcf_line);
2300
+ e.reset(vcf_line);
2301
+ e.parse_basic_entry();
2302
+
2303
+ CHROM = e.get_CHROM();
2304
+ idx = (unsigned int)(e.get_POS() * C);
2305
+
2306
+ if (((last_idx != idx) || (CHROM != last_CHROM)) && (last_idx != (unsigned)-1))
2307
+ { // Process haplotype window.
2308
+ double pi=0.0;
2309
+ double n=0.0;
2310
+ for (unsigned int ui=0; ui<(haplotypes.size()-1); ui++)
2311
+ {
2312
+ if (include_indv[ui/2] == false)
2313
+ continue;
2314
+ for (unsigned int uj=(ui+1); uj<haplotypes.size(); uj++)
2315
+ {
2316
+ if (include_indv[uj/2] == false)
2317
+ continue;
2318
+ for (unsigned int snp=0; snp<N_SNPs; snp++)
2319
+ {
2320
+ if ((haplotypes[ui][snp] != -1) && (haplotypes[uj][snp] != -1))
2321
+ {
2322
+ if (haplotypes[ui][snp] != haplotypes[uj][snp])
2323
+ { pi++; }
2324
+ n++;
2325
+ }
2326
+ }
2327
+ }
2328
+ }
2329
+ pi /= n;
2330
+ bins[last_CHROM][last_idx].first = N_SNPs;
2331
+ bins[last_CHROM][last_idx].second = pi;
2332
+
2333
+ N_SNPs = 0;
2334
+ for (unsigned int ui=0; ui<haplotypes.size(); ui++)
2335
+ {
2336
+ haplotypes[ui].clear();
2337
+ }
2338
+ }
2339
+
2340
+ e.parse_genotype_entries(true);
2341
+ for (unsigned int ui=0; ui<N_indv; ui++)
2342
+ {
2343
+ if (include_indv[ui] == false)
2344
+ continue;
2345
+
2346
+ if (include_genotype[s][ui] == true)
2347
+ {
2348
+ e.get_indv_GENOTYPE_ids(ui, genotype1);
2349
+ haplotypes[(2*ui)].push_back(genotype1.first);
2350
+ haplotypes[(2*ui)+1].push_back(genotype1.second);
2351
+ }
2352
+ else
2353
+ {
2354
+ haplotypes[(2*ui)].push_back(-1);
2355
+ haplotypes[(2*ui)+1].push_back(-1);
2356
+ }
2357
+ }
2358
+ N_SNPs++;
2359
+ last_CHROM = CHROM;
2360
+ last_idx = idx;
2361
+ }
2362
+
2363
+ if (N_SNPs > 0)
2364
+ { // Output last window
2365
+ double pi=0.0;
2366
+ double n=0.0;
2367
+ for (unsigned int ui=0; ui<(haplotypes.size()-1); ui++)
2368
+ {
2369
+ if (include_indv[ui/2] == false)
2370
+ continue;
2371
+ for (unsigned int uj=ui+1; uj<haplotypes.size(); uj++)
2372
+ {
2373
+ if (include_indv[uj/2] == false)
2374
+ continue;
2375
+ for (unsigned int snp=0; snp<N_SNPs; snp++)
2376
+ {
2377
+ if ((haplotypes[ui][snp] != -1) && (haplotypes[uj][snp] != -1))
2378
+ {
2379
+ if (haplotypes[ui][snp] != haplotypes[uj][snp])
2380
+ pi++;
2381
+ n++;
2382
+ }
2383
+ }
2384
+ }
2385
+ }
2386
+ pi /= n;
2387
+ bins[last_CHROM][last_idx].first = N_SNPs;
2388
+ bins[last_CHROM][last_idx].second = pi;
2389
+ }
2390
+
2391
+ ofstream out(output_file.c_str());
2392
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2393
+ out << "CHROM\tBIN_START\tN_SNPS\tPI" << endl;
2394
+
2395
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2396
+ {
2397
+ CHROM = (*it).first;
2398
+ for (unsigned int s=0; s<bins[CHROM].size(); s++)
2399
+ {
2400
+ out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << bins[CHROM][s].second << endl;
2401
+ }
2402
+ }
2403
+
2404
+ out.close();
2405
+ }
2406
+ */
2407
+
2408
+ void vcf_file::output_kept_and_removed_sites(const string &output_file_prefix)
2409
+ {
2410
+ // Output lists of sites that have been filtered (or not).
2411
+ printLOG("Outputting Kept and Removed Sites...\n");
2412
+ string output_file1 = output_file_prefix + ".kept.sites";
2413
+ string output_file2 = output_file_prefix + ".removed.sites";
2414
+
2415
+ string vcf_line, CHROM;
2416
+ int POS;
2417
+ vcf_entry e(N_indv);
2418
+
2419
+ ofstream out1(output_file1.c_str());
2420
+ if (!out1.is_open()) error("Could not open output file: " + output_file1, 12);
2421
+ out1 << "CHROM\tPOS" << endl;
2422
+
2423
+ ofstream out2(output_file2.c_str());
2424
+ if (!out2.is_open()) error("Could not open output file: " + output_file2, 12);
2425
+ out2 << "CHROM\tPOS" << endl;
2426
+
2427
+ for (unsigned int s=0; s<N_entries; s++)
2428
+ {
2429
+ get_vcf_entry(s, vcf_line);
2430
+ e.reset(vcf_line);
2431
+ e.parse_basic_entry();
2432
+ POS = e.get_POS();
2433
+ CHROM = e.get_CHROM();
2434
+ if (include_entry[s] == true)
2435
+ {
2436
+ out1 << CHROM << "\t" << POS << endl;
2437
+ }
2438
+ else
2439
+ {
2440
+ out2 << CHROM << "\t" << POS << endl;
2441
+ }
2442
+ }
2443
+ out1.close();
2444
+ out2.close();
2445
+ }
2446
+
2447
+
2448
+ void vcf_file::output_LROH(const string &output_file_prefix)
2449
+ {
2450
+ // Detect and output Long Runs of Homozygosity, following the method
2451
+ // developed by Adam Boyko, and described in Auton et al., Genome Research, 2009
2452
+ // (Although using Forward-backwards algorithm in place of Viterbi).
2453
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2454
+ error("Require Genotypes in VCF file in order to output LROH.");
2455
+
2456
+ printLOG("Outputting Long Runs of Homozygosity (Experimental)... \n");
2457
+ string output_file = output_file_prefix + ".LROH";
2458
+
2459
+ unsigned int nGen=4; // Number of generations since common ancestry
2460
+ double genotype_error_rate = 0.01; // Assumed genotype error rate
2461
+ double p_auto_prior = 0.05; // Prior probability of being in autozygous state
2462
+ double p_auto_threshold = 0.99; // Threshold for reporting autozygous region
2463
+ int min_SNPs=0; // Threshold for reporting autozygous region
2464
+
2465
+ string vcf_line, CHROM;
2466
+ int POS;
2467
+ vcf_entry e(N_indv);
2468
+ pair<int, int> alleles;
2469
+ vector<unsigned int> s_vector;
2470
+ vector<pair<double, double> > p_emission;
2471
+ vector<vector<double> > p_trans;
2472
+
2473
+ ofstream out(output_file.c_str());
2474
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2475
+ out << "CHROM\tAUTO_START\tAUTO_END\tN_SNPs\tINDV" << endl;
2476
+
2477
+ for (unsigned int ui=0; ui<N_indv; ui++)
2478
+ {
2479
+ if (include_indv[ui] == false)
2480
+ continue;
2481
+
2482
+ printLOG("\t" + indv[ui] + "\n");
2483
+
2484
+ int last_POS = -1;
2485
+ s_vector.resize(0); p_emission.resize(0); p_trans.resize(0);
2486
+
2487
+ for (unsigned int s=0; s<N_entries; s++)
2488
+ {
2489
+ if ((include_entry[s] == false) || (include_genotype[s][ui] == false))
2490
+ continue;
2491
+
2492
+ get_vcf_entry(s, vcf_line);
2493
+ e.reset(vcf_line);
2494
+ e.parse_basic_entry(true);
2495
+
2496
+ if (e.get_N_alleles() != 2)
2497
+ {
2498
+ one_off_warning("\tLROH: Only using bialleleic sites.");
2499
+ continue; // TODO: Probably could do without this...
2500
+ }
2501
+
2502
+ POS = e.get_POS();
2503
+
2504
+ e.parse_genotype_entry(ui, true);
2505
+ e.get_indv_GENOTYPE_ids(ui, alleles);
2506
+
2507
+ if (e.get_indv_ploidy(ui) != 2)
2508
+ {
2509
+ one_off_warning("\tLROH: Only using diploid sites.");
2510
+ continue;
2511
+ }
2512
+
2513
+ if ((alleles.first == -1) || (alleles.second == -1))
2514
+ continue;
2515
+
2516
+ unsigned int X = alleles.first + alleles.second;
2517
+
2518
+ // Calculate heterozyogosity of this site.
2519
+ // TODO: Would be better to do this once, but for simplicity, do it for each individual.
2520
+ unsigned int N_genotypes = 0;
2521
+ unsigned int N_hets = 0;
2522
+ for (unsigned int uj=0; uj<N_indv; uj++)
2523
+ {
2524
+ if ((include_indv[uj] == false) || (include_genotype[s][ui] == false))
2525
+ continue;
2526
+
2527
+ e.parse_genotype_entry(uj, true);
2528
+ e.get_indv_GENOTYPE_ids(uj, alleles);
2529
+ if ((alleles.first != -1) && (alleles.second != -1))
2530
+ {
2531
+ N_genotypes++;
2532
+ if (alleles.first != alleles.second)
2533
+ N_hets++;
2534
+ }
2535
+ }
2536
+ double h = N_hets / double(N_genotypes);
2537
+ double p_emission_given_nonauto;
2538
+ double p_emission_given_auto;
2539
+
2540
+ if (X == 1)
2541
+ { // Heterozygote
2542
+ p_emission_given_nonauto = h;
2543
+ p_emission_given_auto = genotype_error_rate;
2544
+ p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto));
2545
+ }
2546
+ else
2547
+ { // Homozygote
2548
+ p_emission_given_nonauto = 1.0-h;
2549
+ p_emission_given_auto = 1.0-genotype_error_rate;
2550
+ p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto));
2551
+ }
2552
+
2553
+ double r = 0;
2554
+ if (last_POS > 0)
2555
+ { // Assume 1cM/Mb.
2556
+ r = (POS - last_POS) / 1000000.0 / 100.0; // Morgans
2557
+ }
2558
+
2559
+ double e = (1.0 - exp(-2.0*nGen*r));
2560
+ double p_trans_auto_to_nonauto = (1.0 - p_auto_prior) * e; //A[1]
2561
+ double p_trans_nonauto_to_auto = p_auto_prior * e; //A[2]
2562
+ double p_trans_auto_to_auto = 1.0 - p_trans_nonauto_to_auto; //A[0]
2563
+ double p_trans_nonauto_to_nonauto = 1.0 - p_trans_auto_to_nonauto; // A[3]
2564
+ vector<double> A(4);
2565
+ A[0] = p_trans_auto_to_auto;
2566
+ A[1] = p_trans_auto_to_nonauto;
2567
+ A[2] = p_trans_nonauto_to_auto;
2568
+ A[3] = p_trans_nonauto_to_nonauto;
2569
+
2570
+ s_vector.push_back(s);
2571
+
2572
+ p_trans.push_back(A);
2573
+ last_POS = POS;
2574
+ }
2575
+
2576
+ // Forward-backward algorithm
2577
+ int N_obs = (int)p_emission.size();
2578
+ if (N_obs == 0)
2579
+ continue;
2580
+
2581
+ vector<vector<double> > alpha(N_obs, vector<double>(2,0));
2582
+ vector<vector<double> > beta(N_obs, vector<double>(2,0));
2583
+
2584
+ alpha[0][0] = p_emission[0].first;
2585
+ alpha[0][1] = p_emission[0].second;
2586
+ for (int i=1; i<N_obs; i++)
2587
+ {
2588
+ alpha[i][0] = alpha[i-1][0] * p_trans[i-1][0] * p_emission[i].first;
2589
+ alpha[i][0] += alpha[i-1][1] * p_trans[i-1][2] * p_emission[i].first;
2590
+
2591
+ alpha[i][1] = alpha[i-1][1] * p_trans[i-1][3] * p_emission[i].second;
2592
+ alpha[i][1] += alpha[i-1][0] * p_trans[i-1][1] * p_emission[i].second;
2593
+
2594
+ while (alpha[i][0] + alpha[i][1] < 1e-20)
2595
+ { // Renormalise to prevent underflow
2596
+ alpha[i][0] *= 1e20;
2597
+ alpha[i][1] *= 1e20;
2598
+ }
2599
+ }
2600
+
2601
+ beta[N_obs-1][0] = 1.0;
2602
+ beta[N_obs-1][1] = 1.0;
2603
+ for (int i=N_obs-2; i>=0; i--)
2604
+ {
2605
+ beta[i][0] = beta[i+1][0] * p_trans[i][0] * p_emission[i].first;
2606
+ beta[i][0] += beta[i+1][1] * p_trans[i][2] * p_emission[i].first;
2607
+
2608
+ beta[i][1] = beta[i+1][1] * p_trans[i][3] * p_emission[i].second;
2609
+ beta[i][1] += beta[i+1][0] * p_trans[i][1] * p_emission[i].second;
2610
+
2611
+ while (beta[i][0] + beta[i][1] < 1e-20)
2612
+ { // Renormalise to prevent underflow
2613
+ beta[i][0] *= 1e20;
2614
+ beta[i][1] *= 1e20;
2615
+ }
2616
+ }
2617
+
2618
+ // Calculate probability of each site being autozygous
2619
+ vector<double> p_auto(N_obs);
2620
+ for (int i=0; i<N_obs; i++)
2621
+ {
2622
+ p_auto[i] = alpha[i][0] * beta[i][0] / (alpha[i][0] * beta[i][0] + alpha[i][1] * beta[i][1]);
2623
+ }
2624
+
2625
+ // Generate output
2626
+ // TODO: Would be good to report actual limits of homozygosity
2627
+ // (i.e. extend regions out until first heterozygote),
2628
+ // as opposed to regions with p>threshold.
2629
+ // TODO: Also would be good to report heterozygotic SNPs found in homozygotic regions.
2630
+ bool in_auto=false;
2631
+ int start_pos=0, end_pos=0;
2632
+ int N_SNPs = 0;
2633
+ for (int i=0; i<N_obs; i++)
2634
+ {
2635
+ if (p_auto[i] > p_auto_threshold)
2636
+ {
2637
+ if (in_auto == false)
2638
+ { // Start of autozygous region
2639
+ unsigned int s = s_vector[i];
2640
+ get_vcf_entry(s, vcf_line);
2641
+ e.reset(vcf_line);
2642
+ e.parse_basic_entry(true);
2643
+ CHROM = e.get_CHROM();
2644
+ start_pos = e.get_POS();
2645
+ }
2646
+ N_SNPs++;
2647
+ in_auto = true;
2648
+ }
2649
+ else
2650
+ {
2651
+ if (in_auto == true)
2652
+ { // end of autozygous region
2653
+ unsigned int s = s_vector[i];
2654
+ get_vcf_entry(s, vcf_line);
2655
+ e.reset(vcf_line);
2656
+ e.parse_basic_entry(true);
2657
+ end_pos = e.get_POS();
2658
+ if (N_SNPs >= min_SNPs)
2659
+ out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl;
2660
+ }
2661
+ in_auto = false;
2662
+ N_SNPs = 0;
2663
+ }
2664
+ }
2665
+ if (in_auto == true)
2666
+ { // Report final region if needed
2667
+ unsigned int s = s_vector[N_obs-1];
2668
+ get_vcf_entry(s, vcf_line);
2669
+ e.reset(vcf_line);
2670
+ e.parse_basic_entry(true);
2671
+ end_pos = e.get_POS();
2672
+ if (N_SNPs >= min_SNPs)
2673
+ out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl;
2674
+ }
2675
+ }
2676
+ out.close();
2677
+ }
2678
+
2679
+ void vcf_file::output_indv_relatedness(const string &output_file_prefix)
2680
+ {
2681
+ // Calculate and output a relatedness statistic based on the method of
2682
+ // Yang et al, 2010 (doi:10.1038/ng.608). Specifically, calculate the
2683
+ // unadjusted Ajk statistic (equation 6 of paper).
2684
+ // Expectation of Ajk is zero for individuals within a populations, and
2685
+ // one for an individual with themselves.
2686
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2687
+ error("Require Genotypes in VCF file in order to output Individual Relatedness.");
2688
+
2689
+ printLOG("Outputting Individual Relatedness\n");
2690
+ string output = output_file_prefix + ".relatedness";
2691
+ ofstream out(output.c_str());
2692
+ if (!out.is_open())
2693
+ error("Could not open Individual Relatedness Output File: " + output, 2);
2694
+ out << "INDV1\tINDV2\tRELATEDNESS" << endl;
2695
+
2696
+ string vcf_line;
2697
+ vcf_entry e(N_indv);
2698
+ vector<int> allele_counts;
2699
+ unsigned int N_alleles, N_non_missing_chr;
2700
+ double freq;
2701
+ pair<int, int> geno_id;
2702
+ vector<vector<double> > Ajk(N_indv, vector<double>(N_indv, 0.0));
2703
+ vector<vector<double> > N_sites(N_indv, vector<double>(N_indv, 0.0));
2704
+
2705
+ for (unsigned int s=0; s<N_entries; s++)
2706
+ {
2707
+ if (include_entry[s] == false)
2708
+ continue;
2709
+
2710
+ get_vcf_entry(s, vcf_line);
2711
+ e.reset(vcf_line);
2712
+
2713
+ e.parse_basic_entry(true);
2714
+ N_alleles = e.get_N_alleles();
2715
+
2716
+ if (N_alleles != 2)
2717
+ {
2718
+ one_off_warning("\tRelatedness: Only using biallelic sites.");
2719
+ continue; // Only use biallelic loci
2720
+ }
2721
+
2722
+ e.parse_genotype_entries(true);
2723
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2724
+ {
2725
+ one_off_warning("\tRelatedness: Only using fully diploid sites.");
2726
+ continue;
2727
+ }
2728
+
2729
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2730
+ freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
2731
+
2732
+ if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
2733
+ continue;
2734
+
2735
+ vector<double> x(N_indv, -1.0);
2736
+ for (unsigned int ui=0; ui<N_indv; ui++)
2737
+ {
2738
+ if (include_indv[ui] == false)
2739
+ continue;
2740
+
2741
+ e.get_indv_GENOTYPE_ids(ui, geno_id);
2742
+ x[ui] = geno_id.first + geno_id.second;
2743
+ }
2744
+
2745
+ double div = 1.0/(2.0*freq*(1.0-freq));
2746
+ for (unsigned int ui=0; ui<N_indv; ui++)
2747
+ {
2748
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (x[ui] < 0))
2749
+ continue;
2750
+ Ajk[ui][ui] += (x[ui]*x[ui] - (1 + 2.0*freq)*x[ui] + 2.0*freq*freq) * div;
2751
+ N_sites[ui][ui]++;
2752
+ for (unsigned int uj=(ui+1); uj<N_indv; uj++)
2753
+ {
2754
+ if ((include_indv[uj] == false) || (include_genotype[s][uj] == false) || (x[uj] < 0))
2755
+ continue;
2756
+ Ajk[ui][uj] += (x[ui] - 2.0*freq) * (x[uj] - 2.0*freq) * div;
2757
+ N_sites[ui][uj]++;
2758
+ }
2759
+ }
2760
+ }
2761
+
2762
+ for (unsigned int ui=0; ui<N_indv; ui++)
2763
+ {
2764
+ if (include_indv[ui] == false)
2765
+ continue;
2766
+ Ajk[ui][ui] = 1.0 + (Ajk[ui][ui] / N_sites[ui][ui]);
2767
+ out << indv[ui] << "\t" << indv[ui] << "\t" << Ajk[ui][ui] << endl;
2768
+ for (unsigned int uj=(ui+1); uj<N_indv; uj++)
2769
+ {
2770
+ if (include_indv[uj] == false)
2771
+ continue;
2772
+ Ajk[ui][uj] /= N_sites[ui][uj];
2773
+ out << indv[ui] << "\t" << indv[uj] << "\t" << Ajk[ui][uj] << endl;
2774
+ }
2775
+ }
2776
+
2777
+ out.close();
2778
+ }
2779
+
2780
+ void vcf_file::output_PCA(const string &output_file_prefix, bool use_normalisation, int SNP_loadings_N_PCs)
2781
+ {
2782
+ #ifndef VCFTOOLS_PCA
2783
+ use_normalisation = true;
2784
+ SNP_loadings_N_PCs = -1;
2785
+ string out = output_file_prefix;
2786
+ out = "Cannot run PCA analysis. Vcftools has been compiled without PCA enabled (requires LAPACK).";
2787
+ error(out);
2788
+ #else
2789
+ // Output PCA, following method of Patterson, Price and Reich 2006.
2790
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2791
+ error("Require Genotypes in VCF file in order to perform PCA.");
2792
+
2793
+ if (use_normalisation)
2794
+ printLOG("Outputting Principal Component Analysis (with normalisation)\n");
2795
+ else
2796
+ printLOG("Outputting Principal Component Analysis (without normalisation)\n");
2797
+ string output = output_file_prefix + ".pca";
2798
+ ofstream out(output.c_str());
2799
+ if (!out.is_open())
2800
+ error("Could not open Principal Component Analysis Output File: " + output, 2);
2801
+
2802
+ unsigned int N_indvs = N_kept_individuals();
2803
+ unsigned int N_sites = N_kept_sites();
2804
+
2805
+ if (N_indvs >= N_sites)
2806
+ error("PCA computation requires that there are more sites than individuals.");
2807
+
2808
+ string vcf_line;
2809
+ vcf_entry e(N_indv);
2810
+ pair<int, int> geno_id;
2811
+ double x, freq;
2812
+ vector<int> allele_counts;
2813
+ unsigned int N_alleles, N_non_missing_chr;
2814
+
2815
+ // Store list of included individuals
2816
+ vector<string> included_indvs(N_indvs);
2817
+ unsigned int ui_prime = 0;
2818
+ for (unsigned int ui=0; ui<N_indv; ui++)
2819
+ {
2820
+ if (include_indv[ui] == false)
2821
+ continue;
2822
+ included_indvs[ui_prime] = indv[ui];
2823
+ ui_prime++;
2824
+ }
2825
+
2826
+ // Potentially uses a lot of memory. Should issue a warning about this.
2827
+ double **M = new double*[N_indvs]; // m rows = indv
2828
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2829
+ M[ui] = new double[N_sites]; // n columns
2830
+
2831
+ // Populate M
2832
+ unsigned int s_prime = 0;
2833
+ for (unsigned int s=0; s<N_entries; s++)
2834
+ {
2835
+ if (include_entry[s]==false)
2836
+ continue;
2837
+
2838
+ get_vcf_entry(s, vcf_line);
2839
+ e.reset(vcf_line);
2840
+
2841
+ e.parse_basic_entry(true);
2842
+ N_alleles = e.get_N_alleles();
2843
+ if (N_alleles != 2)
2844
+ error("PCA only works for biallelic sites.");
2845
+
2846
+ e.parse_genotype_entries(true);
2847
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2848
+ error("PCA only works for fully diploid sites.");
2849
+
2850
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2851
+ freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
2852
+
2853
+ if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
2854
+ continue;
2855
+
2856
+ double mu = freq*2.0;
2857
+ double div = 1.0 / sqrt(freq * (1.0-freq));
2858
+
2859
+ ui_prime = 0;
2860
+ for (unsigned int ui=0; ui<N_indv; ui++)
2861
+ {
2862
+ if (include_indv[ui] == false)
2863
+ continue;
2864
+
2865
+ e.get_indv_GENOTYPE_ids(ui, geno_id);
2866
+ x = geno_id.first + geno_id.second;
2867
+ if (x > -1)
2868
+ {
2869
+ if (use_normalisation == true)
2870
+ M[ui_prime][s_prime] = (x - mu) * div;
2871
+ else
2872
+ M[ui_prime][s_prime] = (x - mu);
2873
+ }
2874
+ ui_prime++;
2875
+ }
2876
+ s_prime++;
2877
+ }
2878
+
2879
+ // Now construct X = (1/n)MM'.
2880
+ double **X = new double *[N_indvs];
2881
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2882
+ X[ui] = new double[N_indvs];
2883
+
2884
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2885
+ for (unsigned int uj=0; uj<N_indvs; uj++)
2886
+ X[ui][uj] = 0;
2887
+
2888
+ // Only populate one half of matrix
2889
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2890
+ for (unsigned int uj=ui; uj<N_indvs; uj++)
2891
+ for (unsigned int s=0; s<N_sites; s++)
2892
+ X[ui][uj] += M[ui][s] * M[uj][s];
2893
+
2894
+ delete [] M;
2895
+
2896
+ // Populate other half
2897
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2898
+ for (unsigned int uj=0; uj<ui; uj++)
2899
+ X[ui][uj] = X[uj][ui];
2900
+
2901
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2902
+ for (unsigned int uj=0; uj<N_indvs; uj++)
2903
+ X[ui][uj] /= N_sites;
2904
+
2905
+ double *Er = new double[N_indvs];
2906
+ double *Ei = new double[N_indvs];
2907
+ double **Evecs = new double*[N_indvs];
2908
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2909
+ Evecs[ui] = new double[N_indvs];
2910
+
2911
+ // Call LAPACK routine to calculate eigenvectors and eigenvalues
2912
+ dgeev(X, N_indvs, Er, Ei, Evecs);
2913
+
2914
+ // Check there are no complex eigenvalues.
2915
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2916
+ if (Ei[ui] != 0)
2917
+ error("Complex eigenvalue.");
2918
+
2919
+ // Output results
2920
+ out << "INDV";
2921
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2922
+ out << "\tEIG_" << ui;
2923
+ out << endl;
2924
+
2925
+ out << "EIGENVALUE";
2926
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2927
+ out << "\t" << Er[ui];
2928
+ out << endl;
2929
+
2930
+ // Output eigenvectors (as columns)
2931
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2932
+ {
2933
+ out << included_indvs[ui];
2934
+ for (unsigned int uj=0; uj<N_indvs; uj++)
2935
+ out << "\t" << Evecs[ui][uj];
2936
+ out << endl;
2937
+ }
2938
+
2939
+ out.close();
2940
+
2941
+ if (SNP_loadings_N_PCs > 0)
2942
+ { // Output SNP loadings
2943
+ printLOG("Outputting " + int2str(SNP_loadings_N_PCs) + " SNP loadings\n");
2944
+ output = output_file_prefix + ".pca.loadings";
2945
+ out.open(output.c_str());
2946
+ if (!out.good())
2947
+ error("Could not open Principal Component SNP Loading Output File: " + output, 2);
2948
+ out << "CHROM\tPOS";
2949
+ for (unsigned int ui=0; ui<(unsigned int)SNP_loadings_N_PCs; ui++)
2950
+ out << "\tGAMMA_" << ui;
2951
+ out << endl;
2952
+
2953
+ for (unsigned int s=0; s<N_entries; s++)
2954
+ {
2955
+ if (include_entry[s]==false)
2956
+ continue;
2957
+
2958
+ get_vcf_entry(s, vcf_line);
2959
+ e.reset(vcf_line);
2960
+
2961
+ e.parse_basic_entry(true);
2962
+ N_alleles = e.get_N_alleles();
2963
+ if (N_alleles != 2)
2964
+ error("PCA only works for biallelic sites.");
2965
+
2966
+ e.parse_genotype_entries(true);
2967
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2968
+ error("PCA only works for fully diploid sites.");
2969
+
2970
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2971
+ freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
2972
+
2973
+ if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
2974
+ continue;
2975
+
2976
+ vector<double> gamma(SNP_loadings_N_PCs, 0.0);
2977
+ vector<double> a_sum(SNP_loadings_N_PCs, 0.0);
2978
+
2979
+ ui_prime = 0;
2980
+ for (unsigned int ui=0; ui<N_indv; ui++)
2981
+ {
2982
+ if (include_indv[ui] == false)
2983
+ continue;
2984
+
2985
+ e.get_indv_GENOTYPE_ids(ui, geno_id);
2986
+ x = geno_id.first + geno_id.second;
2987
+ if (x > -1)
2988
+ {
2989
+ for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++)
2990
+ {
2991
+ gamma[uj] += (x * Evecs[ui_prime][uj]);
2992
+ a_sum[uj] += (Evecs[ui_prime][uj]*Evecs[ui_prime][uj]);
2993
+ }
2994
+ }
2995
+ ui_prime++;
2996
+ }
2997
+
2998
+ out << e.get_CHROM() << "\t" << e.get_POS();
2999
+ for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++)
3000
+ out << "\t" << gamma[uj] / a_sum[uj];
3001
+ out << endl;
3002
+ }
3003
+ out.close();
3004
+ }
3005
+
3006
+ delete [] Er;
3007
+ delete [] Ei;
3008
+ delete [] Evecs;
3009
+ delete [] X;
3010
+ #endif
3011
+ }
3012
+