ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,3012 @@
1
+ /*
2
+ * vcf_file_output.cpp
3
+ *
4
+ * Created on: Aug 28, 2009
5
+ * Author: Adam Auton
6
+ * ($Revision: 249 $)
7
+ */
8
+ #include "vcf_file.h"
9
+
10
+ void vcf_file::output_frequency(const string &output_file_prefix, bool output_counts, bool suppress_allele_output)
11
+ {
12
+ // Output statistics of frequency at each site
13
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
14
+ error("Require Genotypes in VCF file in order to output Frequency Statistics.");
15
+
16
+ printLOG("Outputting Frequency Statistics...\n");
17
+ string output_file = output_file_prefix + ".frq";
18
+ if (output_counts)
19
+ output_file += ".count";
20
+
21
+ ofstream out(output_file.c_str());
22
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
23
+ if (suppress_allele_output == false)
24
+ {
25
+ out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{ALLELE:";
26
+ if (output_counts)
27
+ out << "COUNT}" << endl;
28
+ else
29
+ out << "FREQ}" << endl;
30
+ }
31
+ else
32
+ {
33
+ if (output_counts)
34
+ out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{COUNT}" << endl;
35
+ else
36
+ out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{FREQ}" << endl;
37
+ }
38
+
39
+ vector<int> allele_counts;
40
+ unsigned int N_non_missing_chr;
41
+ unsigned int N_alleles;
42
+ string vcf_line;
43
+ vcf_entry e(N_indv);
44
+ for (unsigned int s=0; s<N_entries; s++)
45
+ {
46
+ if (include_entry[s] == false)
47
+ continue;
48
+
49
+ get_vcf_entry(s, vcf_line);
50
+ e.reset(vcf_line);
51
+ e.parse_basic_entry(true);
52
+ e.parse_genotype_entries(true);
53
+ N_alleles = e.get_N_alleles();
54
+
55
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
56
+
57
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << N_alleles << "\t" << N_non_missing_chr;
58
+ if (output_counts)
59
+ {
60
+ if (suppress_allele_output == false)
61
+ {
62
+ out << "\t" << e.get_REF() << ":" << allele_counts[0];
63
+ for (unsigned int ui=1; ui<N_alleles; ui++)
64
+ {
65
+ out << "\t" << e.get_ALT_allele(ui-1) << ":" << allele_counts[ui];
66
+ }
67
+ out << endl;
68
+ }
69
+ else
70
+ {
71
+ for (unsigned ui=0; ui<N_alleles; ui++)
72
+ {
73
+ out << "\t" << allele_counts[ui];
74
+ }
75
+ out << endl;
76
+ }
77
+ }
78
+ else
79
+ {
80
+ double freq;
81
+ if (suppress_allele_output == false)
82
+ {
83
+ freq = allele_counts[0] / (double)N_non_missing_chr;
84
+ out << "\t" << e.get_REF() << ":" << freq;
85
+ for (unsigned int ui=1; ui<N_alleles; ui++)
86
+ {
87
+ freq = allele_counts[ui] / (double)N_non_missing_chr;
88
+ out << "\t" << e.get_ALT_allele(ui-1) << ":" << freq;
89
+ }
90
+ out << endl;
91
+ }
92
+ else
93
+ {
94
+ for (unsigned int ui=0; ui<N_alleles; ui++)
95
+ {
96
+ freq = allele_counts[ui] / (double)N_non_missing_chr;
97
+ out << "\t" << freq;
98
+ }
99
+ out << endl;
100
+ }
101
+ }
102
+ }
103
+ out.close();
104
+ }
105
+
106
+ void vcf_file::output_het(const string &output_file_prefix)
107
+ {
108
+ // Output statistics on Heterozygosity for each individual
109
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
110
+ error("Require Genotypes in VCF file in order to output Heterozygosity Statistics.");
111
+ // Following the calculations in PLINK....
112
+ // Note this assumes Biallelic SNPs.
113
+
114
+ printLOG("Outputting Individual Heterozygosity\n");
115
+
116
+ string output_file = output_file_prefix + ".het";
117
+ ofstream out(output_file.c_str());
118
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
119
+ out << "INDV\tO(HOM)\tE(HOM)\tN_SITES\tF" << endl;
120
+
121
+ // P(Homo) = F + (1-F)P(Homo by chance)
122
+ // P(Homo by chance) = p^2+q^2 for a biallelic locus.
123
+ // For an individual with N genotyped loci, we
124
+ // 1. count the total observed number of loci which are homozygous (O),
125
+ // 2. calculate the total expected number of loci homozygous by chance (E)
126
+ // Then, using the method of moments, we have
127
+ // O = NF + (1-F)E
128
+ // Which rearranges to give
129
+ // F = (O-E)/(N-E)
130
+
131
+ // First, calc frequency of each site (should really move this to a subroutine)
132
+ vector<double> freq(N_entries, 0.0);
133
+ vector<int> allele_counts;
134
+ vector<unsigned int> N_non_missing_chr(N_entries,0);
135
+ string vcf_line;
136
+ vcf_entry e(N_indv);
137
+ for (unsigned int s=0; s<N_entries; s++)
138
+ {
139
+ if (include_entry[s] == false)
140
+ continue;
141
+
142
+ get_vcf_entry(s, vcf_line);
143
+ e.reset(vcf_line);
144
+ e.parse_basic_entry(true);
145
+
146
+ if (e.get_N_alleles() != 2)
147
+ {
148
+ one_off_warning("\tIndividual Heterozygosity: Only using biallelic SNPs.");
149
+ continue;
150
+ }
151
+
152
+ e.parse_genotype_entries(true);
153
+
154
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
155
+ {
156
+ one_off_warning("\tIndividual Heterozygosity: Only using fully diploid SNPs.");
157
+ continue;
158
+ }
159
+
160
+ // Frequency of non-reference allele
161
+ e.get_allele_counts(allele_counts, N_non_missing_chr[s], include_indv, include_genotype[s]);
162
+
163
+ if (N_non_missing_chr[s] > 0)
164
+ freq[s] = allele_counts[1] / double(N_non_missing_chr[s]);
165
+ else
166
+ freq[s] = -1;
167
+ }
168
+
169
+ vector<int> N_sites_included(N_indv, 0);
170
+ vector<int> N_obs_hom(N_indv, 0);
171
+ vector<double> N_expected_hom(N_indv, 0.0);
172
+ pair<int, int> alleles;
173
+
174
+ for (unsigned int s=0; s<N_entries; s++)
175
+ {
176
+ if (include_entry[s] == false)
177
+ continue;
178
+
179
+ get_vcf_entry(s, vcf_line);
180
+ e.reset(vcf_line);
181
+ e.parse_basic_entry(true);
182
+
183
+ if (e.get_N_alleles() != 2)
184
+ continue;
185
+
186
+ e.parse_genotype_entries(true);
187
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
188
+ continue;
189
+
190
+ if ((freq[s] <= numeric_limits<double>::epsilon()) || (1.0 - freq[s] <= numeric_limits<double>::epsilon()))
191
+ continue;
192
+
193
+ for (unsigned int ui=0; ui<N_indv; ui++)
194
+ {
195
+ if (include_indv[ui] == false)
196
+ continue;
197
+
198
+ if (include_genotype[s][ui] == true)
199
+ {
200
+ e.get_indv_GENOTYPE_ids(ui, alleles);
201
+ if ((alleles.first != -1) && (alleles.second != -1))
202
+ {
203
+ N_sites_included[ui]++;
204
+ if (alleles.first == alleles.second)
205
+ N_obs_hom[ui]++;
206
+ }
207
+
208
+ /////////////////////////
209
+ // Expected homozygosity
210
+ // E = 1 - (2pq . 2N/(2N-1))
211
+ // (Using Nei's unbiased estimator)
212
+ N_expected_hom[ui] += 1.0 - (2.0 * freq[s] * (1.0 - freq[s]) * (N_non_missing_chr[s] / (N_non_missing_chr[s] - 1.0)));
213
+ }
214
+ }
215
+ }
216
+
217
+ out.setf(ios::fixed,ios::floatfield);
218
+ for (unsigned int ui=0; ui<N_indv; ui++)
219
+ {
220
+ if (include_indv[ui] == false)
221
+ continue;
222
+ if (N_sites_included[ui] > 0)
223
+ {
224
+ double F = (N_obs_hom[ui] - N_expected_hom[ui]) / double(N_sites_included[ui] - N_expected_hom[ui]);
225
+ out << indv[ui] << "\t" << N_obs_hom[ui] << "\t";
226
+ out.precision(1);
227
+ out << N_expected_hom[ui] << "\t";
228
+ out.precision(5);
229
+ out << N_sites_included[ui] << "\t" << F << endl;
230
+ }
231
+ }
232
+
233
+ out.close();
234
+ }
235
+
236
+ void vcf_file::output_hwe(const string &output_file_prefix)
237
+ {
238
+ // Output HWE statistics for each site as described in Wigginton, Cutler, and Abecasis (2005)
239
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
240
+ error("Require Genotypes in VCF file in order to output HWE Statistics.");
241
+ // Note this assumes Biallelic SNPs.
242
+ printLOG("Outputting HWE statistics (but only for biallelic loci)\n");
243
+
244
+ string output_file = output_file_prefix + ".hwe";
245
+ ofstream out(output_file.c_str());
246
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
247
+ out << "CHR\tPOS\tOBS(HOM1/HET/HOM2)\tE(HOM1/HET/HOM2)\tChiSq\tP" << endl;
248
+
249
+ /* PLINK code:
250
+ // b11 = Nhom1, b12 = Nhet, b22 = Nhom2
251
+ double tot = b11 + b12 + b22;
252
+ double exp_11 = freq * freq * tot;
253
+ double exp_12 = 2 * freq * (1-freq) * tot;
254
+ double exp_22 = (1-freq) * (1-freq) * tot;
255
+
256
+ double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11
257
+ + ( (b12-exp_12)*(b12-exp_12) ) / exp_12
258
+ + ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ;
259
+
260
+ p = chiprobP(chisq,1);
261
+ */
262
+
263
+ double freq;
264
+ unsigned int b11, b12, b22;
265
+ double exp_11, exp_12, exp_22;
266
+ double chisq;
267
+ double tot;
268
+ double p;
269
+ unsigned int precision = out.precision();
270
+ vector<int> allele_counts;
271
+ unsigned int N_non_missing_chr;
272
+ string vcf_line;
273
+ vcf_entry e(N_indv);
274
+ for (unsigned int s=0; s<N_entries; s++)
275
+ {
276
+ if (include_entry[s] == false)
277
+ continue;
278
+
279
+ get_vcf_entry(s, vcf_line);
280
+ e.reset(vcf_line);
281
+ e.parse_basic_entry(true);
282
+
283
+ if (e.get_N_alleles() != 2)
284
+ {
285
+ one_off_warning("\tHWE: Only using biallelic SNPs.");
286
+ continue; // Isn't biallelic
287
+ }
288
+
289
+ e.parse_genotype_entries(true);
290
+
291
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
292
+ {
293
+ one_off_warning("\tHWE: Only using fully diploid SNPs.");
294
+ continue; // Isn't diploid
295
+ }
296
+
297
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
298
+ freq = allele_counts[0] / (double)N_non_missing_chr;
299
+ e.get_genotype_counts(include_indv, include_genotype[s], b11, b12, b22);
300
+ tot = b11 + b12 + b22;
301
+ exp_11 = freq * freq * tot;
302
+ exp_12 = 2.0 * freq * (1.0-freq) * tot;
303
+ exp_22 = (1.0-freq) * (1.0-freq) * tot;
304
+
305
+ chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11
306
+ + ( (b12-exp_12)*(b12-exp_12) ) / exp_12
307
+ + ( (b22-exp_22)*(b22-exp_22) ) / exp_22;
308
+
309
+ p = vcf_entry::SNPHWE(b12, b11, b22);
310
+ out << e.get_CHROM() << "\t" << e.get_POS();
311
+ out << "\t" << b11 << "/" << b12 << "/" << b22;
312
+ out.precision(2);
313
+ out << fixed << "\t" << exp_11 << "/" << exp_12 << "/" << exp_22;
314
+ out.precision(precision);
315
+ out << "\t" << chisq << "\t" << p << endl;
316
+ }
317
+ }
318
+
319
+ void vcf_file::output_individuals_by_mean_depth(const string &output_file_prefix)
320
+ {
321
+ // Output information regarding the mean depth for each individual
322
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
323
+ error("Require Genotypes in VCF file in order to output Individuals by Mean Depth Statistics.");
324
+
325
+ printLOG("Outputting Mean Depth by Individual\n");
326
+ string output = output_file_prefix + ".idepth";
327
+ ofstream out(output.c_str());
328
+ if (!out.is_open())
329
+ error("Could not open Individual Depth Output File: " + output, 2);
330
+ out << "INDV\tN_SITES\tMEAN_DEPTH" << endl;
331
+ vector<double> depth_sum(N_indv, 0.0);
332
+ vector<int> count(N_indv, 0);
333
+ int depth;
334
+ string vcf_line;
335
+ vcf_entry e(N_indv);
336
+ for (unsigned int s=0; s<N_entries; s++)
337
+ {
338
+ if (include_entry[s] == false)
339
+ continue;
340
+
341
+ get_vcf_entry(s, vcf_line);
342
+ e.reset(vcf_line);
343
+
344
+ for (unsigned int ui=0; ui<N_indv; ui++)
345
+ {
346
+ if (include_indv[ui] == false)
347
+ continue;
348
+
349
+ if (include_genotype[s][ui] == true)
350
+ {
351
+ e.parse_genotype_entry(ui, false, false, true);
352
+ depth = e.get_indv_DEPTH(ui);
353
+ if (depth >= 0)
354
+ {
355
+ depth_sum[ui] += depth;
356
+ count[ui]++;
357
+ }
358
+ }
359
+ }
360
+ }
361
+
362
+ for (unsigned int ui=0; ui<N_indv; ui++)
363
+ {
364
+ if (include_indv[ui] == false)
365
+ continue;
366
+
367
+ double mean_depth = depth_sum[ui] / count[ui];
368
+ out << indv[ui] << "\t" << count[ui] << "\t" << mean_depth << endl;
369
+ }
370
+
371
+ out.close();
372
+ }
373
+
374
+ void vcf_file::output_SNP_density(const string &output_file_prefix, int bin_size)
375
+ {
376
+ // Output SNP density (technically variant density)
377
+ if (bin_size <= 0)
378
+ return;
379
+ printLOG("Outputting SNP density\n");
380
+
381
+ string output = output_file_prefix + ".snpden";
382
+ ofstream out(output.c_str());
383
+ if (!out.is_open())
384
+ error("Could not open SNP Density Output File: " + output, 2);
385
+
386
+ // Find maximum position
387
+ unsigned int s;
388
+ map<string, int> max_pos;
389
+ string vcf_line;
390
+ string CHROM; int POS;
391
+ vcf_entry e(N_indv);
392
+ for (s=0; s<N_entries; s++)
393
+ {
394
+ if (include_entry[s] == true)
395
+ {
396
+ //get_vcf_entry(s, vcf_line);
397
+ //e.reset(vcf_line);
398
+ //e.parse_basic_entry();
399
+
400
+ //CHROM = e.get_CHROM();
401
+ //POS = e.get_POS();
402
+
403
+ set_filepos(entry_file_locations[s]);
404
+ read_CHROM_and_POS_only(CHROM, POS);
405
+ if (max_pos.find(CHROM) != max_pos.end())
406
+ {
407
+ if (POS > max_pos[CHROM])
408
+ max_pos[CHROM] = POS;
409
+ }
410
+ else
411
+ max_pos[CHROM] = POS;
412
+ }
413
+ }
414
+
415
+ map<string, int>::iterator it;
416
+
417
+ unsigned int N_bins;
418
+ map<string, vector<int> > bins;
419
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
420
+ {
421
+ CHROM = (*it).first;
422
+ N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size));
423
+ bins[CHROM].resize(N_bins, 0);
424
+ }
425
+
426
+
427
+ unsigned int idx;
428
+ double C = 1.0 / double(bin_size);
429
+ for (s=0; s<N_entries; s++)
430
+ {
431
+ if (include_entry[s] == true)
432
+ {
433
+ //get_vcf_entry(s, vcf_line);
434
+ //e.reset(vcf_line);
435
+ //e.parse_basic_entry();
436
+
437
+ //CHROM = e.get_CHROM();
438
+ //POS = e.get_POS();
439
+ set_filepos(entry_file_locations[s]);
440
+ read_CHROM_and_POS_only(CHROM, POS);
441
+ idx = (unsigned int)(POS * C);
442
+ bins[CHROM][idx]++;
443
+ }
444
+ }
445
+
446
+ out << "CHROM\tBIN_START\tSNP_COUNT\tSNPS/KB" << endl;
447
+ double sum1=0.0, sum2=0.0;
448
+ int bin_tot;
449
+ C = 1000.0 / bin_size;
450
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
451
+ {
452
+ bool output = false;
453
+ CHROM = (*it).first;
454
+ sum2 += max_pos[CHROM];
455
+ for (s=0; s<bins[CHROM].size(); s++)
456
+ {
457
+ bin_tot = bins[CHROM][s];
458
+ sum1 += bin_tot;
459
+ if (bin_tot > 0)
460
+ output = true;
461
+ if (output == true)
462
+ out << CHROM << "\t" << s*bin_size << "\t" << bin_tot << "\t" << bin_tot * C << endl;
463
+ }
464
+ }
465
+ out.close();
466
+
467
+ double mean_SNP_density = sum1 / sum2 * 1000;
468
+ printLOG("Mean SNP density: " + dbl2str(mean_SNP_density, 5) + " SNPs / kb\n");
469
+ }
470
+
471
+ void vcf_file::output_missingness(const string &output_file_prefix)
472
+ {
473
+ // Output missingness by individual and site
474
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
475
+ error("Require Genotypes in VCF file in order to output Missingness Statistics.");
476
+
477
+ printLOG("Outputting Site and Individual Missingness\n");
478
+ string output1 = output_file_prefix + ".imiss";
479
+ ofstream out1(output1.c_str());
480
+ if (!out1.is_open())
481
+ error("Could not open Individual Missingness Output File: " + output1, 3);
482
+
483
+ string output2 = output_file_prefix + ".lmiss";
484
+ ofstream out2(output2.c_str());
485
+ if (!out2.is_open())
486
+ error("Could not open Site Missingness Output File: " + output2, 4);
487
+
488
+ out1 << "INDV\tN_DATA\tN_GENOTYPES_FILTERED\tN_MISS\tF_MISS" << endl;
489
+ unsigned int ui, s;
490
+ vector<unsigned int> indv_N_missing(N_indv, 0), indv_N_tot(N_indv, 0);
491
+ vector<unsigned int> indv_N_geno_filtered(N_indv, 0);
492
+ unsigned int site_N_missing, site_N_tot, site_N_geno_filtered;
493
+ pair<int, int> alleles;
494
+ string vcf_line;
495
+ vcf_entry e(N_indv);
496
+
497
+ out2 << "CHR\tPOS\tN_DATA\tN_GENOTYPE_FILTERED\tN_MISS\tF_MISS" << endl;
498
+ for (s=0; s<N_entries; s++)
499
+ {
500
+ if (include_entry[s] == false)
501
+ continue;
502
+
503
+ get_vcf_entry(s, vcf_line);
504
+ e.reset(vcf_line);
505
+ e.parse_basic_entry();
506
+
507
+ site_N_missing = 0;
508
+ site_N_tot = 0;
509
+ site_N_geno_filtered = 0;
510
+ for (ui=0; ui<N_indv; ui++)
511
+ {
512
+ if (include_indv[ui] == false)
513
+ continue;
514
+ if (include_genotype[s][ui] == false)
515
+ {
516
+ site_N_geno_filtered++;
517
+ indv_N_geno_filtered[ui]++;
518
+ continue;
519
+ }
520
+
521
+ e.parse_genotype_entry(ui, true);
522
+ e.get_indv_GENOTYPE_ids(ui, alleles);
523
+ if (alleles.first == -1)
524
+ {
525
+ site_N_missing++;
526
+ indv_N_missing[ui]++;
527
+ }
528
+ indv_N_tot[ui]++;
529
+
530
+ if (alleles.second == -1)
531
+ {
532
+ site_N_missing++;
533
+ }
534
+ site_N_tot+=2;
535
+
536
+ if ((alleles.second == -1) && (e.get_indv_PHASE(ui) == '|'))
537
+ { // Phased missing genotypes indicate haploid genome
538
+ site_N_tot--;
539
+ }
540
+ }
541
+ out2 << e.get_CHROM() << "\t" << e.get_POS() << "\t" << site_N_tot << "\t" << site_N_geno_filtered << "\t";
542
+ out2 << site_N_missing << "\t" << double(site_N_missing) / double(site_N_tot) << endl;
543
+ }
544
+
545
+ for (ui=0; ui<N_indv; ui++)
546
+ {
547
+ if (include_indv[ui] == false)
548
+ continue;
549
+ out1 << indv[ui] << "\t" << indv_N_tot[ui] << "\t";
550
+ out1 << indv_N_geno_filtered[ui] << "\t" << indv_N_missing[ui] << "\t";
551
+ out1 << indv_N_missing[ui] / double(indv_N_tot[ui]) << endl;
552
+ }
553
+
554
+ out2.close();
555
+ out1.close();
556
+ }
557
+
558
+ void vcf_file::output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2)
559
+ {
560
+ // Output pairwise LD statistics, using traditional r^2. Requires phased haplotypes.
561
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
562
+ error("Require Genotypes in VCF file in order to output LD Statistics.");
563
+
564
+ unsigned int s, s2;
565
+ unsigned int ui;
566
+
567
+ printLOG("Outputting Pairwise LD (phased bi-allelic only)\n");
568
+ string output = output_file_prefix + ".hap.ld";
569
+ ofstream out(output.c_str());
570
+ if (!out.is_open())
571
+ error("Could not open LD Output File: " + output, 3);
572
+
573
+ out << "CHR\tPOS1\tPOS2\tN_CHR\tR^2\tD\tDprime" << endl;
574
+
575
+ //For D, D' computations
576
+ double D, Dmax, Dprime;
577
+ int x11, x12, x21, x22;
578
+ double p1, p2, q1, q2;
579
+ double rel_x11, rel_x12, rel_x21, rel_x22;
580
+
581
+ unsigned int chr_count;
582
+ double r2;
583
+ int sx, sy;
584
+ double X, X2, Y, Y2, XY;
585
+ double var1, var2, cov12;
586
+ pair<int,int> geno1, geno2;
587
+ string vcf_line, vcf_line2;
588
+ vcf_entry e(N_indv), e2(N_indv);
589
+ for (s=0; s<(N_entries-1); s++)
590
+ {
591
+ if (include_entry[s] == false)
592
+ continue;
593
+
594
+ get_vcf_entry(s, vcf_line);
595
+ e.reset(vcf_line);
596
+ e.parse_basic_entry(true);
597
+
598
+ if (e.get_N_alleles() != 2)
599
+ {
600
+ one_off_warning("\tLD: Only using biallelic SNPs.");
601
+ continue; // Isn't biallelic
602
+ }
603
+
604
+ for (s2 = s+1; s2<N_entries; s2++)
605
+ {
606
+ if (include_entry[s2] == false)
607
+ continue;
608
+
609
+ if (int(s2 - s) > snp_window_size)
610
+ {
611
+ s2 = N_entries; // SNPs sorted, so no need to go any further
612
+ continue;
613
+ }
614
+
615
+ get_vcf_entry(s2, vcf_line2);
616
+ e2.reset(vcf_line2);
617
+ e2.parse_basic_entry(true);
618
+
619
+ if (e.get_CHROM() != e2.get_CHROM())
620
+ {
621
+ s2 = N_entries; // No need to go any further (assuming SNPs are sorted)
622
+ continue;
623
+ }
624
+
625
+ if ((e2.get_POS() - e.get_POS()) > bp_window_size)
626
+ {
627
+ s2 = N_entries; // No need to go any further (assuming SNPs are sorted)
628
+ continue;
629
+ }
630
+
631
+ if (e2.get_N_alleles() != 2)
632
+ {
633
+ one_off_warning("\tLD: Only using biallelic SNPs.");
634
+ continue;
635
+ }
636
+
637
+ x11=0; x12=0; x21=0; x22=0;
638
+
639
+ X=0, X2=0; Y=0; Y2=0; XY=0;
640
+ chr_count = 0;
641
+ for (ui=0; ui<N_indv; ui++)
642
+ {
643
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
644
+ continue;
645
+
646
+ e.parse_genotype_entry(ui, true);
647
+ e.get_indv_GENOTYPE_ids(ui, geno1);
648
+
649
+ e2.parse_genotype_entry(ui, true);
650
+ e2.get_indv_GENOTYPE_ids(ui, geno2);
651
+
652
+ if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
653
+ {
654
+ one_off_warning("\tLD: Only using diploid individuals.");
655
+ continue;
656
+ }
657
+
658
+ if ((e.get_indv_PHASE(ui) != '|') || (e2.get_indv_PHASE(ui) != '|'))
659
+ error("Require phased haplotypes for r^2 calculation (use --phased)\n");
660
+
661
+ for (unsigned int c=0; c<2; c++)
662
+ {
663
+ int allele1, allele2;
664
+ if (c==0)
665
+ {
666
+ allele1 = geno1.first;
667
+ allele2 = geno2.first;
668
+ }
669
+ else
670
+ {
671
+ allele1 = geno1.second;
672
+ allele2 = geno2.second;
673
+ }
674
+
675
+ if ((allele1 == -1) || (allele2 == -1))
676
+ continue;
677
+
678
+ if (allele1 == 0 && allele2 == 0){
679
+ x11++;
680
+ } else if (allele1 == 0 && allele2 != 0){
681
+ x12++;
682
+ } else if (allele1 != 0 && allele2 == 0){
683
+ x21++;
684
+ } else { // (allele1 !=0 && allele2 != 0)
685
+ x22++;
686
+ }
687
+
688
+ sx=0, sy=0;
689
+ if (allele1 == 0)
690
+ sx += 1;
691
+
692
+ if (allele2 == 0)
693
+ sy += 1;
694
+
695
+ X += sx; Y += sy;
696
+ XY += sx*sy;
697
+ sx *= sx; sy *= sy;
698
+ X2 += sx;
699
+ Y2 += sy;
700
+
701
+ chr_count++;
702
+ }
703
+ }
704
+
705
+ rel_x11 = 1.0*x11/chr_count;
706
+ rel_x12 = 1.0*x12/chr_count;
707
+ rel_x21 = 1.0*x21/chr_count;
708
+ rel_x22 = 1.0*x22/chr_count;
709
+ p1 = rel_x11 + rel_x12;
710
+ p2 = rel_x21 + rel_x22;
711
+ q1 = rel_x11 + rel_x21;
712
+ q2 = rel_x12 + rel_x22;
713
+ D = rel_x11 - p1*q1;
714
+ if (D < 0){
715
+ Dmax = min(p1*q1,p2*q2);
716
+ } else {
717
+ Dmax = min(p1*q2,p2*q1);
718
+ };
719
+ Dprime = D/Dmax;
720
+
721
+ X /= chr_count; X2 /= chr_count;
722
+ Y /= chr_count; Y2 /= chr_count;
723
+ XY /= chr_count;
724
+
725
+ var1 = X2 - X*X;
726
+ var2 = Y2 - Y*Y;
727
+ cov12 = XY - X*Y;
728
+
729
+ r2 = cov12 * cov12 / (var1 * var2);
730
+
731
+ if (min_r2 > 0)
732
+ if ((r2 < min_r2) | (r2 != r2))
733
+ continue;
734
+
735
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << chr_count << "\t" << r2 << "\t" << D << "\t" << Dprime << "\t" << endl;
736
+ }
737
+ }
738
+ out.close();
739
+ }
740
+
741
+ void vcf_file::output_genotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2)
742
+ {
743
+ // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared
744
+ // correlation coefficient between genotypes numbered as 0, 1, 2.
745
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
746
+ error("Require Genotypes in VCF file in order to output LD Statistics.");
747
+
748
+ unsigned int s, s2;
749
+ unsigned int ui;
750
+
751
+ printLOG("Outputting Pairwise LD (bi-allelic only)\n");
752
+ string output = output_file_prefix + ".geno.ld";
753
+ ofstream out(output.c_str());
754
+ if (!out.is_open())
755
+ error("Could not open LD Output File: " + output, 3);
756
+
757
+ out << "CHR\tPOS1\tPOS2\tN_INDV\tR^2" << endl;
758
+
759
+ unsigned int indv_count;
760
+ double r2;
761
+ int sx, sy;
762
+ double X, X2, Y, Y2, XY;
763
+ double var1, var2, cov12;
764
+ pair<int,int> geno1, geno2;
765
+ string vcf_line, vcf_line2;
766
+ vcf_entry e(N_indv), e2(N_indv);
767
+ for (s=0; s<(N_entries-1); s++)
768
+ {
769
+ if (include_entry[s] == false)
770
+ continue;
771
+
772
+ get_vcf_entry(s, vcf_line);
773
+ e.reset(vcf_line);
774
+ e.parse_basic_entry(true);
775
+
776
+ if (e.get_N_alleles() != 2)
777
+ {
778
+ one_off_warning("\tgenoLD: Only using biallelic SNPs.");
779
+ continue; // Isn't biallelic
780
+ }
781
+
782
+ for (s2 = s+1; s2<N_entries; s2++)
783
+ {
784
+ if (include_entry[s2] == false)
785
+ continue;
786
+
787
+ if (int(s2 - s) > snp_window_size)
788
+ {
789
+ s2 = N_entries; // SNPs sorted, so no need to go any further
790
+ continue;
791
+ }
792
+
793
+ get_vcf_entry(s2, vcf_line2);
794
+ e2.reset(vcf_line2);
795
+ e2.parse_basic_entry(true);
796
+
797
+ if (e2.get_N_alleles() != 2)
798
+ {
799
+ one_off_warning("\tgenoLD: Only using biallelic SNPs.");
800
+ continue; // Isn't biallelic
801
+ }
802
+
803
+ if (e.get_CHROM() != e2.get_CHROM())
804
+ {
805
+ s2 = N_entries; // SNPs sorted, so no need to go any further
806
+ continue;
807
+ }
808
+
809
+ if ((e2.get_POS() - e.get_POS()) > bp_window_size)
810
+ {
811
+ s2 = N_entries; // SNPs sorted, so no need to go any further
812
+ continue;
813
+ }
814
+
815
+ X=0, X2=0; Y=0; Y2=0; XY=0;
816
+ indv_count = 0;
817
+ for (ui=0; ui<N_indv; ui++)
818
+ {
819
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
820
+ continue;
821
+
822
+ e.parse_genotype_entry(ui, true);
823
+ e.get_indv_GENOTYPE_ids(ui, geno1);
824
+
825
+ e2.parse_genotype_entry(ui, true);
826
+ e2.get_indv_GENOTYPE_ids(ui, geno2);
827
+
828
+ if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
829
+ {
830
+ one_off_warning("\tgenoLD: Only using diploid individuals.");
831
+ continue;
832
+ }
833
+
834
+ if ((geno1.first == -1) || (geno1.second == -1))
835
+ continue;
836
+
837
+ if ((geno2.first == -1) || (geno2.second == -1))
838
+ continue;
839
+
840
+ sx=0, sy=0;
841
+ if (geno1.first == geno1.second)
842
+ {
843
+ if (geno1.first == 0)
844
+ {
845
+ sx = 2;
846
+ }
847
+ }
848
+ else
849
+ sx = 1;
850
+
851
+ if (geno2.first == geno2.second)
852
+ {
853
+ if (geno2.first == 0)
854
+ {
855
+ sy = 2;
856
+ }
857
+ }
858
+ else
859
+ sy = 1;
860
+
861
+ X += sx; Y += sy;
862
+ XY += sx*sy;
863
+ sx *= sx; sy *= sy;
864
+ X2 += sx; Y2 += sy;
865
+
866
+ indv_count++;
867
+ }
868
+
869
+ X /= indv_count; X2 /= indv_count;
870
+ Y /= indv_count; Y2 /= indv_count;
871
+ XY /= indv_count;
872
+
873
+ var1 = X2 - X*X;
874
+ var2 = Y2 - Y*Y;
875
+ cov12 = XY - X*Y;
876
+
877
+ r2 = cov12 * cov12 / (var1 * var2);
878
+
879
+ if (min_r2 > 0)
880
+ if ((r2 < min_r2) | (r2 != r2))
881
+ continue;
882
+
883
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl;
884
+ }
885
+ }
886
+ out.close();
887
+ }
888
+
889
+ // TODO - provide similar function for haplotype r2.
890
+ void vcf_file::output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2)
891
+ {
892
+ // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared
893
+ // correlation coefficient between genotypes numbered as 0, 1, 2.
894
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
895
+ error("Require Genotypes in VCF file in order to output LD Statistics.");
896
+
897
+ unsigned int s, s2;
898
+ unsigned int ui;
899
+
900
+ printLOG("Outputting Interchromosomal Pairwise LD (bi-allelic only)\n");
901
+ string output = output_file_prefix + ".interchrom.geno.ld";
902
+ ofstream out(output.c_str());
903
+ if (!out.is_open())
904
+ error("Could not open LD Output File: " + output, 3);
905
+
906
+ out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl;
907
+
908
+ unsigned int indv_count;
909
+ double r2;
910
+ int sx, sy;
911
+ double X, X2, Y, Y2, XY;
912
+ double var1, var2, cov12;
913
+ pair<int,int> geno1, geno2;
914
+ string vcf_line, vcf_line2;
915
+ vcf_entry e(N_indv), e2(N_indv);
916
+ for (s=0; s<(N_entries-1); s++)
917
+ {
918
+ if (include_entry[s] == false)
919
+ continue;
920
+
921
+ get_vcf_entry(s, vcf_line);
922
+ e.reset(vcf_line);
923
+ e.parse_basic_entry(true);
924
+
925
+ if (e.get_N_alleles() != 2)
926
+ {
927
+ one_off_warning("\tinterchromLD: Only using biallelic SNPs.");
928
+ continue; // Isn't biallelic
929
+ }
930
+
931
+ for (s2 = s+1; s2<N_entries; s2++)
932
+ {
933
+ if (include_entry[s2] == false)
934
+ continue;
935
+
936
+ get_vcf_entry(s2, vcf_line2);
937
+ e2.reset(vcf_line2);
938
+ e2.parse_basic_entry(true);
939
+
940
+ if (e2.get_N_alleles() != 2)
941
+ {
942
+ one_off_warning("\tinterchromLD: Only using biallelic SNPs.");
943
+ continue; // Isn't biallelic
944
+ }
945
+
946
+ if (e.get_CHROM() == e2.get_CHROM())
947
+ {
948
+ continue;
949
+ }
950
+
951
+ X=0, X2=0; Y=0; Y2=0; XY=0;
952
+ indv_count = 0;
953
+ for (ui=0; ui<N_indv; ui++)
954
+ {
955
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (include_genotype[s2][ui] == false))
956
+ continue;
957
+
958
+ e.parse_genotype_entry(ui, true);
959
+ e.get_indv_GENOTYPE_ids(ui, geno1);
960
+
961
+ e2.parse_genotype_entry(ui, true);
962
+ e2.get_indv_GENOTYPE_ids(ui, geno2);
963
+
964
+ if ((e.get_indv_ploidy(ui) != 2) || (e2.get_indv_ploidy(ui) != 2))
965
+ {
966
+ one_off_warning("\tinterchromLD: Only using diploid individuals.");
967
+ continue;
968
+ }
969
+
970
+ if ((geno1.first == -1) || (geno1.second == -1))
971
+ continue;
972
+
973
+ if ((geno2.first == -1) || (geno2.second == -1))
974
+ continue;
975
+
976
+ sx=0, sy=0;
977
+ if (geno1.first == geno1.second)
978
+ {
979
+ if (geno1.first == 0)
980
+ {
981
+ sx = 2;
982
+ }
983
+ }
984
+ else
985
+ sx = 1;
986
+
987
+ if (geno2.first == geno2.second)
988
+ {
989
+ if (geno2.first == 0)
990
+ {
991
+ sy = 2;
992
+ }
993
+ }
994
+ else
995
+ sy = 1;
996
+
997
+ X += sx; Y += sy;
998
+ XY += sx*sy;
999
+ sx *= sx; sy *= sy;
1000
+ X2 += sx; Y2 += sy;
1001
+
1002
+ indv_count++;
1003
+ }
1004
+
1005
+ X /= indv_count; X2 /= indv_count;
1006
+ Y /= indv_count; Y2 /= indv_count;
1007
+ XY /= indv_count;
1008
+
1009
+ var1 = X2 - X*X;
1010
+ var2 = Y2 - Y*Y;
1011
+ cov12 = XY - X*Y;
1012
+
1013
+ r2 = cov12 * cov12 / (var1 * var2);
1014
+
1015
+ if (min_r2 > 0)
1016
+ if ((r2 < min_r2) | (r2 != r2))
1017
+ continue;
1018
+
1019
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_CHROM() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl;
1020
+ }
1021
+ }
1022
+ out.close();
1023
+ }
1024
+
1025
+ void vcf_file::output_singletons(const string &output_file_prefix)
1026
+ {
1027
+ // Locate and output singletons (and private doubletons)
1028
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1029
+ error("Require Genotypes in VCF file in order to output Singletons.");
1030
+
1031
+ printLOG("Outputting Singleton Locations\n");
1032
+ string output = output_file_prefix + ".singletons";
1033
+ ofstream out(output.c_str());
1034
+ if (!out.is_open())
1035
+ error("Could not open Singleton Output File: " + output, 3);
1036
+
1037
+ out << "CHROM\tPOS\tSINGLETON/DOUBLETON\tALLELE\tINDV" << endl;
1038
+
1039
+ unsigned int ui;
1040
+ int a;
1041
+ vector<int> allele_counts;
1042
+ unsigned int N_non_missing_chr;
1043
+ unsigned int N_alleles;
1044
+ pair<int, int> geno;
1045
+ string allele;
1046
+ string vcf_line;
1047
+ vcf_entry e(N_indv);
1048
+ for (unsigned int s=0; s<N_entries; s++)
1049
+ {
1050
+ if (include_entry[s] == false)
1051
+ continue;
1052
+
1053
+ get_vcf_entry(s, vcf_line);
1054
+ e.reset(vcf_line);
1055
+ e.parse_basic_entry(true);
1056
+ e.parse_genotype_entries(true);
1057
+
1058
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
1059
+ N_alleles = e.get_N_alleles();
1060
+
1061
+ for (a=0; a<(signed)N_alleles; a++)
1062
+ {
1063
+ if (allele_counts[a] == 1)
1064
+ { // Singleton
1065
+ for (ui=0; ui<N_indv; ui++)
1066
+ {
1067
+ if (include_indv[ui] == false)
1068
+ continue;
1069
+ e.get_indv_GENOTYPE_ids(ui, geno);
1070
+ if ((geno.first == a) || (geno.second == a))
1071
+ {
1072
+ e.get_allele(a, allele);
1073
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\tS\t" << allele << "\t" << indv[ui] << endl;
1074
+ ui=N_indv;
1075
+ break;
1076
+ }
1077
+ }
1078
+ }
1079
+ else if (allele_counts[a] == 2)
1080
+ { // Possible doubleton
1081
+ for (ui=0; ui<N_indv; ui++)
1082
+ {
1083
+ if (include_indv[ui] == false)
1084
+ continue;
1085
+ e.get_indv_GENOTYPE_ids(ui, geno);
1086
+ if ((geno.first == a) && (geno.second == a))
1087
+ {
1088
+ e.get_allele(a, allele);
1089
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\tD\t" << allele << "\t" << indv[ui] << endl;
1090
+ ui=N_indv;
1091
+ break;
1092
+ }
1093
+ }
1094
+ }
1095
+ }
1096
+ }
1097
+
1098
+ out.close();
1099
+ }
1100
+
1101
+ void vcf_file::output_genotype_depth(const string &output_file_prefix)
1102
+ {
1103
+ // Output genotype depth in tab-delimited format.
1104
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1105
+ error("Require Genotypes in VCF file in order to output Genotype Depth Statistics.");
1106
+
1107
+ printLOG("Outputting Depth for Each Genotype\n");
1108
+ string output = output_file_prefix + ".gdepth";
1109
+ ofstream out(output.c_str());
1110
+ if (!out.is_open())
1111
+ error("Could not open Genotype Depth Output File: " + output, 7);
1112
+
1113
+ out << "CHROM\tPOS";
1114
+ for (unsigned int ui=0; ui<N_indv; ui++)
1115
+ {
1116
+ if (include_indv[ui] == false)
1117
+ continue;
1118
+
1119
+ out << "\t" << indv[ui];
1120
+ }
1121
+ out << endl;
1122
+
1123
+ string vcf_line;
1124
+ vcf_entry e(N_indv);
1125
+ for (unsigned int s=0; s<N_entries; s++)
1126
+ {
1127
+ if (include_entry[s] == false)
1128
+ continue;
1129
+
1130
+ get_vcf_entry(s, vcf_line);
1131
+ e.reset(vcf_line);
1132
+ e.parse_basic_entry();
1133
+
1134
+ out << e.get_CHROM() << "\t" << e.get_POS();
1135
+
1136
+ for (unsigned int ui=0; ui<N_indv; ui++)
1137
+ {
1138
+ if (include_indv[ui] == false)
1139
+ continue;
1140
+
1141
+ if (include_genotype[s][ui] == true)
1142
+ {
1143
+ e.parse_genotype_entry(ui, false, false, true);
1144
+ out << "\t" << e.get_indv_DEPTH(ui);
1145
+ }
1146
+ else
1147
+ out << "\t-1";
1148
+ }
1149
+ out << endl;
1150
+ }
1151
+ out.close();
1152
+ }
1153
+
1154
+ void vcf_file::output_FILTER_summary(const string &output_file_prefix)
1155
+ {
1156
+ // Output a summary of sites in various FILTER categories.
1157
+ printLOG("Outputting Filter Summary (for bi-allelic loci only)\n");
1158
+
1159
+ map<string, unsigned int> model_to_idx;
1160
+ model_to_idx["AC"] = 0;
1161
+ model_to_idx["AG"] = 1;
1162
+ model_to_idx["AT"] = 2;
1163
+ model_to_idx["CG"] = 3;
1164
+ model_to_idx["CT"] = 4;
1165
+ model_to_idx["GT"] = 5;
1166
+ string FILTER;
1167
+ string vcf_line;
1168
+ vcf_entry e(N_indv);
1169
+
1170
+ map<string, pair<int, int> > FILTER_to_TsTv;
1171
+ map<string, int > FILTER_to_Nsites;
1172
+ map<string, int >::iterator FILTER_to_Nsites_it;
1173
+ for (unsigned int s=0; s<N_entries; s++)
1174
+ {
1175
+ if (include_entry[s] == false)
1176
+ continue;
1177
+
1178
+ get_vcf_entry(s, vcf_line);
1179
+ e.reset(vcf_line);
1180
+ e.parse_basic_entry(true, true);
1181
+
1182
+ string model = e.get_REF() + e.get_ALT_allele(0);
1183
+ sort(model.begin(), model.end());
1184
+
1185
+ FILTER = e.get_FILTER();
1186
+ FILTER_to_Nsites[FILTER]++;
1187
+ if (model_to_idx.find(model) != model_to_idx.end())
1188
+ {
1189
+ switch (model_to_idx[model])
1190
+ {
1191
+ case 1:
1192
+ case 4:
1193
+ FILTER_to_TsTv[FILTER].first++;
1194
+ break;
1195
+ case 0:
1196
+ case 2:
1197
+ case 3:
1198
+ case 5:
1199
+ FILTER_to_TsTv[FILTER].second++;
1200
+ break;
1201
+ default:
1202
+ // Don't count this snp towards Ts/Tv
1203
+ break;
1204
+ }
1205
+ }
1206
+ }
1207
+
1208
+ vector<pair<int, string > > count_to_FILTER;
1209
+ for ( FILTER_to_Nsites_it=FILTER_to_Nsites.begin() ; FILTER_to_Nsites_it != FILTER_to_Nsites.end(); ++FILTER_to_Nsites_it )
1210
+ {
1211
+ FILTER = (*FILTER_to_Nsites_it).first;
1212
+ int Nsites = (*FILTER_to_Nsites_it).second;
1213
+
1214
+ count_to_FILTER.push_back(make_pair(Nsites, FILTER));
1215
+ }
1216
+
1217
+ sort(count_to_FILTER.begin(), count_to_FILTER.end());
1218
+
1219
+ string output = output_file_prefix + ".FILTER.summary";
1220
+ ofstream out(output.c_str());
1221
+ if (!out.is_open())
1222
+ error("Could not open Filter Summary Output File: " + output, 7);
1223
+
1224
+ out << "FILTER\tN_SNPs\tN_Ts\tN_Tv\tTs/Tv" << endl;
1225
+
1226
+ for (int i=count_to_FILTER.size()-1; i > -1; i--)
1227
+ {
1228
+ FILTER = count_to_FILTER[i].second;
1229
+ int Ts = FILTER_to_TsTv[FILTER].first;
1230
+ int Tv = FILTER_to_TsTv[FILTER].second;
1231
+ int Nsites = FILTER_to_Nsites[FILTER];
1232
+ out << FILTER << "\t" << Nsites << "\t";
1233
+ out << Ts << "\t" << Tv << "\t" << double(Ts)/Tv << endl;
1234
+ }
1235
+
1236
+ out.close();
1237
+ }
1238
+
1239
+ void vcf_file::output_TsTv(const string &output_file_prefix, int bin_size)
1240
+ {
1241
+ // Output Ts/Tv ratios in bins of a given size.
1242
+ printLOG("Outputting Ts/Tv in bins of " + int2str(bin_size) + "bp\n");
1243
+
1244
+ map<string, unsigned int> model_to_idx;
1245
+ model_to_idx["AC"] = 0;
1246
+ model_to_idx["AG"] = 1;
1247
+ model_to_idx["AT"] = 2;
1248
+ model_to_idx["CG"] = 3;
1249
+ model_to_idx["CT"] = 4;
1250
+ model_to_idx["GT"] = 5;
1251
+
1252
+ map<string, int> max_pos;
1253
+ string vcf_line, CHROM;
1254
+ vcf_entry e(N_indv);
1255
+ for (unsigned int s=0; s<N_entries; s++)
1256
+ {
1257
+ if (include_entry[s] == true)
1258
+ {
1259
+ get_vcf_entry(s, vcf_line);
1260
+ e.reset(vcf_line);
1261
+ e.parse_basic_entry();
1262
+
1263
+ CHROM = e.get_CHROM();
1264
+
1265
+ if (max_pos.find(CHROM) != max_pos.end())
1266
+ {
1267
+ if (e.get_POS() > max_pos[CHROM])
1268
+ max_pos[CHROM] = e.get_POS();
1269
+ }
1270
+ else
1271
+ max_pos[CHROM] = e.get_POS();
1272
+ }
1273
+ }
1274
+
1275
+ map<string, int>::iterator it;
1276
+
1277
+ unsigned int N_bins;
1278
+ map<string, vector<int> > Ts_counts;
1279
+ map<string, vector<int> > Tv_counts;
1280
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
1281
+ {
1282
+ CHROM = (*it).first;
1283
+ N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size));
1284
+ Ts_counts[CHROM].resize(N_bins, 0);
1285
+ Tv_counts[CHROM].resize(N_bins, 0);
1286
+ }
1287
+
1288
+ vector<unsigned int> model_counts(6,0);
1289
+ double C = 1.0 / double(bin_size);
1290
+ unsigned int idx;
1291
+
1292
+ string model;
1293
+ for (unsigned int s=0; s<N_entries; s++)
1294
+ {
1295
+ if (include_entry[s] == false)
1296
+ continue;
1297
+
1298
+ get_vcf_entry(s, vcf_line);
1299
+ e.reset(vcf_line);
1300
+ e.parse_basic_entry(true);
1301
+
1302
+ if (!e.is_biallelic_SNP())
1303
+ continue;
1304
+
1305
+ model = e.get_REF() + e.get_ALT_allele(0);
1306
+ sort(model.begin(), model.end());
1307
+
1308
+ CHROM = e.get_CHROM();
1309
+ idx = (unsigned int)(e.get_POS() * C);
1310
+
1311
+ if (model_to_idx.find(model) != model_to_idx.end())
1312
+ {
1313
+ model_counts[model_to_idx[model]]++;
1314
+ switch (model_to_idx[model])
1315
+ {
1316
+ case 1:
1317
+ case 4:
1318
+ Ts_counts[CHROM][idx]++;
1319
+ break;
1320
+ case 0:
1321
+ case 2:
1322
+ case 3:
1323
+ case 5:
1324
+ Tv_counts[CHROM][idx]++;
1325
+ break;
1326
+ default:
1327
+ error("Unknown idx\n");
1328
+ }
1329
+ }
1330
+ else
1331
+ warning("Unknown model type. Not a SNP? " + CHROM + ":" + int2str(e.get_POS()) +"\n");
1332
+ }
1333
+
1334
+ string output = output_file_prefix + ".TsTv";
1335
+ ofstream out(output.c_str());
1336
+ if (!out.is_open())
1337
+ error("Could not open TsTv Output File: " + output, 7);
1338
+
1339
+ out << "CHROM\tBinStart\tSNP_count\tTs/Tv" << endl;
1340
+ double ratio;
1341
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
1342
+ {
1343
+ CHROM = (*it).first;
1344
+ for (unsigned int s=0; s<Ts_counts[CHROM].size(); s++)
1345
+ {
1346
+ ratio = 0.0;
1347
+ if (Tv_counts[CHROM][s] != 0)
1348
+ ratio = double(Ts_counts[CHROM][s]) / Tv_counts[CHROM][s];
1349
+ out << CHROM << "\t" << s*bin_size << "\t" << Ts_counts[CHROM][s]+Tv_counts[CHROM][s] << "\t" << ratio << endl;
1350
+ }
1351
+ }
1352
+ out.close();
1353
+
1354
+ output = output_file_prefix + ".TsTv.summary";
1355
+ out.open(output.c_str());
1356
+ if (!out.is_open())
1357
+ error("Could not open TsTv Summary Output File: " + output, 7);
1358
+
1359
+ out << "MODEL\tCOUNT" << endl;
1360
+ out << "AC\t" << model_counts[0] << endl;
1361
+ out << "AG\t" << model_counts[1] << endl;
1362
+ out << "AT\t" << model_counts[2] << endl;
1363
+ out << "CG\t" << model_counts[3] << endl;
1364
+ out << "CT\t" << model_counts[4] << endl;
1365
+ out << "GT\t" << model_counts[5] << endl;
1366
+ unsigned int Ts = model_counts[1] + model_counts[4];
1367
+ unsigned int Tv = model_counts[0] + model_counts[2] + model_counts[3] + model_counts[5];
1368
+ out << "Ts\t" << Ts << endl;
1369
+ out << "Tv\t" << Tv << endl;
1370
+
1371
+ printLOG("Ts/Tv ratio: " + dbl2str(double(Ts)/Tv, 4) + "\n");
1372
+
1373
+ out.close();
1374
+ }
1375
+
1376
+ void vcf_file::output_TsTv_by_count(const string &output_file_prefix)
1377
+ {
1378
+ // Output Ts/Tv ratios in bins of a given size.
1379
+ printLOG("Outputting Ts/Tv by Alternative Allele Count\n");
1380
+ vector<unsigned int> Ts_counts, Tv_counts;
1381
+ unsigned int N_kept_indv = N_kept_individuals();
1382
+ Ts_counts.resize(2*N_kept_indv);
1383
+ Tv_counts.resize(2*N_kept_indv);
1384
+
1385
+ string vcf_line, model;
1386
+ vcf_entry e(N_indv);
1387
+ map<string, unsigned int> model_to_Ts_or_Tv;
1388
+ model_to_Ts_or_Tv["AC"] = 1;
1389
+ model_to_Ts_or_Tv["CA"] = 1;
1390
+ model_to_Ts_or_Tv["AG"] = 0; // Ts
1391
+ model_to_Ts_or_Tv["GA"] = 0; // Ts
1392
+ model_to_Ts_or_Tv["AT"] = 1;
1393
+ model_to_Ts_or_Tv["TA"] = 1;
1394
+ model_to_Ts_or_Tv["CG"] = 1;
1395
+ model_to_Ts_or_Tv["GC"] = 1;
1396
+ model_to_Ts_or_Tv["CT"] = 0; // Ts
1397
+ model_to_Ts_or_Tv["TC"] = 0; // Ts
1398
+ model_to_Ts_or_Tv["GT"] = 1;
1399
+ model_to_Ts_or_Tv["TG"] = 1;
1400
+ unsigned int idx;
1401
+ vector<int> allele_counts;
1402
+ unsigned int allele_count;
1403
+ unsigned int N_included_indv;
1404
+ for (unsigned int s=0; s<N_entries; s++)
1405
+ {
1406
+ if (include_entry[s] == true)
1407
+ {
1408
+ get_vcf_entry(s, vcf_line);
1409
+ e.reset(vcf_line);
1410
+ e.parse_basic_entry(true);
1411
+
1412
+ if (!e.is_biallelic_SNP())
1413
+ continue;
1414
+
1415
+ e.parse_genotype_entries(true);
1416
+ e.get_allele_counts(allele_counts, N_included_indv, include_indv, include_genotype[s]);
1417
+ allele_count = allele_counts[1];
1418
+
1419
+ model = e.get_REF() + e.get_ALT_allele(0);
1420
+ if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end())
1421
+ {
1422
+ idx = model_to_Ts_or_Tv[model];
1423
+ if (idx == 0) // Ts
1424
+ Ts_counts[allele_count]++;
1425
+ else if (idx == 1) // Tv;
1426
+ Tv_counts[allele_count]++;
1427
+ else
1428
+ error("Unknown model type\n");
1429
+ }
1430
+ else
1431
+ warning("Unknown model type. Not a SNP? " + e.get_CHROM() + ":" + int2str(e.get_POS()) +"\n");
1432
+ }
1433
+ }
1434
+
1435
+ string output = output_file_prefix + ".TsTv.count";
1436
+ ofstream out(output.c_str());
1437
+ if (!out.is_open())
1438
+ error("Could not open TsTv by Count Output File: " + output, 7);
1439
+
1440
+ double ratio;
1441
+ out << "ALT_ALLELE_COUNT\tN_Ts\tN_Tv\tTs/Tv" << endl;
1442
+ for (unsigned int ui=0; ui<2*N_kept_indv; ui++)
1443
+ {
1444
+ ratio = double(Ts_counts[ui]) / Tv_counts[ui];
1445
+ out << ui << "\t" << Ts_counts[ui] << "\t" << Tv_counts[ui] << "\t" << ratio << endl;
1446
+ }
1447
+ out.close();
1448
+ }
1449
+
1450
+ void vcf_file::output_TsTv_by_quality(const string &output_file_prefix)
1451
+ {
1452
+ // Output Ts/Tv ratios in bins of a given size.
1453
+ printLOG("Outputting Ts/Tv By Quality\n");
1454
+ map<double, pair<unsigned int, unsigned int> > TsTv_counts;
1455
+ double max_qual = -numeric_limits<double>::max(), min_qual=numeric_limits<double>::max();
1456
+
1457
+ string vcf_line, model;
1458
+ vcf_entry e(N_indv);
1459
+ map<string, unsigned int> model_to_Ts_or_Tv;
1460
+ model_to_Ts_or_Tv["AC"] = 1;
1461
+ model_to_Ts_or_Tv["CA"] = 1;
1462
+ model_to_Ts_or_Tv["AG"] = 0; // Ts
1463
+ model_to_Ts_or_Tv["GA"] = 0; // Ts
1464
+ model_to_Ts_or_Tv["AT"] = 1;
1465
+ model_to_Ts_or_Tv["TA"] = 1;
1466
+ model_to_Ts_or_Tv["CG"] = 1;
1467
+ model_to_Ts_or_Tv["GC"] = 1;
1468
+ model_to_Ts_or_Tv["CT"] = 0; // Ts
1469
+ model_to_Ts_or_Tv["TC"] = 0; // Ts
1470
+ model_to_Ts_or_Tv["GT"] = 1;
1471
+ model_to_Ts_or_Tv["TG"] = 1;
1472
+ unsigned int idx;
1473
+ double QUAL;
1474
+ for (unsigned int s=0; s<N_entries; s++)
1475
+ {
1476
+ if (include_entry[s] == true)
1477
+ {
1478
+ get_vcf_entry(s, vcf_line);
1479
+ e.reset(vcf_line);
1480
+ e.parse_basic_entry(true);
1481
+
1482
+ if (!e.is_biallelic_SNP())
1483
+ continue;
1484
+
1485
+ QUAL = e.get_QUAL();
1486
+ if (QUAL > max_qual)
1487
+ max_qual = QUAL;
1488
+ if (QUAL < min_qual)
1489
+ min_qual = QUAL;
1490
+
1491
+ model = e.get_REF() + e.get_ALT_allele(0);
1492
+ if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end())
1493
+ {
1494
+ idx = model_to_Ts_or_Tv[model];
1495
+ if (idx == 0) // Ts
1496
+ {
1497
+ TsTv_counts[QUAL].first++;
1498
+ }
1499
+ else if (idx == 1) // Tv;
1500
+ TsTv_counts[QUAL].second++;
1501
+ else
1502
+ error("Unknown model type\n");
1503
+ }
1504
+ else
1505
+ warning("Unknown model type. Not a SNP? " + e.get_CHROM() + ":" + int2str(e.get_POS()) +"\n");
1506
+ }
1507
+ }
1508
+
1509
+ string output = output_file_prefix + ".TsTv.qual";
1510
+ ofstream out(output.c_str());
1511
+ if (!out.is_open())
1512
+ error("Could not open TsTv by Count Output File: " + output, 7);
1513
+
1514
+ out << "QUAL_THRESHOLD";
1515
+ out << "\tN_Ts_LT_QUAL_THRESHOLD\tN_Tv_LT_QUAL_THRESHOLD\tTs/Tv_LT_QUAL_THRESHOLD";
1516
+ out << "\tN_Ts_GT_QUAL_THRESHOLD\tN_Tv_GT_QUAL_THRESHOLD\tTs/Tv_GT_QUAL_THRESHOLD" << endl;
1517
+
1518
+ unsigned int N_TsTv = TsTv_counts.size();
1519
+
1520
+ vector<double> Ts_sum_below(N_TsTv+1, 0.0), Tv_sum_below(N_TsTv+1, 0.0);
1521
+ vector<double> QUAL_vector(N_TsTv+1, 0.0);
1522
+ QUAL_vector[0] = min_qual;
1523
+ QUAL_vector[N_TsTv] = max_qual;
1524
+ idx = 1;
1525
+ for (map<double, pair<unsigned int, unsigned int> >::iterator it=TsTv_counts.begin(); it != TsTv_counts.end(); ++it)
1526
+ {
1527
+ QUAL = (it->first);
1528
+ double Ts = (it->second).first;
1529
+ double Tv = (it->second).second;
1530
+ Ts_sum_below[idx] = Ts_sum_below[idx-1]+Ts;
1531
+ Tv_sum_below[idx] = Tv_sum_below[idx-1]+Tv;
1532
+ QUAL_vector[idx-1] = QUAL;
1533
+ idx++;
1534
+ }
1535
+ QUAL_vector[N_TsTv] = max_qual;
1536
+
1537
+ vector<double> Ts_sum_above(N_TsTv+1, 0.0), Tv_sum_above(N_TsTv+1, 0.0);
1538
+ idx = N_TsTv;
1539
+ for (map<double, pair<unsigned int, unsigned int> >::reverse_iterator it=TsTv_counts.rbegin(); it != TsTv_counts.rend(); ++it)
1540
+ {
1541
+ QUAL = (it->first);
1542
+ double Ts = (it->second).first;
1543
+ double Tv = (it->second).second;
1544
+ Ts_sum_above[idx] = Ts_sum_above[idx+1]+Ts;
1545
+ Tv_sum_above[idx] = Tv_sum_above[idx+1]+Tv;
1546
+ idx--;
1547
+ }
1548
+
1549
+ double Ts_sum, Tv_sum, ratio;
1550
+ for (unsigned int ui=1; ui<(N_TsTv+1); ui++)
1551
+ {
1552
+ QUAL = QUAL_vector[ui-1];
1553
+ out << QUAL;
1554
+ Ts_sum = Ts_sum_below[ui-1]; Tv_sum = Tv_sum_below[ui-1];
1555
+ ratio = Ts_sum / Tv_sum;
1556
+ out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio;
1557
+ Ts_sum = Ts_sum_above[ui+1]; Tv_sum = Tv_sum_above[ui+1];
1558
+ ratio = Ts_sum / Tv_sum;
1559
+ out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio;
1560
+ out << endl;
1561
+ }
1562
+ out.close();
1563
+ }
1564
+
1565
+ void vcf_file::output_site_quality(const string &output_file_prefix)
1566
+ {
1567
+ // Output per-site quality information.
1568
+ printLOG("Outputting Quality for Each Site\n");
1569
+ string output = output_file_prefix + ".lqual";
1570
+
1571
+ ofstream out(output.c_str());
1572
+ if (!out.is_open())
1573
+ error("Could not open Site Depth Output File: " + output, 7);
1574
+
1575
+ out << "CHROM\tPOS\tQUAL" << endl;
1576
+
1577
+ string vcf_line;
1578
+ vcf_entry e(N_indv);
1579
+ for (unsigned int s=0; s<N_entries; s++)
1580
+ {
1581
+ if (include_entry[s] == false)
1582
+ continue;
1583
+
1584
+ get_vcf_entry(s, vcf_line);
1585
+ e.reset(vcf_line);
1586
+ e.parse_basic_entry();
1587
+
1588
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e.get_QUAL() << endl;
1589
+ }
1590
+ out.close();
1591
+ }
1592
+
1593
+ void vcf_file::output_site_depth(const string &output_file_prefix, bool output_mean)
1594
+ {
1595
+ // Output per-site depth information
1596
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1597
+ error("Require Genotypes in VCF file in order to output Site Depth Statistics.");
1598
+
1599
+ printLOG("Outputting Depth for Each Site\n");
1600
+ string output = output_file_prefix + ".ldepth";
1601
+ if (output_mean)
1602
+ output += ".mean";
1603
+ ofstream out(output.c_str());
1604
+ if (!out.is_open())
1605
+ error("Could not open Site Depth Output File: " + output, 7);
1606
+
1607
+ out << "CHROM\tPOS\t";
1608
+ if (output_mean)
1609
+ out << "MEAN_DEPTH\tVAR_DEPTH" << endl;
1610
+ else
1611
+ out << "SUM_DEPTH\tSUMSQ_DEPTH" << endl;
1612
+
1613
+ int depth;
1614
+ string vcf_line;
1615
+ vcf_entry e(N_indv);
1616
+ for (unsigned int s=0; s<N_entries; s++)
1617
+ {
1618
+ if (include_entry[s] == false)
1619
+ continue;
1620
+
1621
+ get_vcf_entry(s, vcf_line);
1622
+ e.reset(vcf_line);
1623
+ e.parse_basic_entry();
1624
+
1625
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t";
1626
+
1627
+ unsigned int sum=0;
1628
+ unsigned int sumsq=0;
1629
+ unsigned int n=0;
1630
+ for (unsigned int ui=0; ui<N_indv; ui++)
1631
+ {
1632
+ if (include_indv[ui] == false)
1633
+ continue;
1634
+ if (include_genotype[s][ui] == false)
1635
+ continue;
1636
+
1637
+ e.parse_genotype_entry(ui, false, false, true);
1638
+ depth = e.get_indv_DEPTH(ui);
1639
+ if (depth >= 0)
1640
+ {
1641
+ sum += depth;
1642
+ sumsq += (depth*depth);
1643
+ n++;
1644
+ }
1645
+ }
1646
+
1647
+ if (output_mean)
1648
+ {
1649
+ double mean = double(sum) / n;
1650
+ double var = ((double(sumsq) / n) - (mean*mean)) * double(n) / double(n-1);
1651
+ out << mean << "\t" << var << endl;
1652
+ }
1653
+ else
1654
+ out << sum << "\t" << sumsq << endl;
1655
+ }
1656
+ out.close();
1657
+ }
1658
+
1659
+ void vcf_file::output_fst(const string &output_file_prefix, vcf_file &vcf_fst)
1660
+ {
1661
+ // Calculate, and output, Fst using the formula outlined in HapMap I
1662
+ // Namely:
1663
+ // Fst = 1 - (Pi_within / Pi_combined)
1664
+ // where
1665
+ // Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2))
1666
+ // and
1667
+ // Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1))
1668
+ // where j is the population index, and i is the SNP index
1669
+ printLOG("Outputting Fst estimates (for bi-allelic only)\n");
1670
+
1671
+ string output = output_file_prefix + ".fst";
1672
+ ofstream out(output.c_str());
1673
+ if (!out.is_open())
1674
+ error("Could not open Fst Output File: " + output, 7);
1675
+
1676
+ out << "CHROM\tPOS\tFST" << endl;
1677
+
1678
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
1679
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
1680
+
1681
+ return_site_union(vcf_fst, CHROMPOS_to_filepos_pair);
1682
+
1683
+ string vcf_line;
1684
+
1685
+ int n_1, n_2, n_1_choose_2 = 0, n_2_choose_2=0;
1686
+ int last_n_1=-1, last_n_2=-1;
1687
+
1688
+ unsigned int n_i1, n_i2, n_iT;
1689
+ int N_alleles1, N_alleles2;
1690
+ vector<int> allele_counts1, allele_counts2;
1691
+ double x_i1, x_i2, x_iT;
1692
+ int POS;
1693
+ int s1, s2;
1694
+
1695
+ double tmp1, tmp2, tmpT;
1696
+ double sum1=0.0, sum2=0.0, sumT=0.0;
1697
+ double Fst;
1698
+ string CHROM;
1699
+
1700
+ unsigned int N_intersecting_sites = 0;
1701
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
1702
+ {
1703
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
1704
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
1705
+
1706
+ if ((s1 == -1) || (s2 == -1))
1707
+ continue;
1708
+
1709
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
1710
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
1711
+
1712
+ get_vcf_entry(s1, vcf_line);
1713
+ vcf_entry e1(N_indv, vcf_line);
1714
+ vcf_fst.get_vcf_entry(s2, vcf_line);
1715
+ vcf_entry e2(vcf_fst.N_indv, vcf_line);
1716
+
1717
+ e1.parse_basic_entry(true);
1718
+ e2.parse_basic_entry(true);
1719
+
1720
+ // Check sites have same alternative alleles
1721
+ N_alleles1 = e1.get_N_alleles();
1722
+ N_alleles2 = e2.get_N_alleles();
1723
+
1724
+ if ((N_alleles1 != 2) || (N_alleles2 != 2))
1725
+ {
1726
+ one_off_warning("\tFst: Only using biallelic SNPs.");
1727
+ continue;
1728
+ }
1729
+
1730
+ if ((N_alleles1 == 2) && (N_alleles2 == 2))
1731
+ if (e1.get_ALT_allele(0) != e2.get_ALT_allele(0))
1732
+ {
1733
+ one_off_warning("\tFst: Only using sites with matching reference alleles.");
1734
+ continue;
1735
+ }
1736
+
1737
+ e1.parse_genotype_entries(true);
1738
+ e2.parse_genotype_entries(true);
1739
+
1740
+ // Calculate allele frequencies
1741
+ e1.get_allele_counts(allele_counts1, n_i1, include_indv, include_genotype[s1]);
1742
+ e2.get_allele_counts(allele_counts2, n_i2, vcf_fst.include_indv, vcf_fst.include_genotype[s2]);
1743
+
1744
+ if ((n_i1 == 0) || (n_i2 == 0))
1745
+ continue;
1746
+
1747
+ n_1 = e1.get_N_chr(include_indv, include_genotype[s1]);
1748
+ n_2 = e2.get_N_chr(vcf_fst.include_indv, vcf_fst.include_genotype[s2]);
1749
+
1750
+ if (last_n_1 != -1)
1751
+ {
1752
+ if ((n_1 != last_n_1) || (n_2 != last_n_2))
1753
+ {
1754
+ error("Cannot mix sites with different ploidy. Are you including sex-chromosomes?\n"+CHROM+":"+int2str(POS)+"\n");
1755
+ }
1756
+ }
1757
+ else
1758
+ {
1759
+ last_n_1 = n_1;
1760
+ last_n_2 = n_2;
1761
+ }
1762
+
1763
+ n_1_choose_2 = n_1 * (n_1 - 1) / 2;
1764
+ n_2_choose_2 = n_2 * (n_2 - 1) / 2;
1765
+
1766
+ N_intersecting_sites++;
1767
+
1768
+ x_i1 = allele_counts1[0] / double(n_i1);
1769
+ x_i2 = allele_counts2[0] / double(n_i2);
1770
+ n_iT = (n_i1 + n_i2);
1771
+ x_iT = (allele_counts1[0] + allele_counts2[0]) / double(n_iT);
1772
+
1773
+ tmp1 = 2 * (n_i1 / (n_i1 - 1.0)) * x_i1 * (1-x_i1);
1774
+ tmp2 = 2 * (n_i2 / (n_i2 - 1.0)) * x_i2 * (1-x_i2);
1775
+ tmpT = 2 * (n_iT / (n_iT - 1.0)) * x_iT * (1-x_iT);
1776
+
1777
+ Fst = 1.0 - (((n_1_choose_2 * tmp1) + (n_2_choose_2 * tmp2)) / (n_1_choose_2 + n_2_choose_2) / tmpT);
1778
+
1779
+ out << CHROM << "\t" << POS << "\t" << Fst << endl;
1780
+
1781
+ sum1 += tmp1;
1782
+ sum2 += tmp2;
1783
+ sumT += tmpT;
1784
+
1785
+ last_n_1 = n_1; last_n_2 = n_2;
1786
+ }
1787
+
1788
+ Fst = 1.0 - (((n_1_choose_2 * sum1) + (n_2_choose_2 * sum2)) / (n_1_choose_2 + n_2_choose_2) / sumT);
1789
+
1790
+ printLOG("Found " + int2str(N_intersecting_sites) + " intersecting sites\n");
1791
+ printLOG("Fst = " + dbl2str(Fst, 6) + "\n");
1792
+
1793
+ out.close();
1794
+ }
1795
+
1796
+
1797
+ void vcf_file::output_fst_version_2(const string &output_file_prefix, const vector<string> &indv_files)
1798
+ {
1799
+ // Calculate Fst using individuals in one (rather than two VCF files)
1800
+ // Calculate, and output, Fst using the formula outlined in HapMap I
1801
+ // Namely:
1802
+ // Fst = 1 - (Pi_within / Pi_combined)
1803
+ // where
1804
+ // Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2))
1805
+ // and
1806
+ // Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1))
1807
+ // where j is the population index, and i is the SNP index
1808
+
1809
+ if (indv_files.size() == 1)
1810
+ {
1811
+ printLOG("Require at least two populations to estimate Fst. Skipping\n");
1812
+ return;
1813
+ }
1814
+
1815
+ printLOG("Outputting Fst estimates.\n");
1816
+
1817
+ // First, read in the relevant files.
1818
+ vector< vector<bool> > indvs_in_pops;
1819
+ unsigned int N_pops = indv_files.size();
1820
+ indvs_in_pops.resize(N_pops, vector<bool>(N_indv, false));
1821
+ vector<bool> all_indv(N_indv,false);
1822
+ map<string, int> indv_to_idx;
1823
+ for (unsigned int ui=0; ui<N_indv; ui++)
1824
+ if (include_indv[ui] == true)
1825
+ indv_to_idx[indv[ui]] = ui;
1826
+ for (unsigned int ui=0; ui<N_pops; ui++)
1827
+ {
1828
+ ifstream indv_file(indv_files[ui].c_str());
1829
+ if (!indv_file.is_open())
1830
+ error("Could not open Individual file: " + indv_files[ui]);
1831
+ string line;
1832
+ string tmp_indv;
1833
+ stringstream ss;
1834
+ while (!indv_file.eof())
1835
+ {
1836
+ getline(indv_file, line);
1837
+ ss.str(line);
1838
+ ss >> tmp_indv;
1839
+ if (indv_to_idx.find(tmp_indv) != indv_to_idx.end())
1840
+ {
1841
+ indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true;
1842
+ all_indv[indv_to_idx[tmp_indv]]=true;
1843
+ }
1844
+ ss.clear();
1845
+ }
1846
+ indv_file.close();
1847
+ }
1848
+
1849
+ string output = output_file_prefix + ".fst";
1850
+ ofstream out(output.c_str());
1851
+ if (!out.is_open())
1852
+ error("Could not open Fst Output File: " + output, 7);
1853
+
1854
+ out << "CHROM\tPOS\tFST" << endl;
1855
+
1856
+ vcf_entry e(N_indv);
1857
+ string vcf_line;
1858
+ vector<int> allele_counts1;
1859
+ double Fst_tot_num=0.0, Fst_tot_denom=0.0;
1860
+ for (unsigned int s=0; s<N_entries; s++)
1861
+ {
1862
+ if (include_entry[s] == false)
1863
+ continue;
1864
+
1865
+ get_vcf_entry(s, vcf_line);
1866
+ e.reset(vcf_line);
1867
+ e.parse_basic_entry(true);
1868
+
1869
+ if (e.get_N_alleles() != 2)
1870
+ {
1871
+ one_off_warning("\tFst: Only using biallelic sites.");
1872
+ continue;
1873
+ }
1874
+
1875
+ e.parse_full_entry(true);
1876
+ e.parse_genotype_entries(true);
1877
+
1878
+ unsigned int N_chr;
1879
+ e.get_allele_counts(allele_counts1, N_chr, all_indv, include_genotype[s]);
1880
+ double count_all = allele_counts1[1];
1881
+ double N_chr_all = N_chr;
1882
+
1883
+ if ((count_all == 0) || (count_all == N_chr_all))
1884
+ continue; // No polymorphism
1885
+
1886
+ vector<double> counts(N_pops, 0);
1887
+ vector<double> pop_N_chr(N_pops, 0);
1888
+ vector<double> pop_N_choose_2(N_pops, 0);
1889
+ for (unsigned int p=0; p<N_pops; p++)
1890
+ {
1891
+ e.get_allele_counts(allele_counts1, N_chr, indvs_in_pops[p], include_genotype[s]);
1892
+ counts[p] = allele_counts1[1];
1893
+ pop_N_chr[p] = N_chr;
1894
+ pop_N_choose_2[p] = N_chr * (N_chr-1.0) / 2.0;
1895
+ }
1896
+
1897
+ double Fst_SNP = 0;
1898
+ double f;
1899
+ double sum1=0.0;
1900
+ for (unsigned int p=0; p<N_pops; p++)
1901
+ {
1902
+ f = counts[p] / pop_N_chr[p];
1903
+ Fst_SNP += 2.0*pop_N_choose_2[p]*(pop_N_chr[p]/(pop_N_chr[p]-1.0))*f*(1.0-f);
1904
+ sum1 += pop_N_choose_2[p];
1905
+ }
1906
+ Fst_SNP /= sum1;
1907
+ Fst_tot_num += Fst_SNP;
1908
+ f = count_all / N_chr_all;
1909
+ double tmp = (2.0*(N_chr_all / (N_chr_all-1.0))*f*(1.0-f));
1910
+ Fst_SNP /= tmp;
1911
+ Fst_tot_denom += tmp;
1912
+ Fst_SNP = 1.0 - Fst_SNP;
1913
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << Fst_SNP << endl;
1914
+
1915
+ // TODO add other methods of calculating Fst (such as Weir-Cockerham)
1916
+ }
1917
+ double Fst_tot = 1.0 - (Fst_tot_num / Fst_tot_denom);
1918
+ printLOG("Fst = " + dbl2str(Fst_tot, 6) + "\n");
1919
+
1920
+ out.close();
1921
+ }
1922
+
1923
+ void vcf_file::output_per_site_nucleotide_diversity(const string &output_file_prefix)
1924
+ {
1925
+ // Output nucleotide diversity, calculated on a per-site basis.
1926
+ // Pi = average number of pairwise differences
1927
+ // Assumes a constant distance of 1 between all possible mutations
1928
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
1929
+ error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
1930
+
1931
+ printLOG("Outputting Per-Site Nucleotide Diversity Statistics...\n");
1932
+ string output_file = output_file_prefix + ".sites.pi";
1933
+
1934
+ ofstream out(output_file.c_str());
1935
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
1936
+ out << "CHROM\tPOS\tPI" << endl;
1937
+
1938
+ string vcf_line, FORMAT_out;
1939
+ vcf_entry e(N_indv);
1940
+ pair<int, int> genotype1, genotype2;
1941
+ for (unsigned int s=0; s<N_entries; s++)
1942
+ {
1943
+ if (include_entry[s] == false)
1944
+ continue;
1945
+
1946
+ get_vcf_entry(s, vcf_line);
1947
+ e.reset(vcf_line);
1948
+ e.parse_basic_entry(true);
1949
+
1950
+ if (e.get_N_alleles() != 2)
1951
+ {
1952
+ one_off_warning("\tsitePi: Only using biallelic sites.");
1953
+ continue;
1954
+ }
1955
+
1956
+ e.parse_full_entry(true);
1957
+ e.parse_genotype_entries(true);
1958
+
1959
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
1960
+ {
1961
+ one_off_warning("\tsitePi: Only using fully diploid sites.");
1962
+ continue;
1963
+ }
1964
+
1965
+ int total_alleles_count = 0;
1966
+ int first_allele_count = 0;
1967
+ int first_allele = -1;
1968
+ for (unsigned int ui=0; ui < N_indv; ui++)
1969
+ {
1970
+ if (include_indv[ui] == false)
1971
+ continue;
1972
+ if (include_genotype[s][ui] == false)
1973
+ continue;
1974
+ e.get_indv_GENOTYPE_ids(ui, genotype1);
1975
+ if ((genotype1.first != -1) && (genotype1.second != -1))
1976
+ {
1977
+ total_alleles_count += 2;
1978
+ if (first_allele == -1)
1979
+ first_allele = genotype1.first; //initialize to the first allele found
1980
+ if (genotype1.first == first_allele)
1981
+ first_allele_count++;
1982
+ if (genotype1.second == first_allele)
1983
+ first_allele_count++;
1984
+ }
1985
+ }
1986
+ int n = total_alleles_count;
1987
+ int k = first_allele_count;
1988
+ double pi= (2.0*k*(n-k))/(n*(n-1));
1989
+
1990
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << pi << endl;
1991
+ }
1992
+ }
1993
+
1994
+ // Output Tajima's D
1995
+ // Carlson et al. Genome Res (2005)
1996
+ void vcf_file::output_Tajima_D(const string &output_file_prefix, int window_size)
1997
+ {
1998
+ if (window_size <= 0)
1999
+ return;
2000
+
2001
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2002
+ error("Require Genotypes in VCF file in order to output Tajima's D Statistic.");
2003
+
2004
+ printLOG("Outputting Tajima's D Statistic...\n");
2005
+ string output_file = output_file_prefix + ".Tajima.D";
2006
+
2007
+ double a1=0.0, a2=0.0, b1, b2, c1, c2, e1, e2;
2008
+ unsigned int n = N_kept_individuals()*2;
2009
+ if (n < 2)
2010
+ error("Require at least two chromosomes!");
2011
+
2012
+ for (unsigned int ui=1; ui<n; ui++)
2013
+ {
2014
+ a1 += 1.0 / double(ui);
2015
+ a2 += 1.0 / double(ui * ui);
2016
+ }
2017
+ b1 = double(n+1) / 3.0 / double(n-1);
2018
+ b2 = 2.0 * double(n*n + n + 3) / 9.0 / double(n) / double(n-1);
2019
+ c1 = b1 - (1.0 / a1);
2020
+ c2 = b2 - (double(n+2)/double(a1*n)) + (a2/a1/a1);
2021
+ e1 = c1 / a1;
2022
+ e2 = c2 / ((a1*a1) + a2);
2023
+
2024
+ // Find maximum position
2025
+ map<string, int> max_pos;
2026
+ string vcf_line, CHROM;
2027
+ vcf_entry e(N_indv);
2028
+ for (unsigned int s=0; s<N_entries; s++)
2029
+ {
2030
+ if (include_entry[s] == true)
2031
+ {
2032
+ get_vcf_entry(s, vcf_line);
2033
+ e.reset(vcf_line);
2034
+ e.parse_basic_entry();
2035
+
2036
+ CHROM = e.get_CHROM();
2037
+
2038
+ if (max_pos.find(CHROM) != max_pos.end())
2039
+ {
2040
+ if (e.get_POS() > max_pos[CHROM])
2041
+ max_pos[CHROM] = e.get_POS();
2042
+ }
2043
+ else
2044
+ max_pos[CHROM] = e.get_POS();
2045
+ }
2046
+ }
2047
+
2048
+ map<string, int>::iterator it;
2049
+ unsigned int N_bins;
2050
+ map<string, vector< pair<int, double> > > bins;
2051
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2052
+ {
2053
+ CHROM = (*it).first;
2054
+ N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
2055
+ bins[CHROM].resize(N_bins, make_pair(0,0));
2056
+ }
2057
+
2058
+ unsigned int idx;
2059
+ double C = 1.0 / double(window_size);
2060
+ vector<int> allele_counts;
2061
+ unsigned int N_non_missing_chr;
2062
+ unsigned int N_alleles;
2063
+ for (unsigned int s=0; s<N_entries; s++)
2064
+ {
2065
+ if (include_entry[s] == false)
2066
+ continue;
2067
+
2068
+ get_vcf_entry(s, vcf_line);
2069
+ e.reset(vcf_line);
2070
+ e.parse_basic_entry(true);
2071
+ N_alleles = e.get_N_alleles();
2072
+
2073
+ if (N_alleles != 2)
2074
+ {
2075
+ one_off_warning("\tTajimaD: Only using bialleleic sites.");
2076
+ continue;
2077
+ }
2078
+
2079
+ CHROM = e.get_CHROM();
2080
+ idx = (unsigned int)(e.get_POS() * C);
2081
+ e.parse_genotype_entries(true);
2082
+
2083
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2084
+ {
2085
+ one_off_warning("\tTajimaD: Only using fully diploid sites.");
2086
+ continue;
2087
+ }
2088
+
2089
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2090
+
2091
+ double p = double(allele_counts[0]) / N_non_missing_chr;
2092
+ if ((p > 0.0) && (p < 1.0))
2093
+ {
2094
+ bins[CHROM][idx].first++;
2095
+ bins[CHROM][idx].second += p * (1.0-p);
2096
+ }
2097
+ }
2098
+
2099
+ ofstream out(output_file.c_str());
2100
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2101
+ out << "CHROM\tBIN_START\tN_SNPS\tTajimaD" << endl;
2102
+
2103
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2104
+ {
2105
+ CHROM = (*it).first;
2106
+ bool output = false;
2107
+ for (unsigned int s=0; s<bins[CHROM].size(); s++)
2108
+ {
2109
+ int S = bins[CHROM][s].first;
2110
+ double D = 0;
2111
+ if (S > 1)
2112
+ {
2113
+ double pi = 2.0*bins[CHROM][s].second*n/double(n-1);
2114
+ double tw = double(S) / a1;
2115
+ double var = (e1*S) + e2*S*(S-1);
2116
+ D = (pi - tw) / sqrt(var);
2117
+ output = true;
2118
+ }
2119
+ if (S > 0)
2120
+ output = true;
2121
+ if (output == true)
2122
+ out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << D << endl;
2123
+ }
2124
+ }
2125
+
2126
+ out.close();
2127
+ }
2128
+
2129
+ void vcf_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size)
2130
+ {
2131
+ // Output nucleotide diversity, as calculated in windows.
2132
+ // Average number of pairwise differences in windows.
2133
+ if (window_size <= 0)
2134
+ return;
2135
+
2136
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2137
+ error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
2138
+
2139
+ printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n");
2140
+ string output_file = output_file_prefix + ".windowed.pi";
2141
+
2142
+ // Find maximum position
2143
+ map<string, int> max_pos;
2144
+ map<string, int>::iterator it;
2145
+ string vcf_line, CHROM;
2146
+ vcf_entry e(N_indv);
2147
+ for (unsigned int s=0; s<N_entries; s++)
2148
+ {
2149
+ if (include_entry[s] == true)
2150
+ {
2151
+ get_vcf_entry(s, vcf_line);
2152
+ e.reset(vcf_line);
2153
+ e.parse_basic_entry();
2154
+
2155
+ CHROM = e.get_CHROM();
2156
+
2157
+ if (max_pos.find(CHROM) != max_pos.end())
2158
+ {
2159
+ if (e.get_POS() > max_pos[CHROM])
2160
+ max_pos[CHROM] = e.get_POS();
2161
+ }
2162
+ else
2163
+ max_pos[CHROM] = e.get_POS();
2164
+ }
2165
+ }
2166
+
2167
+ unsigned int N_bins;
2168
+ map<string, vector<pair<int, double> > > bins;
2169
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2170
+ {
2171
+ CHROM = (*it).first;
2172
+ N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
2173
+ bins[CHROM].resize(N_bins, make_pair(0,0));
2174
+ }
2175
+
2176
+ unsigned int idx;
2177
+ double C = 1.0 / double(window_size);
2178
+ vector<int> allele_counts;
2179
+ unsigned int N_non_missing_chr;
2180
+ unsigned int N_alleles;
2181
+ for (unsigned int s=0; s<N_entries; s++)
2182
+ {
2183
+ if (include_entry[s] == false)
2184
+ continue;
2185
+
2186
+ get_vcf_entry(s, vcf_line);
2187
+ e.reset(vcf_line);
2188
+ e.parse_basic_entry(true);
2189
+ N_alleles = e.get_N_alleles();
2190
+
2191
+ if (N_alleles != 2)
2192
+ {
2193
+ one_off_warning("\twindowPi: Only using bialleleic sites.");
2194
+ continue;
2195
+ }
2196
+
2197
+ CHROM = e.get_CHROM();
2198
+ idx = (unsigned int)(e.get_POS() * C);
2199
+ e.parse_genotype_entries(true);
2200
+
2201
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2202
+ {
2203
+ one_off_warning("\twindowPi: Only using fully diploid sites.");
2204
+ continue;
2205
+ }
2206
+
2207
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2208
+
2209
+ double p = double(allele_counts[0]) / N_non_missing_chr;
2210
+ if ((p>0.0) && (p<1.0))
2211
+ {
2212
+ bins[CHROM][idx].first++;
2213
+ bins[CHROM][idx].second += (double(N_non_missing_chr) / (N_non_missing_chr - 1.0)) * 2.0 * p * (1.0 - p);
2214
+ }
2215
+ }
2216
+
2217
+ ofstream out(output_file.c_str());
2218
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2219
+ out << "CHROM\tBIN_START\tN_SNPS\tPI" << endl;
2220
+
2221
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2222
+ {
2223
+ CHROM = (*it).first;
2224
+ bool output = false;
2225
+ for (unsigned int s=0; s<bins[CHROM].size(); s++)
2226
+ {
2227
+ if (bins[CHROM][s].first > 0)
2228
+ output = true;
2229
+ if (output == true)
2230
+ out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << bins[CHROM][s].second << endl;
2231
+ }
2232
+ }
2233
+
2234
+ out.close();
2235
+ }
2236
+
2237
+ /*
2238
+ void vcf_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size)
2239
+ {
2240
+ // Output nucleotide diversity, as calculated in windows.
2241
+ // Average number of pairwise differences in windows.
2242
+ // Requires phased data.
2243
+ if (window_size <= 0)
2244
+ return;
2245
+
2246
+ if (has_genotypes == false)
2247
+ error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
2248
+
2249
+ printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n");
2250
+ string output_file = output_file_prefix + ".windowed.pi";
2251
+
2252
+ map<string, int>::iterator it;
2253
+
2254
+ // Find maximum position
2255
+ map<string, int> max_pos;
2256
+ string vcf_line, CHROM;
2257
+ vcf_entry e(N_indv);
2258
+ for (unsigned int s=0; s<N_entries; s++)
2259
+ {
2260
+ if (include_entry[s] == true)
2261
+ {
2262
+ get_vcf_entry(s, vcf_line);
2263
+ e.reset(vcf_line);
2264
+ e.parse_basic_entry();
2265
+
2266
+ CHROM = e.get_CHROM();
2267
+
2268
+ if (max_pos.find(CHROM) != max_pos.end())
2269
+ {
2270
+ if (e.get_POS() > max_pos[CHROM])
2271
+ max_pos[CHROM] = e.get_POS();
2272
+ }
2273
+ else
2274
+ max_pos[CHROM] = e.get_POS();
2275
+ }
2276
+ }
2277
+
2278
+ unsigned int N_bins;
2279
+ map<string, vector<pair<int, double> > > bins;
2280
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2281
+ {
2282
+ CHROM = (*it).first;
2283
+ N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size));
2284
+ bins[CHROM].resize(N_bins, make_pair(0,0));
2285
+ }
2286
+
2287
+ unsigned int last_idx = (unsigned)(-1);
2288
+ unsigned int idx;
2289
+ string last_CHROM;
2290
+ vector<vector<int> > haplotypes(2*N_indv);
2291
+ pair<int, int> genotype1;
2292
+ unsigned int N_SNPs=0;;
2293
+ double C = 1.0 / double(window_size);
2294
+ for (unsigned int s=0; s<N_entries; s++)
2295
+ {
2296
+ if (include_entry[s] == false)
2297
+ continue;
2298
+
2299
+ get_vcf_entry(s, vcf_line);
2300
+ e.reset(vcf_line);
2301
+ e.parse_basic_entry();
2302
+
2303
+ CHROM = e.get_CHROM();
2304
+ idx = (unsigned int)(e.get_POS() * C);
2305
+
2306
+ if (((last_idx != idx) || (CHROM != last_CHROM)) && (last_idx != (unsigned)-1))
2307
+ { // Process haplotype window.
2308
+ double pi=0.0;
2309
+ double n=0.0;
2310
+ for (unsigned int ui=0; ui<(haplotypes.size()-1); ui++)
2311
+ {
2312
+ if (include_indv[ui/2] == false)
2313
+ continue;
2314
+ for (unsigned int uj=(ui+1); uj<haplotypes.size(); uj++)
2315
+ {
2316
+ if (include_indv[uj/2] == false)
2317
+ continue;
2318
+ for (unsigned int snp=0; snp<N_SNPs; snp++)
2319
+ {
2320
+ if ((haplotypes[ui][snp] != -1) && (haplotypes[uj][snp] != -1))
2321
+ {
2322
+ if (haplotypes[ui][snp] != haplotypes[uj][snp])
2323
+ { pi++; }
2324
+ n++;
2325
+ }
2326
+ }
2327
+ }
2328
+ }
2329
+ pi /= n;
2330
+ bins[last_CHROM][last_idx].first = N_SNPs;
2331
+ bins[last_CHROM][last_idx].second = pi;
2332
+
2333
+ N_SNPs = 0;
2334
+ for (unsigned int ui=0; ui<haplotypes.size(); ui++)
2335
+ {
2336
+ haplotypes[ui].clear();
2337
+ }
2338
+ }
2339
+
2340
+ e.parse_genotype_entries(true);
2341
+ for (unsigned int ui=0; ui<N_indv; ui++)
2342
+ {
2343
+ if (include_indv[ui] == false)
2344
+ continue;
2345
+
2346
+ if (include_genotype[s][ui] == true)
2347
+ {
2348
+ e.get_indv_GENOTYPE_ids(ui, genotype1);
2349
+ haplotypes[(2*ui)].push_back(genotype1.first);
2350
+ haplotypes[(2*ui)+1].push_back(genotype1.second);
2351
+ }
2352
+ else
2353
+ {
2354
+ haplotypes[(2*ui)].push_back(-1);
2355
+ haplotypes[(2*ui)+1].push_back(-1);
2356
+ }
2357
+ }
2358
+ N_SNPs++;
2359
+ last_CHROM = CHROM;
2360
+ last_idx = idx;
2361
+ }
2362
+
2363
+ if (N_SNPs > 0)
2364
+ { // Output last window
2365
+ double pi=0.0;
2366
+ double n=0.0;
2367
+ for (unsigned int ui=0; ui<(haplotypes.size()-1); ui++)
2368
+ {
2369
+ if (include_indv[ui/2] == false)
2370
+ continue;
2371
+ for (unsigned int uj=ui+1; uj<haplotypes.size(); uj++)
2372
+ {
2373
+ if (include_indv[uj/2] == false)
2374
+ continue;
2375
+ for (unsigned int snp=0; snp<N_SNPs; snp++)
2376
+ {
2377
+ if ((haplotypes[ui][snp] != -1) && (haplotypes[uj][snp] != -1))
2378
+ {
2379
+ if (haplotypes[ui][snp] != haplotypes[uj][snp])
2380
+ pi++;
2381
+ n++;
2382
+ }
2383
+ }
2384
+ }
2385
+ }
2386
+ pi /= n;
2387
+ bins[last_CHROM][last_idx].first = N_SNPs;
2388
+ bins[last_CHROM][last_idx].second = pi;
2389
+ }
2390
+
2391
+ ofstream out(output_file.c_str());
2392
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2393
+ out << "CHROM\tBIN_START\tN_SNPS\tPI" << endl;
2394
+
2395
+ for (it=max_pos.begin(); it != max_pos.end(); ++it)
2396
+ {
2397
+ CHROM = (*it).first;
2398
+ for (unsigned int s=0; s<bins[CHROM].size(); s++)
2399
+ {
2400
+ out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << bins[CHROM][s].second << endl;
2401
+ }
2402
+ }
2403
+
2404
+ out.close();
2405
+ }
2406
+ */
2407
+
2408
+ void vcf_file::output_kept_and_removed_sites(const string &output_file_prefix)
2409
+ {
2410
+ // Output lists of sites that have been filtered (or not).
2411
+ printLOG("Outputting Kept and Removed Sites...\n");
2412
+ string output_file1 = output_file_prefix + ".kept.sites";
2413
+ string output_file2 = output_file_prefix + ".removed.sites";
2414
+
2415
+ string vcf_line, CHROM;
2416
+ int POS;
2417
+ vcf_entry e(N_indv);
2418
+
2419
+ ofstream out1(output_file1.c_str());
2420
+ if (!out1.is_open()) error("Could not open output file: " + output_file1, 12);
2421
+ out1 << "CHROM\tPOS" << endl;
2422
+
2423
+ ofstream out2(output_file2.c_str());
2424
+ if (!out2.is_open()) error("Could not open output file: " + output_file2, 12);
2425
+ out2 << "CHROM\tPOS" << endl;
2426
+
2427
+ for (unsigned int s=0; s<N_entries; s++)
2428
+ {
2429
+ get_vcf_entry(s, vcf_line);
2430
+ e.reset(vcf_line);
2431
+ e.parse_basic_entry();
2432
+ POS = e.get_POS();
2433
+ CHROM = e.get_CHROM();
2434
+ if (include_entry[s] == true)
2435
+ {
2436
+ out1 << CHROM << "\t" << POS << endl;
2437
+ }
2438
+ else
2439
+ {
2440
+ out2 << CHROM << "\t" << POS << endl;
2441
+ }
2442
+ }
2443
+ out1.close();
2444
+ out2.close();
2445
+ }
2446
+
2447
+
2448
+ void vcf_file::output_LROH(const string &output_file_prefix)
2449
+ {
2450
+ // Detect and output Long Runs of Homozygosity, following the method
2451
+ // developed by Adam Boyko, and described in Auton et al., Genome Research, 2009
2452
+ // (Although using Forward-backwards algorithm in place of Viterbi).
2453
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2454
+ error("Require Genotypes in VCF file in order to output LROH.");
2455
+
2456
+ printLOG("Outputting Long Runs of Homozygosity (Experimental)... \n");
2457
+ string output_file = output_file_prefix + ".LROH";
2458
+
2459
+ unsigned int nGen=4; // Number of generations since common ancestry
2460
+ double genotype_error_rate = 0.01; // Assumed genotype error rate
2461
+ double p_auto_prior = 0.05; // Prior probability of being in autozygous state
2462
+ double p_auto_threshold = 0.99; // Threshold for reporting autozygous region
2463
+ int min_SNPs=0; // Threshold for reporting autozygous region
2464
+
2465
+ string vcf_line, CHROM;
2466
+ int POS;
2467
+ vcf_entry e(N_indv);
2468
+ pair<int, int> alleles;
2469
+ vector<unsigned int> s_vector;
2470
+ vector<pair<double, double> > p_emission;
2471
+ vector<vector<double> > p_trans;
2472
+
2473
+ ofstream out(output_file.c_str());
2474
+ if (!out.is_open()) error("Could not open output file: " + output_file, 12);
2475
+ out << "CHROM\tAUTO_START\tAUTO_END\tN_SNPs\tINDV" << endl;
2476
+
2477
+ for (unsigned int ui=0; ui<N_indv; ui++)
2478
+ {
2479
+ if (include_indv[ui] == false)
2480
+ continue;
2481
+
2482
+ printLOG("\t" + indv[ui] + "\n");
2483
+
2484
+ int last_POS = -1;
2485
+ s_vector.resize(0); p_emission.resize(0); p_trans.resize(0);
2486
+
2487
+ for (unsigned int s=0; s<N_entries; s++)
2488
+ {
2489
+ if ((include_entry[s] == false) || (include_genotype[s][ui] == false))
2490
+ continue;
2491
+
2492
+ get_vcf_entry(s, vcf_line);
2493
+ e.reset(vcf_line);
2494
+ e.parse_basic_entry(true);
2495
+
2496
+ if (e.get_N_alleles() != 2)
2497
+ {
2498
+ one_off_warning("\tLROH: Only using bialleleic sites.");
2499
+ continue; // TODO: Probably could do without this...
2500
+ }
2501
+
2502
+ POS = e.get_POS();
2503
+
2504
+ e.parse_genotype_entry(ui, true);
2505
+ e.get_indv_GENOTYPE_ids(ui, alleles);
2506
+
2507
+ if (e.get_indv_ploidy(ui) != 2)
2508
+ {
2509
+ one_off_warning("\tLROH: Only using diploid sites.");
2510
+ continue;
2511
+ }
2512
+
2513
+ if ((alleles.first == -1) || (alleles.second == -1))
2514
+ continue;
2515
+
2516
+ unsigned int X = alleles.first + alleles.second;
2517
+
2518
+ // Calculate heterozyogosity of this site.
2519
+ // TODO: Would be better to do this once, but for simplicity, do it for each individual.
2520
+ unsigned int N_genotypes = 0;
2521
+ unsigned int N_hets = 0;
2522
+ for (unsigned int uj=0; uj<N_indv; uj++)
2523
+ {
2524
+ if ((include_indv[uj] == false) || (include_genotype[s][ui] == false))
2525
+ continue;
2526
+
2527
+ e.parse_genotype_entry(uj, true);
2528
+ e.get_indv_GENOTYPE_ids(uj, alleles);
2529
+ if ((alleles.first != -1) && (alleles.second != -1))
2530
+ {
2531
+ N_genotypes++;
2532
+ if (alleles.first != alleles.second)
2533
+ N_hets++;
2534
+ }
2535
+ }
2536
+ double h = N_hets / double(N_genotypes);
2537
+ double p_emission_given_nonauto;
2538
+ double p_emission_given_auto;
2539
+
2540
+ if (X == 1)
2541
+ { // Heterozygote
2542
+ p_emission_given_nonauto = h;
2543
+ p_emission_given_auto = genotype_error_rate;
2544
+ p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto));
2545
+ }
2546
+ else
2547
+ { // Homozygote
2548
+ p_emission_given_nonauto = 1.0-h;
2549
+ p_emission_given_auto = 1.0-genotype_error_rate;
2550
+ p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto));
2551
+ }
2552
+
2553
+ double r = 0;
2554
+ if (last_POS > 0)
2555
+ { // Assume 1cM/Mb.
2556
+ r = (POS - last_POS) / 1000000.0 / 100.0; // Morgans
2557
+ }
2558
+
2559
+ double e = (1.0 - exp(-2.0*nGen*r));
2560
+ double p_trans_auto_to_nonauto = (1.0 - p_auto_prior) * e; //A[1]
2561
+ double p_trans_nonauto_to_auto = p_auto_prior * e; //A[2]
2562
+ double p_trans_auto_to_auto = 1.0 - p_trans_nonauto_to_auto; //A[0]
2563
+ double p_trans_nonauto_to_nonauto = 1.0 - p_trans_auto_to_nonauto; // A[3]
2564
+ vector<double> A(4);
2565
+ A[0] = p_trans_auto_to_auto;
2566
+ A[1] = p_trans_auto_to_nonauto;
2567
+ A[2] = p_trans_nonauto_to_auto;
2568
+ A[3] = p_trans_nonauto_to_nonauto;
2569
+
2570
+ s_vector.push_back(s);
2571
+
2572
+ p_trans.push_back(A);
2573
+ last_POS = POS;
2574
+ }
2575
+
2576
+ // Forward-backward algorithm
2577
+ int N_obs = (int)p_emission.size();
2578
+ if (N_obs == 0)
2579
+ continue;
2580
+
2581
+ vector<vector<double> > alpha(N_obs, vector<double>(2,0));
2582
+ vector<vector<double> > beta(N_obs, vector<double>(2,0));
2583
+
2584
+ alpha[0][0] = p_emission[0].first;
2585
+ alpha[0][1] = p_emission[0].second;
2586
+ for (int i=1; i<N_obs; i++)
2587
+ {
2588
+ alpha[i][0] = alpha[i-1][0] * p_trans[i-1][0] * p_emission[i].first;
2589
+ alpha[i][0] += alpha[i-1][1] * p_trans[i-1][2] * p_emission[i].first;
2590
+
2591
+ alpha[i][1] = alpha[i-1][1] * p_trans[i-1][3] * p_emission[i].second;
2592
+ alpha[i][1] += alpha[i-1][0] * p_trans[i-1][1] * p_emission[i].second;
2593
+
2594
+ while (alpha[i][0] + alpha[i][1] < 1e-20)
2595
+ { // Renormalise to prevent underflow
2596
+ alpha[i][0] *= 1e20;
2597
+ alpha[i][1] *= 1e20;
2598
+ }
2599
+ }
2600
+
2601
+ beta[N_obs-1][0] = 1.0;
2602
+ beta[N_obs-1][1] = 1.0;
2603
+ for (int i=N_obs-2; i>=0; i--)
2604
+ {
2605
+ beta[i][0] = beta[i+1][0] * p_trans[i][0] * p_emission[i].first;
2606
+ beta[i][0] += beta[i+1][1] * p_trans[i][2] * p_emission[i].first;
2607
+
2608
+ beta[i][1] = beta[i+1][1] * p_trans[i][3] * p_emission[i].second;
2609
+ beta[i][1] += beta[i+1][0] * p_trans[i][1] * p_emission[i].second;
2610
+
2611
+ while (beta[i][0] + beta[i][1] < 1e-20)
2612
+ { // Renormalise to prevent underflow
2613
+ beta[i][0] *= 1e20;
2614
+ beta[i][1] *= 1e20;
2615
+ }
2616
+ }
2617
+
2618
+ // Calculate probability of each site being autozygous
2619
+ vector<double> p_auto(N_obs);
2620
+ for (int i=0; i<N_obs; i++)
2621
+ {
2622
+ p_auto[i] = alpha[i][0] * beta[i][0] / (alpha[i][0] * beta[i][0] + alpha[i][1] * beta[i][1]);
2623
+ }
2624
+
2625
+ // Generate output
2626
+ // TODO: Would be good to report actual limits of homozygosity
2627
+ // (i.e. extend regions out until first heterozygote),
2628
+ // as opposed to regions with p>threshold.
2629
+ // TODO: Also would be good to report heterozygotic SNPs found in homozygotic regions.
2630
+ bool in_auto=false;
2631
+ int start_pos=0, end_pos=0;
2632
+ int N_SNPs = 0;
2633
+ for (int i=0; i<N_obs; i++)
2634
+ {
2635
+ if (p_auto[i] > p_auto_threshold)
2636
+ {
2637
+ if (in_auto == false)
2638
+ { // Start of autozygous region
2639
+ unsigned int s = s_vector[i];
2640
+ get_vcf_entry(s, vcf_line);
2641
+ e.reset(vcf_line);
2642
+ e.parse_basic_entry(true);
2643
+ CHROM = e.get_CHROM();
2644
+ start_pos = e.get_POS();
2645
+ }
2646
+ N_SNPs++;
2647
+ in_auto = true;
2648
+ }
2649
+ else
2650
+ {
2651
+ if (in_auto == true)
2652
+ { // end of autozygous region
2653
+ unsigned int s = s_vector[i];
2654
+ get_vcf_entry(s, vcf_line);
2655
+ e.reset(vcf_line);
2656
+ e.parse_basic_entry(true);
2657
+ end_pos = e.get_POS();
2658
+ if (N_SNPs >= min_SNPs)
2659
+ out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl;
2660
+ }
2661
+ in_auto = false;
2662
+ N_SNPs = 0;
2663
+ }
2664
+ }
2665
+ if (in_auto == true)
2666
+ { // Report final region if needed
2667
+ unsigned int s = s_vector[N_obs-1];
2668
+ get_vcf_entry(s, vcf_line);
2669
+ e.reset(vcf_line);
2670
+ e.parse_basic_entry(true);
2671
+ end_pos = e.get_POS();
2672
+ if (N_SNPs >= min_SNPs)
2673
+ out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl;
2674
+ }
2675
+ }
2676
+ out.close();
2677
+ }
2678
+
2679
+ void vcf_file::output_indv_relatedness(const string &output_file_prefix)
2680
+ {
2681
+ // Calculate and output a relatedness statistic based on the method of
2682
+ // Yang et al, 2010 (doi:10.1038/ng.608). Specifically, calculate the
2683
+ // unadjusted Ajk statistic (equation 6 of paper).
2684
+ // Expectation of Ajk is zero for individuals within a populations, and
2685
+ // one for an individual with themselves.
2686
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2687
+ error("Require Genotypes in VCF file in order to output Individual Relatedness.");
2688
+
2689
+ printLOG("Outputting Individual Relatedness\n");
2690
+ string output = output_file_prefix + ".relatedness";
2691
+ ofstream out(output.c_str());
2692
+ if (!out.is_open())
2693
+ error("Could not open Individual Relatedness Output File: " + output, 2);
2694
+ out << "INDV1\tINDV2\tRELATEDNESS" << endl;
2695
+
2696
+ string vcf_line;
2697
+ vcf_entry e(N_indv);
2698
+ vector<int> allele_counts;
2699
+ unsigned int N_alleles, N_non_missing_chr;
2700
+ double freq;
2701
+ pair<int, int> geno_id;
2702
+ vector<vector<double> > Ajk(N_indv, vector<double>(N_indv, 0.0));
2703
+ vector<vector<double> > N_sites(N_indv, vector<double>(N_indv, 0.0));
2704
+
2705
+ for (unsigned int s=0; s<N_entries; s++)
2706
+ {
2707
+ if (include_entry[s] == false)
2708
+ continue;
2709
+
2710
+ get_vcf_entry(s, vcf_line);
2711
+ e.reset(vcf_line);
2712
+
2713
+ e.parse_basic_entry(true);
2714
+ N_alleles = e.get_N_alleles();
2715
+
2716
+ if (N_alleles != 2)
2717
+ {
2718
+ one_off_warning("\tRelatedness: Only using biallelic sites.");
2719
+ continue; // Only use biallelic loci
2720
+ }
2721
+
2722
+ e.parse_genotype_entries(true);
2723
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2724
+ {
2725
+ one_off_warning("\tRelatedness: Only using fully diploid sites.");
2726
+ continue;
2727
+ }
2728
+
2729
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2730
+ freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
2731
+
2732
+ if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
2733
+ continue;
2734
+
2735
+ vector<double> x(N_indv, -1.0);
2736
+ for (unsigned int ui=0; ui<N_indv; ui++)
2737
+ {
2738
+ if (include_indv[ui] == false)
2739
+ continue;
2740
+
2741
+ e.get_indv_GENOTYPE_ids(ui, geno_id);
2742
+ x[ui] = geno_id.first + geno_id.second;
2743
+ }
2744
+
2745
+ double div = 1.0/(2.0*freq*(1.0-freq));
2746
+ for (unsigned int ui=0; ui<N_indv; ui++)
2747
+ {
2748
+ if ((include_indv[ui] == false) || (include_genotype[s][ui] == false) || (x[ui] < 0))
2749
+ continue;
2750
+ Ajk[ui][ui] += (x[ui]*x[ui] - (1 + 2.0*freq)*x[ui] + 2.0*freq*freq) * div;
2751
+ N_sites[ui][ui]++;
2752
+ for (unsigned int uj=(ui+1); uj<N_indv; uj++)
2753
+ {
2754
+ if ((include_indv[uj] == false) || (include_genotype[s][uj] == false) || (x[uj] < 0))
2755
+ continue;
2756
+ Ajk[ui][uj] += (x[ui] - 2.0*freq) * (x[uj] - 2.0*freq) * div;
2757
+ N_sites[ui][uj]++;
2758
+ }
2759
+ }
2760
+ }
2761
+
2762
+ for (unsigned int ui=0; ui<N_indv; ui++)
2763
+ {
2764
+ if (include_indv[ui] == false)
2765
+ continue;
2766
+ Ajk[ui][ui] = 1.0 + (Ajk[ui][ui] / N_sites[ui][ui]);
2767
+ out << indv[ui] << "\t" << indv[ui] << "\t" << Ajk[ui][ui] << endl;
2768
+ for (unsigned int uj=(ui+1); uj<N_indv; uj++)
2769
+ {
2770
+ if (include_indv[uj] == false)
2771
+ continue;
2772
+ Ajk[ui][uj] /= N_sites[ui][uj];
2773
+ out << indv[ui] << "\t" << indv[uj] << "\t" << Ajk[ui][uj] << endl;
2774
+ }
2775
+ }
2776
+
2777
+ out.close();
2778
+ }
2779
+
2780
+ void vcf_file::output_PCA(const string &output_file_prefix, bool use_normalisation, int SNP_loadings_N_PCs)
2781
+ {
2782
+ #ifndef VCFTOOLS_PCA
2783
+ use_normalisation = true;
2784
+ SNP_loadings_N_PCs = -1;
2785
+ string out = output_file_prefix;
2786
+ out = "Cannot run PCA analysis. Vcftools has been compiled without PCA enabled (requires LAPACK).";
2787
+ error(out);
2788
+ #else
2789
+ // Output PCA, following method of Patterson, Price and Reich 2006.
2790
+ if ((has_genotypes == false) | (N_kept_individuals() == 0))
2791
+ error("Require Genotypes in VCF file in order to perform PCA.");
2792
+
2793
+ if (use_normalisation)
2794
+ printLOG("Outputting Principal Component Analysis (with normalisation)\n");
2795
+ else
2796
+ printLOG("Outputting Principal Component Analysis (without normalisation)\n");
2797
+ string output = output_file_prefix + ".pca";
2798
+ ofstream out(output.c_str());
2799
+ if (!out.is_open())
2800
+ error("Could not open Principal Component Analysis Output File: " + output, 2);
2801
+
2802
+ unsigned int N_indvs = N_kept_individuals();
2803
+ unsigned int N_sites = N_kept_sites();
2804
+
2805
+ if (N_indvs >= N_sites)
2806
+ error("PCA computation requires that there are more sites than individuals.");
2807
+
2808
+ string vcf_line;
2809
+ vcf_entry e(N_indv);
2810
+ pair<int, int> geno_id;
2811
+ double x, freq;
2812
+ vector<int> allele_counts;
2813
+ unsigned int N_alleles, N_non_missing_chr;
2814
+
2815
+ // Store list of included individuals
2816
+ vector<string> included_indvs(N_indvs);
2817
+ unsigned int ui_prime = 0;
2818
+ for (unsigned int ui=0; ui<N_indv; ui++)
2819
+ {
2820
+ if (include_indv[ui] == false)
2821
+ continue;
2822
+ included_indvs[ui_prime] = indv[ui];
2823
+ ui_prime++;
2824
+ }
2825
+
2826
+ // Potentially uses a lot of memory. Should issue a warning about this.
2827
+ double **M = new double*[N_indvs]; // m rows = indv
2828
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2829
+ M[ui] = new double[N_sites]; // n columns
2830
+
2831
+ // Populate M
2832
+ unsigned int s_prime = 0;
2833
+ for (unsigned int s=0; s<N_entries; s++)
2834
+ {
2835
+ if (include_entry[s]==false)
2836
+ continue;
2837
+
2838
+ get_vcf_entry(s, vcf_line);
2839
+ e.reset(vcf_line);
2840
+
2841
+ e.parse_basic_entry(true);
2842
+ N_alleles = e.get_N_alleles();
2843
+ if (N_alleles != 2)
2844
+ error("PCA only works for biallelic sites.");
2845
+
2846
+ e.parse_genotype_entries(true);
2847
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2848
+ error("PCA only works for fully diploid sites.");
2849
+
2850
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2851
+ freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
2852
+
2853
+ if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
2854
+ continue;
2855
+
2856
+ double mu = freq*2.0;
2857
+ double div = 1.0 / sqrt(freq * (1.0-freq));
2858
+
2859
+ ui_prime = 0;
2860
+ for (unsigned int ui=0; ui<N_indv; ui++)
2861
+ {
2862
+ if (include_indv[ui] == false)
2863
+ continue;
2864
+
2865
+ e.get_indv_GENOTYPE_ids(ui, geno_id);
2866
+ x = geno_id.first + geno_id.second;
2867
+ if (x > -1)
2868
+ {
2869
+ if (use_normalisation == true)
2870
+ M[ui_prime][s_prime] = (x - mu) * div;
2871
+ else
2872
+ M[ui_prime][s_prime] = (x - mu);
2873
+ }
2874
+ ui_prime++;
2875
+ }
2876
+ s_prime++;
2877
+ }
2878
+
2879
+ // Now construct X = (1/n)MM'.
2880
+ double **X = new double *[N_indvs];
2881
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2882
+ X[ui] = new double[N_indvs];
2883
+
2884
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2885
+ for (unsigned int uj=0; uj<N_indvs; uj++)
2886
+ X[ui][uj] = 0;
2887
+
2888
+ // Only populate one half of matrix
2889
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2890
+ for (unsigned int uj=ui; uj<N_indvs; uj++)
2891
+ for (unsigned int s=0; s<N_sites; s++)
2892
+ X[ui][uj] += M[ui][s] * M[uj][s];
2893
+
2894
+ delete [] M;
2895
+
2896
+ // Populate other half
2897
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2898
+ for (unsigned int uj=0; uj<ui; uj++)
2899
+ X[ui][uj] = X[uj][ui];
2900
+
2901
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2902
+ for (unsigned int uj=0; uj<N_indvs; uj++)
2903
+ X[ui][uj] /= N_sites;
2904
+
2905
+ double *Er = new double[N_indvs];
2906
+ double *Ei = new double[N_indvs];
2907
+ double **Evecs = new double*[N_indvs];
2908
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2909
+ Evecs[ui] = new double[N_indvs];
2910
+
2911
+ // Call LAPACK routine to calculate eigenvectors and eigenvalues
2912
+ dgeev(X, N_indvs, Er, Ei, Evecs);
2913
+
2914
+ // Check there are no complex eigenvalues.
2915
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2916
+ if (Ei[ui] != 0)
2917
+ error("Complex eigenvalue.");
2918
+
2919
+ // Output results
2920
+ out << "INDV";
2921
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2922
+ out << "\tEIG_" << ui;
2923
+ out << endl;
2924
+
2925
+ out << "EIGENVALUE";
2926
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2927
+ out << "\t" << Er[ui];
2928
+ out << endl;
2929
+
2930
+ // Output eigenvectors (as columns)
2931
+ for (unsigned int ui=0; ui<N_indvs; ui++)
2932
+ {
2933
+ out << included_indvs[ui];
2934
+ for (unsigned int uj=0; uj<N_indvs; uj++)
2935
+ out << "\t" << Evecs[ui][uj];
2936
+ out << endl;
2937
+ }
2938
+
2939
+ out.close();
2940
+
2941
+ if (SNP_loadings_N_PCs > 0)
2942
+ { // Output SNP loadings
2943
+ printLOG("Outputting " + int2str(SNP_loadings_N_PCs) + " SNP loadings\n");
2944
+ output = output_file_prefix + ".pca.loadings";
2945
+ out.open(output.c_str());
2946
+ if (!out.good())
2947
+ error("Could not open Principal Component SNP Loading Output File: " + output, 2);
2948
+ out << "CHROM\tPOS";
2949
+ for (unsigned int ui=0; ui<(unsigned int)SNP_loadings_N_PCs; ui++)
2950
+ out << "\tGAMMA_" << ui;
2951
+ out << endl;
2952
+
2953
+ for (unsigned int s=0; s<N_entries; s++)
2954
+ {
2955
+ if (include_entry[s]==false)
2956
+ continue;
2957
+
2958
+ get_vcf_entry(s, vcf_line);
2959
+ e.reset(vcf_line);
2960
+
2961
+ e.parse_basic_entry(true);
2962
+ N_alleles = e.get_N_alleles();
2963
+ if (N_alleles != 2)
2964
+ error("PCA only works for biallelic sites.");
2965
+
2966
+ e.parse_genotype_entries(true);
2967
+ if (e.is_diploid(include_indv, include_genotype[s]) == false)
2968
+ error("PCA only works for fully diploid sites.");
2969
+
2970
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
2971
+ freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency
2972
+
2973
+ if ((freq <= numeric_limits<double>::epsilon()) || (freq >= (1.0-numeric_limits<double>::epsilon())))
2974
+ continue;
2975
+
2976
+ vector<double> gamma(SNP_loadings_N_PCs, 0.0);
2977
+ vector<double> a_sum(SNP_loadings_N_PCs, 0.0);
2978
+
2979
+ ui_prime = 0;
2980
+ for (unsigned int ui=0; ui<N_indv; ui++)
2981
+ {
2982
+ if (include_indv[ui] == false)
2983
+ continue;
2984
+
2985
+ e.get_indv_GENOTYPE_ids(ui, geno_id);
2986
+ x = geno_id.first + geno_id.second;
2987
+ if (x > -1)
2988
+ {
2989
+ for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++)
2990
+ {
2991
+ gamma[uj] += (x * Evecs[ui_prime][uj]);
2992
+ a_sum[uj] += (Evecs[ui_prime][uj]*Evecs[ui_prime][uj]);
2993
+ }
2994
+ }
2995
+ ui_prime++;
2996
+ }
2997
+
2998
+ out << e.get_CHROM() << "\t" << e.get_POS();
2999
+ for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++)
3000
+ out << "\t" << gamma[uj] / a_sum[uj];
3001
+ out << endl;
3002
+ }
3003
+ out.close();
3004
+ }
3005
+
3006
+ delete [] Er;
3007
+ delete [] Ei;
3008
+ delete [] Evecs;
3009
+ delete [] X;
3010
+ #endif
3011
+ }
3012
+