ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1138 @@
1
+ /*
2
+ * vcf_file_output.cpp
3
+ *
4
+ * Created on: Aug 28, 2009
5
+ * Author: Adam Auton
6
+ * ($Revision: 249 $)
7
+ */
8
+ #include "vcf_file.h"
9
+ /*
10
+ void vcf_file::output_as_plink(const string &output_file_prefix)
11
+ {
12
+ // Output as PLINK formatted PED/MAP files.
13
+ if (has_genotypes == false)
14
+ error("Require Genotypes in VCF file in order to output as PLINK.");
15
+
16
+ printLOG("Writing PLINK PED file ... ");
17
+ string ped_file = output_file_prefix + ".ped";
18
+ string map_file = output_file_prefix + ".map";
19
+
20
+ ofstream PED(ped_file.c_str());
21
+ if (!PED.is_open()) error("Could not open output file: " + ped_file, 12);
22
+
23
+ vector<string> alleles;
24
+ char phase;
25
+ pair<int, int> genotype;
26
+ string vcf_line;
27
+ vcf_entry e(N_indv);
28
+ for (unsigned int ui=0; ui<N_indv; ui++)
29
+ {
30
+ if (include_indv[ui] == false)
31
+ continue;
32
+ PED << indv[ui] << "\t" << indv[ui] << "\t" << 0 << "\t" << 0 << "\t" << 0 << "\t" << 0;
33
+
34
+ for (unsigned int s=0; s<N_entries; s++)
35
+ {
36
+ if (include_entry[s] == false)
37
+ continue;
38
+
39
+ get_vcf_entry(s, vcf_line);
40
+ e.reset(vcf_line);
41
+ e.parse_basic_entry(true);
42
+
43
+ if (e.get_N_alleles() <= 2) // Only output sites with one alternative allele
44
+ {
45
+ e.get_alleles_vector(alleles);
46
+ genotype = make_pair(-1,-1);
47
+ phase = '/';
48
+ if (include_genotype[s][ui] == true)
49
+ {
50
+ e.parse_genotype_entry(ui, true);
51
+ e.get_indv_GENOTYPE_ids(ui, genotype);
52
+ phase = e.get_indv_PHASE(ui);
53
+ }
54
+
55
+ if (genotype.first == -1)
56
+ PED << "\t0";
57
+ else
58
+ PED << "\t" << alleles[genotype.first];
59
+
60
+ if (genotype.second == -1)
61
+ {
62
+ if (phase == '/')
63
+ PED << "\t0";
64
+ else if (genotype.first != -1)
65
+ PED << "\t" << alleles[genotype.first]; // Male X-chr, Y-chr etc
66
+ else
67
+ PED << "\t0";
68
+ }
69
+ else
70
+ PED << "\t" << alleles[genotype.second];
71
+ }
72
+ }
73
+ PED << endl;
74
+ }
75
+
76
+ PED.close();
77
+
78
+ printLOG("Writing PLINK MAP file ... ");
79
+ ofstream MAP(map_file.c_str());
80
+ if (!MAP.is_open()) error("Could not open output file: " + map_file, 12);
81
+ for (unsigned int s=0; s<N_entries; s++)
82
+ {
83
+ if (include_entry[s] == false)
84
+ continue;
85
+
86
+ get_vcf_entry(s, vcf_line);
87
+ e.reset(vcf_line);
88
+ e.parse_basic_entry(true);
89
+ if (e.get_N_alleles() <= 2) // Only output sites with one alternative allele
90
+ {
91
+ if (e.get_ID() == ".")
92
+ MAP << e.get_CHROM() << "\t" << e.get_POS() << "\t0\t" << e.get_POS() << endl;
93
+ else
94
+ MAP << e.get_CHROM() << "\t" << e.get_ID() << "\t0\t" << e.get_POS() << endl;
95
+ }
96
+ }
97
+
98
+ MAP.close();
99
+ printLOG("Done.\n");
100
+ }
101
+ */
102
+
103
+ void vcf_file::output_as_plink(const string &output_file_prefix)
104
+ {
105
+ // Output as PLINK formatted PED/MAP files.
106
+ if (has_genotypes == false)
107
+ error("Require Genotypes in VCF file in order to output as PLINK.");
108
+
109
+ printLOG("Writing PLINK PED file ... \n");
110
+ string ped_file = output_file_prefix + ".ped";
111
+ string map_file = output_file_prefix + ".map";
112
+
113
+ vector<ofstream *> tmp_files(N_indv);
114
+ vector<string> tmp_filenames(N_indv);
115
+ for (unsigned int ui=0; ui<N_indv; ui++)
116
+ {
117
+ if (include_indv[ui] == false)
118
+ continue;
119
+ string filename(tmpnam(NULL));
120
+ ofstream *tmp_file = new ofstream(filename.c_str());
121
+ if (!tmp_file->good())
122
+ error("\n\nCould not open temporary file.\n\n"
123
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
124
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n"
125
+ "Alternatively, try the --plink-tped command.", 12);
126
+ (*tmp_file) << indv[ui] << "\t" << indv[ui] << "\t" << 0 << "\t" << 0 << "\t" << 0 << "\t" << 0;
127
+ tmp_files[ui] = tmp_file;
128
+ tmp_filenames[ui] = filename;
129
+ }
130
+
131
+ vector<string> alleles;
132
+ char phase;
133
+ pair<int, int> genotype;
134
+ string vcf_line;
135
+ vcf_entry e(N_indv);
136
+ ofstream *tmp_file;
137
+ for (unsigned int s=0; s<N_entries; s++)
138
+ {
139
+ if (include_entry[s] == false)
140
+ continue;
141
+
142
+ get_vcf_entry(s, vcf_line);
143
+ e.reset(vcf_line);
144
+ e.parse_basic_entry(true);
145
+
146
+ if (e.get_N_alleles() > 2)
147
+ {
148
+ one_off_warning("\tPLINK: Only outputting biallelic loci.");
149
+ continue;
150
+ }
151
+
152
+ e.get_alleles_vector(alleles);
153
+
154
+ for (unsigned int ui=0; ui<N_indv; ui++)
155
+ {
156
+ if (include_indv[ui] == false)
157
+ continue;
158
+
159
+ tmp_file = tmp_files[ui];
160
+
161
+ genotype = make_pair(-1,-1);
162
+ phase = '/';
163
+ if (include_genotype[s][ui] == true)
164
+ {
165
+ e.parse_genotype_entry(ui, true);
166
+ e.get_indv_GENOTYPE_ids(ui, genotype);
167
+ phase = e.get_indv_PHASE(ui);
168
+ }
169
+
170
+ if (genotype.first == -1)
171
+ (*tmp_file) << "\t0";
172
+ else
173
+ (*tmp_file) << "\t" << alleles[genotype.first];
174
+
175
+ if (genotype.second == -1)
176
+ {
177
+ if (phase == '/')
178
+ (*tmp_file) << "\t0";
179
+ else if (genotype.first != -1)
180
+ (*tmp_file) << "\t" << alleles[genotype.first]; // Male X-chr, Y-chr etc
181
+ else
182
+ (*tmp_file) << "\t0";
183
+ }
184
+ else
185
+ (*tmp_file) << "\t" << alleles[genotype.second];
186
+ }
187
+ }
188
+
189
+ ofstream PED(ped_file.c_str());
190
+ if (!PED.is_open()) error("Could not open output file: " + ped_file, 12);
191
+ string tmp_line;
192
+ for (unsigned int ui=0; ui<N_indv; ui++)
193
+ {
194
+ if (include_indv[ui] == false)
195
+ continue;
196
+
197
+ ofstream *tmp_file = tmp_files[ui];
198
+ (*tmp_file) << endl;
199
+ tmp_file->close();
200
+
201
+ ifstream read_file(tmp_filenames[ui].c_str());
202
+ if (!read_file.good())
203
+ error("\n\nCould not open temporary file.\n\n"
204
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
205
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n"
206
+ "Alternatively, try the --plink-tped command.", 12);
207
+ getline(read_file, tmp_line);
208
+ PED << tmp_line << endl;
209
+ read_file.close();
210
+ remove(tmp_filenames[ui].c_str());
211
+ }
212
+ PED.close();
213
+
214
+ printLOG("Writing PLINK MAP file ... ");
215
+ ofstream MAP(map_file.c_str());
216
+ if (!MAP.is_open()) error("Could not open output file: " + map_file, 12);
217
+ int POS; string ID;
218
+ for (unsigned int s=0; s<N_entries; s++)
219
+ {
220
+ if (include_entry[s] == false)
221
+ continue;
222
+
223
+ get_vcf_entry(s, vcf_line);
224
+ e.reset(vcf_line);
225
+ e.parse_basic_entry(true);
226
+ if (e.get_N_alleles() > 2)
227
+ continue;
228
+ POS = e.get_POS();
229
+ ID = e.get_ID();
230
+ if (ID == ".")
231
+ MAP << e.get_CHROM() << "\t" << POS << "\t0\t" << POS << endl;
232
+ else
233
+ MAP << e.get_CHROM() << "\t" << ID << "\t0\t" << POS << endl;
234
+ }
235
+
236
+ MAP.close();
237
+ printLOG("Done.\n");
238
+ }
239
+
240
+ // Output as Plink Transposed file
241
+ void vcf_file::output_as_plink_tped(const string &output_file_prefix)
242
+ {
243
+ // Output as PLINK formatted PED/MAP files.
244
+ if (has_genotypes == false)
245
+ error("Require Genotypes in VCF file in order to output as PLINK TPED.");
246
+
247
+ printLOG("Writing PLINK TPED file ... ");
248
+ string tped_file = output_file_prefix + ".tped";
249
+ string tfam_file = output_file_prefix + ".tfam";
250
+
251
+ ofstream TPED(tped_file.c_str());
252
+ if (!TPED.is_open()) error("Could not open output file: " + tped_file, 12);
253
+
254
+ vector<string> alleles;
255
+ char phase;
256
+ pair<int, int> genotype;
257
+ string vcf_line;
258
+ vcf_entry e(N_indv);
259
+ for (unsigned int s=0; s<N_entries; s++)
260
+ {
261
+ if (include_entry[s] == false)
262
+ continue;
263
+
264
+ get_vcf_entry(s, vcf_line);
265
+ e.reset(vcf_line);
266
+ e.parse_basic_entry(true);
267
+
268
+ if (e.get_N_alleles() > 2) // Only output sites with at most one alternative allele
269
+ {
270
+ one_off_warning("\tPLINK-TPED: Only outputting biallelic loci.");
271
+ continue;
272
+ }
273
+
274
+ if (e.get_ID() == ".")
275
+ TPED << e.get_CHROM() << "\t" << e.get_POS() << "\t0\t" << e.get_POS();
276
+ else
277
+ TPED << e.get_CHROM() << "\t" << e.get_ID() << "\t0\t" << e.get_POS();
278
+
279
+ e.get_alleles_vector(alleles);
280
+
281
+ for (unsigned int ui=0; ui<N_indv; ui++)
282
+ {
283
+ if (include_indv[ui] == false)
284
+ continue;
285
+
286
+ genotype = make_pair(-1,-1);
287
+ phase = '/';
288
+ if (include_genotype[s][ui] == true)
289
+ {
290
+ e.parse_genotype_entry(ui, true);
291
+ e.get_indv_GENOTYPE_ids(ui, genotype);
292
+ phase = e.get_indv_PHASE(ui);
293
+ }
294
+
295
+ if (genotype.first == -1)
296
+ TPED << "\t0";
297
+ else
298
+ TPED << "\t" << alleles[genotype.first];
299
+
300
+ if (genotype.second == -1)
301
+ {
302
+ if (phase == '/')
303
+ TPED << "\t0";
304
+ else if (genotype.first != -1)
305
+ TPED << "\t" << alleles[genotype.first]; // Male X-chr, Y-chr etc
306
+ else
307
+ TPED << "\t0";
308
+ }
309
+ else
310
+ TPED << "\t" << alleles[genotype.second];
311
+ }
312
+ TPED << endl;
313
+ }
314
+
315
+ TPED.close();
316
+
317
+ printLOG("Writing PLINK TFAM file ... ");
318
+ ofstream TFAM(tfam_file.c_str());
319
+ if (!TFAM.is_open()) error("Could not open output file: " + tfam_file, 12);
320
+ for (unsigned int ui=0; ui<N_indv; ui++)
321
+ {
322
+ if (include_indv[ui] == false)
323
+ continue;
324
+ TFAM << indv[ui] << "\t" << indv[ui] << "\t" << 0 << "\t" << 0 << "\t" << 0 << "\t" << 0 << endl;
325
+ }
326
+
327
+ TFAM.close();
328
+ printLOG("Done.\n");
329
+ }
330
+
331
+ /*
332
+ // Output as a simple 0/1/2 matrix
333
+ void vcf_file::output_as_012_matrix(const string &output_file_prefix)
334
+ {
335
+ if (has_genotypes == false)
336
+ error("Require Genotypes in VCF file in order to output as 0/1/2 matrix.");
337
+
338
+ printLOG("Writing 012 matrix file ... ");
339
+ string ped_file = output_file_prefix + ".012";
340
+ string map_file = output_file_prefix + ".012.pos";
341
+ string fam_file = output_file_prefix + ".012.indv";
342
+
343
+ ofstream PED(ped_file.c_str());
344
+ if (!PED.is_open()) error("Could not open output file: " + ped_file, 12);
345
+ string allele1, allele2;
346
+
347
+ ofstream FAM(fam_file.c_str());
348
+ if (!FAM.is_open()) error("Could not open output file: " + fam_file, 12);
349
+
350
+ pair<int, int> genotype;
351
+ string vcf_line;
352
+ vcf_entry e(N_indv);
353
+ for (unsigned int ui=0; ui<N_indv; ui++)
354
+ {
355
+ if (include_indv[ui] == false)
356
+ continue;
357
+ FAM << indv[ui] << endl;
358
+ PED << ui;
359
+ //uk = 2*ui;
360
+ for (unsigned int s=0; s<N_entries; s++)
361
+ {
362
+ if (include_entry[s] == false)
363
+ continue;
364
+
365
+ get_vcf_entry(s, vcf_line);
366
+ e.reset(vcf_line);
367
+ e.parse_basic_entry(true);
368
+
369
+ if (e.get_N_alleles() <= 2) // Only output sites with one alternative allele
370
+ {
371
+ genotype = make_pair(-1,-1);
372
+ if (include_genotype[s][ui] == true)
373
+ {
374
+ e.parse_genotype_entry(ui, true);
375
+ e.get_indv_GENOTYPE_ids(ui, genotype);
376
+ }
377
+
378
+ if ((genotype.first == -1) && (genotype.second == -1))
379
+ PED << "\t-1"; // Missing data
380
+ else if ((genotype.first == 0) && (genotype.second == 0))
381
+ PED << "\t0"; // No copies of the alternative allele
382
+ else
383
+ {
384
+ if ((genotype.first == 1) && (genotype.second == 1))
385
+ PED << "\t2"; // Two copies of the alternative allele
386
+ else
387
+ PED << "\t1"; // Must be one copy of the alternative allele.
388
+ }
389
+ }
390
+ }
391
+ PED << endl;
392
+ }
393
+
394
+ FAM.close();
395
+ PED.close();
396
+
397
+ ofstream MAP(map_file.c_str());
398
+ if (!MAP.is_open()) error("Could not open output file: " + map_file, 12);
399
+ for (unsigned int s=0; s<N_entries; s++)
400
+ {
401
+ if (include_entry[s] == false)
402
+ continue;
403
+
404
+ get_vcf_entry(s, vcf_line);
405
+ e.reset(vcf_line);
406
+ e.parse_basic_entry(true);
407
+ if (e.get_N_alleles() <= 2) // Only output sites with one alternative allele
408
+ {
409
+ MAP << e.get_CHROM() << "\t" << e.get_POS() << endl;
410
+ }
411
+ }
412
+
413
+ MAP.close();
414
+ printLOG("Done.\n");
415
+ }
416
+ */
417
+
418
+ void vcf_file::output_as_012_matrix(const string &output_file_prefix)
419
+ {
420
+ // Output as PLINK formatted PED/MAP files.
421
+ if (has_genotypes == false)
422
+ error("Require Genotypes in VCF file in order to output as 0/1/2 matrix.");
423
+
424
+ printLOG("Writing 012 matrix file ... ");
425
+ string ped_file = output_file_prefix + ".012";
426
+ string map_file = output_file_prefix + ".012.pos";
427
+ string fam_file = output_file_prefix + ".012.indv";
428
+
429
+ ofstream FAM(fam_file.c_str());
430
+ if (!FAM.is_open()) error("Could not open output file: " + fam_file, 12);
431
+
432
+ vector<ofstream *> tmp_files(N_indv);
433
+ vector<string> tmp_filenames(N_indv);
434
+ for (unsigned int ui=0; ui<N_indv; ui++)
435
+ {
436
+ if (include_indv[ui] == false)
437
+ continue;
438
+ FAM << indv[ui] << endl;
439
+ string filename(tmpnam(NULL));
440
+ ofstream *tmp_file = new ofstream(filename.c_str());
441
+ if (!tmp_file->good())
442
+ error("\n\nCould not open temporary file.\n\n"
443
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
444
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n", 12);
445
+ (*tmp_file) << ui;
446
+ tmp_files[ui] = tmp_file;
447
+ tmp_filenames[ui] = filename;
448
+ }
449
+
450
+ FAM.close();
451
+
452
+ vector<string> alleles;
453
+ char phase;
454
+ pair<int, int> genotype;
455
+ string vcf_line;
456
+ vcf_entry e(N_indv);
457
+ ofstream *tmp_file;
458
+ for (unsigned int s=0; s<N_entries; s++)
459
+ {
460
+ if (include_entry[s] == false)
461
+ continue;
462
+
463
+ get_vcf_entry(s, vcf_line);
464
+ e.reset(vcf_line);
465
+ e.parse_basic_entry(true);
466
+
467
+ if (e.get_N_alleles() > 2)
468
+ {
469
+ one_off_warning("\t012: Only outputting biallelic loci.");
470
+ continue;
471
+ }
472
+
473
+ e.get_alleles_vector(alleles);
474
+
475
+ for (unsigned int ui=0; ui<N_indv; ui++)
476
+ {
477
+ if (include_indv[ui] == false)
478
+ continue;
479
+
480
+ tmp_file = tmp_files[ui];
481
+
482
+ genotype = make_pair(-1,-1);
483
+ phase = '/';
484
+ if (include_genotype[s][ui] == true)
485
+ {
486
+ e.parse_genotype_entry(ui, true);
487
+ e.get_indv_GENOTYPE_ids(ui, genotype);
488
+ phase = e.get_indv_PHASE(ui);
489
+ }
490
+
491
+ if ((genotype.first == -1) && (genotype.second == -1))
492
+ (*tmp_file) << "\t-1"; // Missing data
493
+ else if ((genotype.first == 0) && (genotype.second == 0))
494
+ (*tmp_file) << "\t0"; // No copies of the alternative allele
495
+ else
496
+ {
497
+ if ((genotype.first == 1) && (genotype.second == 1))
498
+ (*tmp_file) << "\t2"; // Two copies of the alternative allele
499
+ else
500
+ (*tmp_file) << "\t1"; // Must be one copy of the alternative allele.
501
+ }
502
+ }
503
+ }
504
+
505
+ ofstream PED(ped_file.c_str());
506
+ if (!PED.is_open()) error("Could not open output file: " + ped_file, 12);
507
+ string tmp_line;
508
+ for (unsigned int ui=0; ui<N_indv; ui++)
509
+ {
510
+ if (include_indv[ui] == false)
511
+ continue;
512
+
513
+ ofstream *tmp_file = tmp_files[ui];
514
+ (*tmp_file) << endl;
515
+ tmp_file->close();
516
+
517
+ ifstream read_file(tmp_filenames[ui].c_str());
518
+ if (!read_file.good())
519
+ error("\n\nCould not open temporary file.\n\n"
520
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
521
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n", 12);
522
+ getline(read_file, tmp_line);
523
+ PED << tmp_line << endl;
524
+ read_file.close();
525
+ remove(tmp_filenames[ui].c_str());
526
+ }
527
+ PED.close();
528
+
529
+ printLOG("Writing 012 positions file ... ");
530
+ ofstream MAP(map_file.c_str());
531
+ if (!MAP.is_open()) error("Could not open output file: " + map_file, 12);
532
+ for (unsigned int s=0; s<N_entries; s++)
533
+ {
534
+ if (include_entry[s] == false)
535
+ continue;
536
+
537
+ get_vcf_entry(s, vcf_line);
538
+ e.reset(vcf_line);
539
+ e.parse_basic_entry(true);
540
+ if (e.get_N_alleles() <= 2) // Only output sites with one alternative allele
541
+ {
542
+ MAP << e.get_CHROM() << "\t" << e.get_POS() << endl;
543
+ }
544
+ }
545
+
546
+ MAP.close();
547
+ printLOG("Done.\n");
548
+ }
549
+
550
+ // Output as IMPUTE format
551
+ void vcf_file::output_as_IMPUTE(const string &output_file_prefix)
552
+ {
553
+ if (has_genotypes == false)
554
+ error("Require Genotypes in VCF file in order to output IMPUTE format.");
555
+
556
+ printLOG("Outputting in IMPUTE format (bi-allelic, completely phased SNPs only)\n");
557
+ unsigned int s, ui;
558
+ string legend_file = output_file_prefix + ".impute.legend";
559
+ string haplotype_file = output_file_prefix + ".impute.hap";
560
+ string indv_file = output_file_prefix + ".impute.hap.indv";
561
+ ofstream legend(legend_file.c_str());
562
+ if (!legend.is_open())
563
+ error("Could not open IMPUTE Legend Output File: " + legend_file, 2);
564
+ legend << "ID pos allele0 allele1" << endl;
565
+
566
+ ofstream hap(haplotype_file.c_str());
567
+ if (!hap.is_open())
568
+ error("Could not open IMPUTE Haplotype Output File: " + haplotype_file, 2);
569
+
570
+ ofstream indv_out(indv_file.c_str());
571
+ if (!indv_out.is_open())
572
+ error("Could not open IMPUTE Individual Output File: " + indv_file, 2);
573
+
574
+ for (ui=0; ui<N_indv; ui++)
575
+ {
576
+ if (include_indv[ui] == false)
577
+ continue;
578
+ indv_out << indv[ui] << endl;
579
+ }
580
+ indv_out.close();
581
+
582
+ pair<int, int> alleles;
583
+ string vcf_line;
584
+ vcf_entry e(N_indv);
585
+ for (s=0; s<N_entries; s++)
586
+ {
587
+ if (include_entry[s] == false)
588
+ continue;
589
+
590
+ get_vcf_entry(s, vcf_line);
591
+ e.reset(vcf_line);
592
+ e.parse_basic_entry(true);
593
+
594
+ if (e.get_N_alleles() > 2)
595
+ {
596
+ one_off_warning("\tIMPUTE: Only outputting biallelic loci.");
597
+ continue;
598
+ }
599
+
600
+ // Exclude entries with missing data and/or unphased
601
+ bool missing = false;
602
+ for (ui=0; ui<N_indv; ui++)
603
+ {
604
+ if (include_indv[ui] == false)
605
+ continue;
606
+
607
+ if (include_genotype[s][ui] == false)
608
+ {
609
+ missing = true;
610
+ break;
611
+ }
612
+
613
+ e.parse_genotype_entry(ui, true);
614
+ e.get_indv_GENOTYPE_ids(ui, alleles);
615
+ if ((alleles.first == -1) || (alleles.second == -1))
616
+ {
617
+ missing = true;
618
+ break;
619
+ }
620
+
621
+ if (e.get_indv_PHASE(ui) != '|')
622
+ {
623
+ missing = true;
624
+ break;
625
+ }
626
+ }
627
+ if (missing == true)
628
+ continue;
629
+
630
+ if (e.get_ID() == ".")
631
+ {
632
+ legend << e.get_CHROM() << "-" << e.get_POS() << " " << e.get_POS() << " " << e.get_REF() << " " << e.get_ALT_allele(0) << endl;
633
+ }
634
+ else
635
+ legend << e.get_ID() << " " << e.get_POS() << " " << e.get_REF() << " " << e.get_ALT_allele(0) << endl;
636
+
637
+ bool first = true;
638
+ for (ui=0; ui<N_indv; ui++)
639
+ {
640
+ if (include_indv[ui] == false)
641
+ continue;
642
+
643
+ e.parse_genotype_entry(ui, true);
644
+ e.get_indv_GENOTYPE_ids(ui, alleles);
645
+ if (first == true)
646
+ {
647
+ hap << alleles.first << " " << alleles.second;
648
+ first = false;
649
+ }
650
+ else
651
+ hap << " " << alleles.first << " " << alleles.second;
652
+ }
653
+ hap << endl;
654
+ }
655
+
656
+ hap.close();
657
+ legend.close();
658
+ }
659
+
660
+ void vcf_file::output_LDhat_locs_file(const string &output_file_prefix, const string &chr, unsigned int &n_sites_out)
661
+ {
662
+ string locs_file = output_file_prefix + ".ldhat.locs";
663
+ ofstream locs(locs_file.c_str());
664
+ if (!locs.is_open())
665
+ error("Could not open LDhat locs Output File: " + locs_file, 2);
666
+
667
+ int max_pos = -1;
668
+ unsigned int n_sites=0;
669
+
670
+ vcf_entry e(N_indv);
671
+ string vcf_line;
672
+ string chrom;
673
+ for (unsigned int s=0; s<N_entries; s++)
674
+ {
675
+ if (include_entry[s] == false)
676
+ continue;
677
+
678
+ get_vcf_entry(s, vcf_line);
679
+ e.reset(vcf_line);
680
+ e.parse_basic_entry(true);
681
+
682
+ if (e.get_N_alleles() != 2)
683
+ {
684
+ continue;
685
+ }
686
+
687
+ e.get_CHROM(chrom);
688
+ if (chrom != chr)
689
+ error("Mismatching chromosome in LDhat loci", 13);
690
+
691
+ max_pos = max(e.get_POS(), max_pos);
692
+ n_sites++;
693
+ }
694
+
695
+ locs << n_sites;
696
+ locs.setf(ios::fixed,ios::floatfield);
697
+ locs.precision(4);
698
+ locs << "\t" << max_pos / 1000.0 << "\tL" << endl;
699
+ for (unsigned int s=0; s<N_entries; s++)
700
+ {
701
+ if (include_entry[s] == false)
702
+ continue;
703
+
704
+ get_vcf_entry(s, vcf_line);
705
+ e.reset(vcf_line);
706
+ e.parse_basic_entry(true);
707
+
708
+ if (e.get_N_alleles() != 2)
709
+ {
710
+ one_off_warning("\tLDhat: Only outputting biallelic loci.");
711
+ continue;
712
+ }
713
+
714
+ locs << e.get_POS() / 1000.0 << endl;
715
+ }
716
+ locs.close();
717
+
718
+ n_sites_out = n_sites;
719
+ }
720
+
721
+ void vcf_file::output_as_LDhat_phased(const string &output_file_prefix, const string &chr)
722
+ {
723
+ if (has_genotypes == false)
724
+ error("Require Genotypes in VCF file in order to output LDhat format.");
725
+
726
+ printLOG("Outputting in phased LDhat format\n");
727
+ if (chr == "")
728
+ error("Require chromosome for LDhat output", 10);
729
+
730
+ unsigned int n_sites;
731
+ output_LDhat_locs_file(output_file_prefix, chr, n_sites);
732
+
733
+ string sites_file = output_file_prefix + ".ldhat.sites";
734
+ ofstream sites(sites_file.c_str());
735
+ if (!sites.is_open())
736
+ error("Could not open LDhat sites Output File: " + sites_file, 2);
737
+
738
+ unsigned int n_indv = N_kept_individuals();
739
+ pair<int, int> alleles;
740
+
741
+ sites << n_indv*2 << "\t" << n_sites << "\t1" << endl; // Note - this is incorrect for the X-chr.
742
+
743
+ vector<ofstream *> tmp_files(2*N_indv);
744
+ vector<string> tmp_filenames(2*N_indv);
745
+ for (unsigned int ui=0; ui<N_indv; ui++)
746
+ {
747
+ if (include_indv[ui] == false)
748
+ continue;
749
+ string filename(tmpnam(NULL));
750
+ ofstream *tmp_file = new ofstream(filename.c_str());
751
+ if (!tmp_file->good())
752
+ error("Could not open temp file.\n", 12);
753
+ tmp_files[2*ui] = tmp_file;
754
+ tmp_filenames[2*ui] = filename;
755
+
756
+ string filename2(tmpnam(NULL));
757
+ ofstream *tmp_file2 = new ofstream(filename2.c_str());
758
+ if (!tmp_file2->good())
759
+ error("\n\nCould not open temporary file.\n\n"
760
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
761
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n", 12);
762
+ tmp_files[2*ui+1] = tmp_file2;
763
+ tmp_filenames[2*ui+1] = filename2;
764
+ }
765
+
766
+ string vcf_line;
767
+ vcf_entry e(N_indv);
768
+ ofstream *tmp_file;
769
+
770
+ for (unsigned int s=0; s<N_entries; s++)
771
+ {
772
+ if (include_entry[s] == false)
773
+ continue;
774
+
775
+ get_vcf_entry(s, vcf_line);
776
+ e.reset(vcf_line);
777
+ e.parse_basic_entry(true);
778
+
779
+ if (e.get_N_alleles() != 2)
780
+ {
781
+ one_off_warning("\tLDhat: Only outputting biallelic loci.");
782
+ continue;
783
+ }
784
+
785
+ for (unsigned int ui=0; ui<N_indv; ui++)
786
+ {
787
+ if (include_indv[ui] == false)
788
+ continue;
789
+
790
+ e.parse_genotype_entry(ui, true);
791
+ e.get_indv_GENOTYPE_ids(ui, alleles);
792
+
793
+ for (unsigned int k=0; k<2; k++)
794
+ {
795
+ tmp_file = tmp_files[(2*ui)+k];
796
+
797
+ int geno;
798
+ if (k == 0)
799
+ geno = alleles.first;
800
+ else
801
+ geno = alleles.second;
802
+
803
+ if ((geno != -1) && (include_genotype[s][ui]==true))
804
+ (*tmp_file) << geno;
805
+ else
806
+ (*tmp_file) << "?";
807
+ }
808
+ }
809
+ }
810
+
811
+ string tmp_line;
812
+ for (unsigned int ui=0; ui<N_indv; ui++)
813
+ {
814
+ if (include_indv[ui] == false)
815
+ continue;
816
+
817
+ for (unsigned int k=0; k<2; k++)
818
+ {
819
+ ofstream *tmp_file = tmp_files[2*ui+k];
820
+ (*tmp_file) << endl;
821
+ tmp_file->close();
822
+
823
+ ifstream read_file(tmp_filenames[2*ui+k].c_str());
824
+ if (!read_file.good())
825
+ error("\n\nCould not open temporary file.\n\n"
826
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
827
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n", 12);
828
+ getline(read_file, tmp_line);
829
+ sites << ">" << indv[ui] << "-" << k << endl;
830
+ sites << tmp_line << endl;
831
+ read_file.close();
832
+ remove(tmp_filenames[2*ui+k].c_str());
833
+ }
834
+ }
835
+
836
+ sites.close();
837
+ }
838
+
839
+ void vcf_file::output_as_LDhat_unphased(const string &output_file_prefix, const string &chr)
840
+ {
841
+ if (has_genotypes == false)
842
+ error("Require Genotypes in VCF file in order to output LDhat format.");
843
+
844
+ printLOG("Outputting in unphased LDhat format\n");
845
+ if (chr == "")
846
+ error("Require chromosome for LDhat output", 10);
847
+
848
+ unsigned int n_sites;
849
+ output_LDhat_locs_file(output_file_prefix, chr, n_sites);
850
+
851
+ string sites_file = output_file_prefix + ".ldhat.sites";
852
+ ofstream sites(sites_file.c_str());
853
+ if (!sites.is_open())
854
+ error("Could not open LDhat sites Output File: " + sites_file, 2);
855
+
856
+ unsigned int n_indv = N_kept_individuals();
857
+ pair<int, int> alleles;
858
+
859
+ sites << n_indv << "\t" << n_sites << "\t2" << endl;
860
+
861
+ vector<ofstream *> tmp_files(N_indv);
862
+ vector<string> tmp_filenames(N_indv);
863
+ for (unsigned int ui=0; ui<N_indv; ui++)
864
+ {
865
+ if (include_indv[ui] == false)
866
+ continue;
867
+ string filename(tmpnam(NULL));
868
+ ofstream *tmp_file = new ofstream(filename.c_str());
869
+ if (!tmp_file->good())
870
+ error("\n\nCould not open temporary file.\n\n"
871
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
872
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n", 12);
873
+ tmp_files[ui] = tmp_file;
874
+ tmp_filenames[ui] = filename;
875
+ }
876
+
877
+ string vcf_line;
878
+ vcf_entry e(N_indv);
879
+ ofstream *tmp_file;
880
+ for (unsigned int s=0; s<N_entries; s++)
881
+ {
882
+ if (include_entry[s] == false)
883
+ continue;
884
+
885
+ get_vcf_entry(s, vcf_line);
886
+ e.reset(vcf_line);
887
+ e.parse_basic_entry(true);
888
+
889
+ if (e.get_N_alleles() != 2)
890
+ {
891
+ one_off_warning("\tLDhat: Only outputting biallelic loci.");
892
+ continue;
893
+ }
894
+
895
+ for (unsigned int ui=0; ui<N_indv; ui++)
896
+ {
897
+ if (include_indv[ui] == false)
898
+ continue;
899
+
900
+ tmp_file = tmp_files[ui];
901
+
902
+ if (include_genotype[s][ui] == false)
903
+ (*tmp_file) << "?";
904
+ else
905
+ {
906
+ e.parse_genotype_entry(ui, true);
907
+ e.get_indv_GENOTYPE_ids(ui, alleles);
908
+
909
+ switch (alleles.first)
910
+ {
911
+ case -1:
912
+ (*tmp_file) << "?"; break;
913
+ case 0:
914
+ if (alleles.second == 0)
915
+ (*tmp_file) << 0;
916
+ else if (alleles.second == 1)
917
+ (*tmp_file) << 2;
918
+ else if ((alleles.second == -1) && (e.get_indv_PHASE(ui) == '|'))
919
+ (*tmp_file) << 0; // Haploid case
920
+ else
921
+ (*tmp_file) << '?';
922
+ break;
923
+ case 1:
924
+ if (alleles.second == 0)
925
+ (*tmp_file) << 2;
926
+ else if (alleles.second == 1)
927
+ (*tmp_file) << 1;
928
+ else if ((alleles.second == -1) && (e.get_indv_PHASE(ui) == '|'))
929
+ (*tmp_file) << 1; // Haploid case
930
+ else
931
+ (*tmp_file) << '?';
932
+ break;
933
+ default:
934
+ (*tmp_file) << '?';
935
+ }
936
+ }
937
+ }
938
+ }
939
+
940
+ string tmp_line;
941
+ for (unsigned int ui=0; ui<N_indv; ui++)
942
+ {
943
+ if (include_indv[ui] == false)
944
+ continue;
945
+
946
+ ofstream *tmp_file = tmp_files[ui];
947
+ (*tmp_file) << endl;
948
+ tmp_file->close();
949
+
950
+ ifstream read_file(tmp_filenames[ui].c_str());
951
+ if (!read_file.good())
952
+ error("\n\nCould not open temporary file.\n\n"
953
+ "Most likely this is because the system is not allowing me to open enough temporary files.\n"
954
+ "Try using ulimit -n <int> to increase the number of allowed open files.\n", 12);
955
+ getline(read_file, tmp_line);
956
+ sites << ">" << indv[ui] << endl;
957
+ sites << tmp_line << endl;
958
+ read_file.close();
959
+ remove(tmp_filenames[ui].c_str());
960
+ }
961
+
962
+ sites.close();
963
+ }
964
+
965
+ // Output INFO fields in tab-delimited format
966
+ void vcf_file::output_INFO_for_each_site(const string &output_file_prefix, const vector<string> &INFO_to_extract)
967
+ {
968
+ if (INFO_to_extract.size() == 0)
969
+ return;
970
+
971
+ printLOG("Outputting INFO for each site\n");
972
+ string output = output_file_prefix + ".INFO";
973
+ ofstream out(output.c_str());
974
+ if (!out.is_open())
975
+ error("Could not open INFO Output File: " + output, 3);
976
+
977
+ out << "CHROM\tPOS\tREF\tALT";
978
+ for (unsigned int ui=0; ui<INFO_to_extract.size(); ui++)
979
+ out << "\t" << INFO_to_extract[ui];
980
+ out << endl;
981
+
982
+ string vcf_line;
983
+ vcf_entry e(N_indv);
984
+ for (unsigned int s=0; s<N_entries; s++)
985
+ {
986
+ if (include_entry[s] == false)
987
+ continue;
988
+
989
+ get_vcf_entry(s, vcf_line);
990
+ e.reset(vcf_line);
991
+ e.parse_basic_entry(true, false, true);
992
+
993
+ out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e.get_REF() << "\t" << e.get_ALT();
994
+
995
+ for (unsigned int ui=0; ui<INFO_to_extract.size(); ui++)
996
+ {
997
+ out << "\t" << e.get_INFO_value(INFO_to_extract[ui]);
998
+ }
999
+ out << endl;
1000
+ }
1001
+
1002
+ out.close();
1003
+ }
1004
+
1005
+
1006
+ // Output FORMAT information in tab-delimited format.
1007
+ void vcf_file::output_FORMAT_information(const string &output_file_prefix, const string &FORMAT_id)
1008
+ {
1009
+ if (FORMAT_id == "")
1010
+ return;
1011
+
1012
+ if (has_genotypes == false)
1013
+ error("Require Genotypes in VCF file in order to output FORMAT information.");
1014
+
1015
+ printLOG("Outputting FORMAT information for " + FORMAT_id + "\n");
1016
+ string output = output_file_prefix + "." + FORMAT_id + ".FORMAT";
1017
+ ofstream out(output.c_str());
1018
+ if (!out.is_open())
1019
+ error("Could not open FORMAT Output File: " + output, 7);
1020
+
1021
+ out << "CHROM\tPOS";
1022
+ for (unsigned int ui=0; ui<N_indv; ui++)
1023
+ {
1024
+ if (include_indv[ui] == true)
1025
+ out << "\t" << indv[ui];
1026
+ }
1027
+ out << endl;
1028
+
1029
+ string vcf_line, FORMAT_out;
1030
+ vcf_entry e(N_indv);
1031
+ for (unsigned int s=0; s<N_entries; s++)
1032
+ {
1033
+ if (include_entry[s] == false)
1034
+ continue;
1035
+
1036
+ get_vcf_entry(s, vcf_line);
1037
+ e.reset(vcf_line);
1038
+ e.parse_basic_entry();
1039
+ e.parse_full_entry(true);
1040
+
1041
+ if (e.FORMAT_id_exists(FORMAT_id) == false)
1042
+ continue;
1043
+
1044
+ out << e.get_CHROM() << "\t" << e.get_POS();
1045
+
1046
+ for (unsigned int ui=0; ui<N_indv; ui++)
1047
+ {
1048
+ if (include_indv[ui] == false)
1049
+ continue;
1050
+
1051
+ e.read_indv_generic_entry(ui, FORMAT_id, FORMAT_out);
1052
+ out << "\t" << FORMAT_out;
1053
+ }
1054
+ out << endl;
1055
+ }
1056
+ out.close();
1057
+ }
1058
+
1059
+ // Output genotype likelihoods from GL FORMAT tag, ready for input into BEAGLE
1060
+ // using the Genotype likelihoods file format.
1061
+ void vcf_file::output_BEAGLE_genotype_likelihoods(const string &output_file_prefix)
1062
+ {
1063
+ if (has_genotypes == false)
1064
+ error("Require Genotypes in VCF file in order to output BEAGLE genotype likelihoods.");
1065
+
1066
+ printLOG("Outputting in BEAGLE Genotype Likelihood format (bi-allelic SNPs with GL tags only)\n");
1067
+
1068
+ string output = output_file_prefix + ".BEAGLE.GL";
1069
+ ofstream out(output.c_str());
1070
+ if (!out.is_open())
1071
+ error("Could not open BEAGLE GL Output File: " + output, 3);
1072
+ out << "marker\talleleA\talleleB";
1073
+ for (unsigned int ui=0; ui<N_indv; ui++)
1074
+ {
1075
+ if (include_indv[ui] == true)
1076
+ out << "\t" << indv[ui] << "\t" << indv[ui] << "\t" << indv[ui];
1077
+ }
1078
+ out << endl;
1079
+
1080
+ string vcf_line, GL_entry, tmp_string;
1081
+ vcf_entry e(N_indv);
1082
+ double lk1, lk2, lk3;
1083
+ bool found_GL=false;
1084
+ istringstream ss;
1085
+
1086
+ for (unsigned int s=0; s<N_entries; s++)
1087
+ {
1088
+ if (include_entry[s] == false)
1089
+ continue;
1090
+
1091
+ get_vcf_entry(s, vcf_line);
1092
+ e.reset(vcf_line);
1093
+ e.parse_basic_entry(true);
1094
+
1095
+ if (e.get_N_alleles() != 2)
1096
+ {
1097
+ one_off_warning("\tBEAGLE: Only outputting biallelic loci.");
1098
+ continue;
1099
+ }
1100
+
1101
+ e.parse_full_entry(true);
1102
+
1103
+ if (e.FORMAT_id_exists("GL") == false)
1104
+ continue;
1105
+ found_GL = true;
1106
+
1107
+ out << e.get_CHROM() << ":" << e.get_POS() << "\t" << e.get_REF() << "\t" << e.get_ALT();
1108
+
1109
+ for (unsigned int ui=0; ui<N_indv; ui++)
1110
+ {
1111
+ if (include_indv[ui] == false)
1112
+ continue;
1113
+
1114
+ if (include_genotype[s][ui] == true)
1115
+ {
1116
+ e.read_indv_generic_entry(ui, "GL", GL_entry);
1117
+ ss.clear();
1118
+ ss.str(GL_entry);
1119
+ getline(ss, tmp_string, ',');
1120
+ lk1 = atof(tmp_string.c_str());
1121
+ getline(ss, tmp_string, ',');
1122
+ lk2 = atof(tmp_string.c_str());
1123
+ getline(ss, tmp_string);
1124
+ lk3 = atof(tmp_string.c_str());
1125
+ out << "\t" << pow(10,lk1) << "\t" << pow(10,lk2) << "\t" << pow(10,lk3);
1126
+ }
1127
+ else
1128
+ {
1129
+ out << "\t1\t1\t1"; // Mark as unknown
1130
+ }
1131
+ }
1132
+ out << endl;
1133
+ }
1134
+
1135
+ if (found_GL == false)
1136
+ error("Require GL FORMAT tags in VCF file to output BEAGLE input.");
1137
+ }
1138
+