ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1282 @@
1
+ /*
2
+ * vcf_file_diff.cpp
3
+ *
4
+ * Created on: Oct 30, 2009
5
+ * Author: Adam Auton
6
+ * ($Revision: 230 $)
7
+ */
8
+
9
+ #include "vcf_file.h"
10
+
11
+ void vcf_file::return_site_union(vcf_file &file2, map<pair<string, int>, pair<int, int> > &CHROMPOS_to_filepos_pair)
12
+ {
13
+ unsigned int s;
14
+ int POS;
15
+ string CHROM;
16
+ string vcf_line;
17
+ for (s=0; s<N_entries; s++)
18
+ {
19
+ if (include_entry[s] == true)
20
+ {
21
+ get_vcf_entry(s, vcf_line);
22
+ vcf_entry e(N_indv, vcf_line);
23
+ e.parse_basic_entry();
24
+
25
+ CHROM = e.get_CHROM();
26
+ POS = e.get_POS();
27
+
28
+ CHROMPOS_to_filepos_pair[make_pair<string,int>(CHROM, POS)] = make_pair<int,int>(s, -1);
29
+ }
30
+ }
31
+ for (s=0; s<file2.N_entries; s++)
32
+ {
33
+ if (file2.include_entry[s] == true)
34
+ {
35
+ file2.get_vcf_entry(s, vcf_line);
36
+ vcf_entry e(file2.N_indv, vcf_line);
37
+ e.parse_basic_entry();
38
+
39
+ CHROM = e.get_CHROM();
40
+ POS = e.get_POS();
41
+
42
+ if (CHROMPOS_to_filepos_pair.find(make_pair<string,int>(CHROM, POS)) != CHROMPOS_to_filepos_pair.end())
43
+ {
44
+ CHROMPOS_to_filepos_pair[make_pair<string,int>(CHROM, POS)].second = s;
45
+ }
46
+ else
47
+ {
48
+ CHROMPOS_to_filepos_pair[make_pair<string,int>(CHROM, POS)] = make_pair<int,int>(-1, s);
49
+ }
50
+ }
51
+ }
52
+ }
53
+
54
+
55
+ void vcf_file::return_indv_union(vcf_file &file2, map<string, pair< int, int> > &combined_individuals)
56
+ {
57
+ for (unsigned int ui=0; ui<N_indv; ui++)
58
+ if (include_indv[ui] == true)
59
+ combined_individuals[indv[ui]] = make_pair<int,int>(ui, -1);
60
+
61
+ for (unsigned int ui=0; ui<file2.N_indv; ui++)
62
+ if (file2.include_indv[ui] == true)
63
+ {
64
+ if (combined_individuals.find(file2.indv[ui]) != combined_individuals.end())
65
+ combined_individuals[file2.indv[ui]].second = ui;
66
+ else
67
+ combined_individuals[file2.indv[ui]] = make_pair<int,int>(-1, ui);
68
+ }
69
+ }
70
+
71
+ void vcf_file::output_sites_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file)
72
+ {
73
+ printLOG("Comparing sites in VCF files...\n");
74
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
75
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
76
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
77
+
78
+ string vcf_line;
79
+ string CHROM;
80
+ int POS;
81
+
82
+ string output_file = output_file_prefix + ".diff.sites_in_files";
83
+ ofstream sites_in_files(output_file.c_str());
84
+ sites_in_files << "CHROM\tPOS\tIN_FILE\tREF\tALT1\tALT2" << endl;
85
+
86
+ int s1, s2;
87
+ int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
88
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it!=CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
89
+ {
90
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
91
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
92
+
93
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
94
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
95
+
96
+ vcf_entry e1(N_indv);
97
+ vcf_entry e2(diff_vcf_file.N_indv);
98
+
99
+ // Read entries from file (if available)
100
+ if (s1 != -1)
101
+ {
102
+ get_vcf_entry(s1, vcf_line);
103
+ e1.reset(vcf_line);
104
+ }
105
+
106
+ if (s2 != -1)
107
+ {
108
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
109
+ e2.reset(vcf_line);
110
+ }
111
+
112
+ e1.parse_basic_entry(true);
113
+ e2.parse_basic_entry(true);
114
+
115
+ // Set the reference to the non-missing entry (if available)
116
+ string REF = e1.get_REF();
117
+ string REF2 = e2.get_REF();
118
+ if ((REF == "N") || (REF == "."))
119
+ REF = REF2;
120
+ if ((REF2 == "N") || (REF2 == "."))
121
+ REF2 = REF;
122
+
123
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N") && (REF != ".") && (REF2 != "."))
124
+ warning("Non-matching REF at " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2 + ". Diff results may be unreliable.");
125
+
126
+ sites_in_files << CHROM << "\t" << POS << "\t";
127
+ if ((s1 != -1) && (s2 != -1))
128
+ {
129
+ N_common_SNPs++;
130
+ sites_in_files << "B";
131
+ }
132
+ else if ((s1 != -1) && (s2 == -1))
133
+ {
134
+ N_SNPs_file1_only++;
135
+ sites_in_files << "1";
136
+ }
137
+ else if ((s1 == -1) && (s2 != -1))
138
+ {
139
+ N_SNPs_file2_only++;
140
+ sites_in_files << "2";
141
+ }
142
+ else
143
+ error("SNP in neither file!?");
144
+
145
+ sites_in_files << "\t" << REF << "\t" << e1.get_ALT() << "\t" << e2.get_ALT() << endl;
146
+ }
147
+
148
+ sites_in_files.close();
149
+
150
+ printLOG("Found " + int2str(N_common_SNPs) + " SNPs common to both files.\n");
151
+ printLOG("Found " + int2str(N_SNPs_file1_only) + " SNPs only in main file.\n");
152
+ printLOG("Found " + int2str(N_SNPs_file2_only) + " SNPs only in second file.\n");
153
+ }
154
+
155
+ void vcf_file::output_indv_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file)
156
+ {
157
+ printLOG("Comparing individuals in VCF files...\n");
158
+
159
+ string output_file = output_file_prefix + ".diff.indv_in_files";
160
+
161
+ ofstream out(output_file.c_str());
162
+ if (!out.is_open())
163
+ error("Could not open Indv Differences File: " + output_file, 3);
164
+ out << "INDV\tFILES" << endl;
165
+
166
+ // Build a list of individuals contained in each file
167
+ map<string, pair< int, int> > combined_individuals;
168
+ map<string, pair< int, int> >::iterator combined_individuals_it;
169
+ return_indv_union(diff_vcf_file, combined_individuals);
170
+
171
+ unsigned int N_combined_indv = combined_individuals.size();
172
+ unsigned int N[3]={0,0,0};
173
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
174
+ {
175
+ if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1))
176
+ {
177
+ N[0]++;
178
+ out << combined_individuals_it->first << "\tB" << endl;
179
+ }
180
+ else if (combined_individuals_it->second.first != -1)
181
+ {
182
+ N[1]++;
183
+ out << combined_individuals_it->first << "\t1" << endl;
184
+ }
185
+ else if (combined_individuals_it->second.second != -1)
186
+ {
187
+ N[2]++;
188
+ out << combined_individuals_it->first << "\t2" << endl;
189
+ }
190
+ else
191
+ error("Unhandled case");
192
+ }
193
+ out.close();
194
+
195
+ printLOG("N_combined_individuals:\t" + int2str(N_combined_indv) + "\n");
196
+ printLOG("N_individuals_common_to_both_files:\t" + int2str(N[0]) + "\n");
197
+ printLOG("N_individuals_unique_to_file1:\t" + int2str(N[1]) + "\n");
198
+ printLOG("N_individuals_unique_to_file2:\t" + int2str(N[2]) + "\n");
199
+ }
200
+
201
+ void vcf_file::output_discordance_by_indv(const string &output_file_prefix, vcf_file &diff_vcf_file)
202
+ {
203
+ printLOG("Outputting Discordance By Individual...\n");
204
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
205
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
206
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
207
+
208
+ map<string, pair< int, int> > combined_individuals;
209
+ map<string, pair< int, int> >::iterator combined_individuals_it;
210
+ return_indv_union(diff_vcf_file, combined_individuals);
211
+
212
+ map<string, pair<int, int> > indv_sums;
213
+
214
+ string vcf_line, CHROM;
215
+ int POS;
216
+ int s1, s2, indv1, indv2;
217
+
218
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
219
+ {
220
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
221
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
222
+
223
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
224
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
225
+
226
+ vcf_entry e1(N_indv);
227
+ vcf_entry e2(diff_vcf_file.N_indv);
228
+
229
+ // Read entries from file (if available)
230
+ if (s1 != -1)
231
+ {
232
+ get_vcf_entry(s1, vcf_line);
233
+ e1.reset(vcf_line);
234
+ }
235
+
236
+ if (s2 != -1)
237
+ {
238
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
239
+ e2.reset(vcf_line);
240
+ }
241
+
242
+ e1.parse_basic_entry(true);
243
+ e2.parse_basic_entry(true);
244
+
245
+ // Set the reference to the non-missing entry (if available)
246
+ string REF = e1.get_REF();
247
+ string REF2 = e2.get_REF();
248
+ if (REF == "N")
249
+ REF = REF2;
250
+ if (REF2 == "N")
251
+ REF2 = REF;
252
+
253
+ if (REF.size() != REF2.size())
254
+ {
255
+ warning("REF sequences at " + CHROM + ":" + int2str(POS) + " are not comparable. Skipping site");
256
+ continue;
257
+ }
258
+
259
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N"))
260
+ warning("Non-matching REF " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2);
261
+
262
+ // Do the alternative alleles match?
263
+ string ALT, ALT2;
264
+ ALT = e1.get_ALT();
265
+ ALT2 = e2.get_ALT();
266
+
267
+ bool alleles_match = (ALT == ALT2) && (REF == REF2);
268
+ e1.parse_full_entry(true);
269
+ e1.parse_genotype_entries(true);
270
+
271
+ e2.parse_full_entry(true);
272
+ e2.parse_genotype_entries(true);
273
+
274
+ pair<string, string> genotype1, genotype2;
275
+ pair<int,int> geno_ids1, geno_ids2;
276
+ pair<string, string> missing_genotype(".",".");
277
+ pair<int, int> missing_id(-1,-1);
278
+
279
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
280
+ {
281
+ indv1 = combined_individuals_it->second.first;
282
+ indv2 = combined_individuals_it->second.second;
283
+
284
+ if ((indv1 == -1) || (indv2 == -1))
285
+ continue; // Individual not found in one of the files
286
+
287
+ if (alleles_match)
288
+ { // Alleles match, so can compare ids instead of strings
289
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
290
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
291
+
292
+ if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
293
+ {
294
+ indv_sums[combined_individuals_it->first].first++;
295
+ if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
296
+ ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
297
+ { // Match
298
+ // Don't do anything
299
+ }
300
+ else
301
+ { // Mismatch
302
+ indv_sums[combined_individuals_it->first].second++;
303
+ }
304
+ }
305
+ else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
306
+ { // Both missing
307
+ // Don't do anything.
308
+ }
309
+ else if (geno_ids1 != missing_id)
310
+ { // Genotype 1 is not missing, genotype 2 is.
311
+ // Don't do anything.
312
+ }
313
+ else if (geno_ids2 != missing_id)
314
+ { // Genotype 2 is not missing, genotype 1 is.
315
+ // Don't do anything.
316
+ }
317
+ else
318
+ error("Unknown condition");
319
+ }
320
+ else
321
+ { // Alleles don't match, so need to be more careful and compare strings
322
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
323
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
324
+
325
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
326
+ { // No missing data
327
+ indv_sums[combined_individuals_it->first].first++;
328
+ if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
329
+ ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
330
+ { // Match
331
+ // Don't do anything
332
+ }
333
+ else
334
+ { // Mismatch
335
+ indv_sums[combined_individuals_it->first].second++;
336
+ }
337
+ }
338
+ else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
339
+ { // Both missing
340
+ // Don't do anything
341
+ }
342
+ else if (genotype1 != missing_genotype)
343
+ { // Genotype 1 is not missing, genotype 2 is.
344
+ // Don't do anything
345
+ }
346
+ else if (genotype2 != missing_genotype)
347
+ { // Genotype 2 is not missing, genotype 1 is.
348
+ // Don't do anything
349
+ }
350
+ else
351
+ error("Unknown condition");
352
+ }
353
+ }
354
+ }
355
+
356
+ string output_file = output_file_prefix + ".diff.indv";
357
+ ofstream out(output_file.c_str());
358
+ if (!out.is_open())
359
+ error("Could not open Sites Differences File: " + output_file, 3);
360
+ out << "INDV\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
361
+
362
+ int N, N_discord;
363
+ double discordance;
364
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
365
+ {
366
+ out << combined_individuals_it->first;
367
+ N = indv_sums[combined_individuals_it->first].first;
368
+ N_discord = indv_sums[combined_individuals_it->first].second;
369
+ discordance = N_discord / double(N);
370
+ out << "\t" << N << "\t" << N_discord << "\t" << discordance << endl;
371
+ }
372
+
373
+ out.close();
374
+ }
375
+
376
+ void vcf_file::output_discordance_by_site(const string &output_file_prefix, vcf_file &diff_vcf_file)
377
+ {
378
+ printLOG("Outputting Discordance By Site...\n");
379
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
380
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
381
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
382
+
383
+ map<string, pair< int, int> > combined_individuals;
384
+ map<string, pair< int, int> >::iterator combined_individuals_it;
385
+ return_indv_union(diff_vcf_file, combined_individuals);
386
+
387
+ string CHROM, vcf_line;
388
+ int POS;
389
+ int s1, s2, indv1, indv2;
390
+
391
+ string output_file = output_file_prefix + ".diff.sites";
392
+ ofstream diffsites(output_file.c_str());
393
+ if (!diffsites.is_open())
394
+ error("Could not open Sites Differences File: " + output_file, 3);
395
+ //diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALT\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE\tN_FILE1_NONREF_GENOTYPES\tNON_REF_DISCORDANCE" << endl;
396
+ diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALLELES\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
397
+
398
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
399
+ {
400
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
401
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
402
+
403
+ diffsites << CHROM << "\t" << POS;
404
+
405
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
406
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
407
+
408
+ vcf_entry e1(N_indv);
409
+ vcf_entry e2(diff_vcf_file.N_indv);
410
+
411
+ bool data_in_both = true;
412
+ // Read entries from file (if available)
413
+ if (s1 != -1)
414
+ {
415
+ get_vcf_entry(s1, vcf_line);
416
+ e1.reset(vcf_line);
417
+ }
418
+ else
419
+ data_in_both = false;
420
+
421
+ if (s2 != -1)
422
+ {
423
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
424
+ e2.reset(vcf_line);
425
+ }
426
+ else
427
+ data_in_both = false;
428
+
429
+ if (data_in_both)
430
+ diffsites << "\tB";
431
+ else if ((s1 != -1) && (s2 == -1))
432
+ diffsites << "\t1";
433
+ else if ((s1 == -1) && (s2 != -1))
434
+ diffsites << "\t2";
435
+ else
436
+ error("Unhandled condition");
437
+
438
+ e1.parse_basic_entry(true);
439
+ e2.parse_basic_entry(true);
440
+
441
+ // Set the reference to the non-missing entry (if available)
442
+ string REF = e1.get_REF();
443
+ string REF2 = e2.get_REF();
444
+ if (REF == "N")
445
+ REF = REF2;
446
+ if (REF2 == "N")
447
+ REF2 = REF;
448
+
449
+ if (REF.size() != REF2.size())
450
+ {
451
+ warning("REF sequences at " + CHROM + ":" + int2str(POS) + " are not comparable. Skipping site");
452
+ continue;
453
+ }
454
+
455
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N"))
456
+ warning("Non-matching REF " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2);
457
+
458
+ // Do the alternative alleles match?
459
+ string ALT, ALT2;
460
+ ALT = e1.get_ALT();
461
+ ALT2 = e2.get_ALT();
462
+
463
+ bool alleles_match = ((ALT == ALT2) && (REF == REF2));
464
+ diffsites << "\t" << alleles_match;
465
+
466
+ e1.parse_full_entry(true);
467
+ e1.parse_genotype_entries(true);
468
+
469
+ e2.parse_full_entry(true);
470
+ e2.parse_genotype_entries(true);
471
+
472
+ pair<string, string> genotype1, genotype2;
473
+ pair<int,int> geno_ids1, geno_ids2;
474
+ pair<string, string> missing_genotype(".",".");
475
+ pair<int, int> missing_id(-1,-1);
476
+
477
+ unsigned int N_common_called=0; // Number of genotypes called in both files
478
+ unsigned int N_missing_1=0, N_missing_2=0;
479
+ unsigned int N_discord=0;
480
+ unsigned int N_concord_non_missing=0;
481
+
482
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
483
+ {
484
+ indv1 = combined_individuals_it->second.first;
485
+ indv2 = combined_individuals_it->second.second;
486
+
487
+ if ((indv1 == -1) || (indv2 == -1))
488
+ continue; // Individual not found in one of the files
489
+
490
+ if (alleles_match)
491
+ { // Alleles match, so can compare ids instead of strings
492
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
493
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
494
+
495
+ if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
496
+ {
497
+ N_common_called++;
498
+ if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
499
+ ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
500
+ { // Match
501
+ N_concord_non_missing++;
502
+ }
503
+ else
504
+ { // Mismatch
505
+ N_discord++;
506
+ }
507
+ }
508
+ else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
509
+ { // Both missing
510
+ N_missing_1++; N_missing_2++;
511
+ }
512
+ else if (geno_ids1 != missing_id)
513
+ { // Genotype 1 is not missing, genotype 2 is.
514
+ N_missing_2++;
515
+ }
516
+ else if (geno_ids2 != missing_id)
517
+ { // Genotype 2 is not missing, genotype 1 is.
518
+ N_missing_1++;
519
+ }
520
+ else
521
+ error("Unknown condition");
522
+ }
523
+ else
524
+ { // Alleles don't match, so need to be more careful and compare strings
525
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
526
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
527
+
528
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
529
+ { // No missing data
530
+ N_common_called++;
531
+ if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
532
+ ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
533
+ { // Match
534
+ N_concord_non_missing++;
535
+ }
536
+ else
537
+ { // Mismatch
538
+ N_discord++;
539
+ }
540
+ }
541
+ else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
542
+ { // Both missing
543
+ N_missing_1++; N_missing_2++;
544
+ }
545
+ else if (genotype1 != missing_genotype)
546
+ { // Genotype 1 is not missing, genotype 2 is.
547
+ N_missing_2++;
548
+ }
549
+ else if (genotype2 != missing_genotype)
550
+ { // Genotype 2 is not missing, genotype 1 is.
551
+ N_missing_1++;
552
+ }
553
+ else
554
+ error("Unknown condition");
555
+ }
556
+ }
557
+ double discordance = N_discord / double(N_common_called);
558
+ diffsites << "\t" << N_common_called << "\t" << N_discord << "\t" << discordance;
559
+ diffsites << endl;
560
+ }
561
+ diffsites.close();
562
+ }
563
+
564
+ void vcf_file::output_discordance_matrix(const string &output_file_prefix, vcf_file &diff_vcf_file) // Writes <output_file_prefix>.diff.discordance_matrix: a 4x4 table of genotype-class counts between this file and diff_vcf_file.
565
+ { // output_file_prefix: prefix of the output path; diff_vcf_file: the second VCF file this one is compared against.
566
+ printLOG("Outputting Discordance Matrix\n\tFor bi-allelic loci, called in both files, with matching alleles only...\n");
567
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair; // key is read below as (CHROM, POS) by sibling code; value is read here as (s1, s2), one entry position per file
568
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
569
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair); // fills the site map; presumably the union of sites in both files - confirm against return_site_union
570
+
571
+ map<string, pair< int, int> > combined_individuals; // value is read below as (indv1, indv2): the individual's index in each file, -1 when skipped
572
+ map<string, pair< int, int> >::iterator combined_individuals_it;
573
+ return_indv_union(diff_vcf_file, combined_individuals);
574
+
575
+ string vcf_line;
576
+ int s1, s2, indv1, indv2;
577
+
578
+ vector<vector<int> > discordance_matrix(4, vector<int>(4, 0)); // indexed [file1 class][file2 class]; class 0..2 = sum of the two allele ids, class 3 = both ids are -1
579
+
580
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
581
+ {
582
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
583
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
584
+
585
+ vcf_entry e1(N_indv);
586
+ vcf_entry e2(diff_vcf_file.N_indv);
587
+
588
+ // Read entries from file (if available); an entry whose position is -1 is left as constructed
589
+ if (s1 != -1)
590
+ {
591
+ get_vcf_entry(s1, vcf_line);
592
+ e1.reset(vcf_line);
593
+ }
594
+
595
+ if (s2 != -1)
596
+ {
597
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
598
+ e2.reset(vcf_line);
599
+ }
600
+
601
+ e1.parse_basic_entry(true);
602
+ e2.parse_basic_entry(true);
603
+
604
+ if ((e1.get_N_alleles() != 2) || (e2.get_N_alleles() != 2)) // only sites where both entries report exactly two alleles are tallied
605
+ continue;
606
+
607
+ // Set the reference to the non-missing entry (if available): a REF of "N" is replaced by the other file's REF
608
+ string REF = e1.get_REF();
609
+ string REF2 = e2.get_REF();
610
+ if (REF == "N")
611
+ REF = REF2;
612
+ if (REF2 == "N")
613
+ REF2 = REF;
614
+
615
+ if (REF.size() != REF2.size()) // REF strings of different length: site is skipped without a warning
616
+ continue;
617
+
618
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N")) // differing, non-"N" REF strings: site is skipped
619
+ continue;
620
+
621
+ // Do the alternative alleles match?
622
+ string ALT, ALT2;
623
+ ALT = e1.get_ALT();
624
+ ALT2 = e2.get_ALT();
625
+
626
+ bool alleles_match = (ALT == ALT2) && (REF == REF2);
627
+ if (alleles_match == false) // both ALT and REF must be identical strings for the site to be counted
628
+ continue;
629
+
630
+ e1.parse_full_entry(true);
631
+ e1.parse_genotype_entries(true);
632
+
633
+ e2.parse_full_entry(true);
634
+ e2.parse_genotype_entries(true);
635
+
636
+ pair<int,int> geno_ids1, geno_ids2;
637
+ int N1, N2; // genotype class of the individual in file 1 / file 2 (see discordance_matrix above)
638
+
639
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
640
+ {
641
+ indv1 = combined_individuals_it->second.first;
642
+ indv2 = combined_individuals_it->second.second;
643
+
644
+ if ((indv1 == -1) || (indv2 == -1))
645
+ continue; // Individual not found in one of the files
646
+
647
+ // Alleles match, so can compare ids instead of strings
648
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
649
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
650
+
651
+ if (((geno_ids1.first != -1) && (geno_ids1.second == -1)) ||
652
+ ((geno_ids2.first != -1) && (geno_ids2.second == -1)))
653
+ { // Haploid: first allele id is set but the second is -1, in either file
654
+ one_off_warning("***Warning: Haploid chromosomes not counted!***");
655
+ continue;
656
+ }
657
+
658
+ N1 = geno_ids1.first + geno_ids1.second; // sum of the two allele ids
659
+ N2 = geno_ids2.first + geno_ids2.second;
660
+
661
+ if ((N1 == -1) || (N1 < -2) || (N1 > 2)) // NOTE(review): ids (-1, 1) sum to 0 and would pass as class 0 - confirm get_indv_GENOTYPE_ids cannot return that
662
+ error("Unhandled case");
663
+ if ((N2 == -1) || (N2 < -2) || (N2 > 2))
664
+ error("Unhandled case");
665
+
666
+ if (N1 == -2) // both ids are -1: remap to the last row/column (index 3)
667
+ N1 = 3;
668
+
669
+ if (N2 == -2)
670
+ N2 = 3;
671
+
672
+ discordance_matrix[N1][N2]++;
673
+ }
674
+ }
675
+
676
+ string output_file = output_file_prefix + ".diff.discordance_matrix";
677
+ ofstream out(output_file.c_str());
678
+ if (!out.is_open())
679
+ error("Could not open Discordance Matrix File: " + output_file, 3);
680
+
681
+ out << "-\tN_0/0_file1\tN_0/1_file1\tN_1/1_file1\tN_./._file1" << endl; // columns are file-1 classes, rows are file-2 classes, so the matrix is written as [column][row]
682
+ out << "N_0/0_file2\t" << discordance_matrix[0][0] << "\t" << discordance_matrix[1][0] << "\t" << discordance_matrix[2][0] << "\t" << discordance_matrix[3][0] << endl;
683
+ out << "N_0/1_file2\t" << discordance_matrix[0][1] << "\t" << discordance_matrix[1][1] << "\t" << discordance_matrix[2][1] << "\t" << discordance_matrix[3][1] << endl;
684
+ out << "N_1/1_file2\t" << discordance_matrix[0][2] << "\t" << discordance_matrix[1][2] << "\t" << discordance_matrix[2][2] << "\t" << discordance_matrix[3][2] << endl;
685
+ out << "N_./._file2\t" << discordance_matrix[0][3] << "\t" << discordance_matrix[1][3] << "\t" << discordance_matrix[2][3] << "\t" << discordance_matrix[3][3] << endl;
686
+ out.close();
687
+ }
688
+
689
+ void vcf_file::output_switch_error(const string &output_file_prefix, vcf_file &diff_vcf_file) // Writes <output_file_prefix>.diff.switch (one CHROM/POS/INDV row per detected phase switch) and <output_file_prefix>.diff.indv.switch (per-individual totals and rate).
690
+ { // output_file_prefix: prefix of both output paths; diff_vcf_file: the second VCF file this one is compared against.
691
+ printLOG("Outputting Phase Switch Errors...\n");
692
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair; // key is read below as (CHROM, POS); value as (s1, s2), one entry position per file
693
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
694
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
695
+
696
+ map<string, pair< int, int> > combined_individuals; // value is read below as (indv1, indv2): the individual's index in each file, -1 when absent
697
+ map<string, pair< int, int> >::iterator combined_individuals_it;
698
+ return_indv_union(diff_vcf_file, combined_individuals);
699
+
700
+ string CHROM, vcf_line;
701
+ int POS;
702
+ int s1, s2, indv1, indv2;
703
+
704
+ string output_file = output_file_prefix + ".diff.switch";
705
+ ofstream switcherror(output_file.c_str());
706
+ if (!switcherror.is_open())
707
+ error("Could not open Switch Error file: " + output_file, 4);
708
+ switcherror << "CHROM\tPOS\tINDV" << endl;
709
+
710
+ unsigned int N_combined_indv = combined_individuals.size();
711
+ vector<int> N_phased_het_sites(N_combined_indv, 0); // per individual, indexed by position in combined_individuals iteration order
712
+ vector<int> N_switch_errors(N_combined_indv, 0); // same indexing as N_phased_het_sites
713
+
714
+ pair<string, string> missing_genotype(".",".");
715
+ vector<pair<string, string> > prev_geno_file1(N_combined_indv, missing_genotype); // genotype at the individual's previous counted phased het site; starts as (".", ".")
716
+ vector<pair<string, string> > prev_geno_file2(N_combined_indv, missing_genotype);
717
+ pair<string, string> file1_hap1, file1_hap2, file2_hap1; // each holds (allele at previous site, allele at current site) along one haplotype
718
+
719
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
720
+ { // NOTE(review): prev_geno_file1/2 are never reset inside this loop, so the first phased het site of a chromosome is compared against the previous chromosome's last one - confirm this is intended
721
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
722
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
723
+
724
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
725
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
726
+
727
+ vcf_entry e1(N_indv);
728
+ vcf_entry e2(diff_vcf_file.N_indv);
729
+
730
+ // Read entries from file (if available); an entry whose position is -1 is left as constructed
731
+ if (s1 != -1)
732
+ {
733
+ get_vcf_entry(s1, vcf_line);
734
+ e1.reset(vcf_line);
735
+ }
736
+
737
+ if (s2 != -1)
738
+ {
739
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
740
+ e2.reset(vcf_line);
741
+ }
742
+
743
+ e1.parse_basic_entry(true);
744
+ e2.parse_basic_entry(true);
745
+
746
+ e1.parse_full_entry(true);
747
+ e1.parse_genotype_entries(true);
748
+
749
+ e2.parse_full_entry(true);
750
+ e2.parse_genotype_entries(true);
751
+
752
+ pair<string, string> genotype1, genotype2;
753
+ pair<string, string> missing_genotype(".","."); // shadows the function-scope missing_genotype with the identical value
754
+
755
+ unsigned int N_common_called=0; // Number of genotypes called in both files (incremented below but never read)
756
+ unsigned int indv_count=0;
757
+
758
+ // Bug fix applied (#3354189) - July 5th 2011: indv_count advances in the loop header, so a skipped individual still consumes its slot
759
+ for (combined_individuals_it=combined_individuals.begin();
760
+ combined_individuals_it!=combined_individuals.end();
761
+ ++combined_individuals_it, indv_count++)
762
+ {
763
+ indv1 = combined_individuals_it->second.first;
764
+ indv2 = combined_individuals_it->second.second;
765
+
766
+ if ((indv1 == -1) || (indv2 == -1))
767
+ continue; // Individual not found in one of the files
768
+
769
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
770
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
771
+
772
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
773
+ { // No missing data
774
+ N_common_called++;
775
+ if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
776
+ ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
777
+ { // Have matching genotypes in files 1 and 2 (allele order ignored)
778
+ if (genotype1.first != genotype1.second)
779
+ { // It's a heterozygote
780
+ char phase1, phase2;
781
+ phase1 = e1.get_indv_PHASE(indv1);
782
+ phase2 = e2.get_indv_PHASE(indv2);
783
+ if ((phase1 == '|') && (phase2 == '|'))
784
+ { // Calculate Phasing error (switch error)
785
+ N_phased_het_sites[indv_count]++;
786
+ file1_hap1 = pair<string,string>(prev_geno_file1[indv_count].first, genotype1.first); // direct construction: make_pair<string,string>(lvalue, lvalue) is ill-formed from C++11 on
787
+ file1_hap2 = pair<string,string>(prev_geno_file1[indv_count].second, genotype1.second);
788
+ file2_hap1 = pair<string,string>(prev_geno_file2[indv_count].first, genotype2.first);
789
+
790
+ if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2))
791
+ { // Must be a switch error: file 2's first haplotype matches neither file-1 haplotype
792
+ string indv_id;
793
+ N_switch_errors[indv_count]++;
794
+ if (indv1 != -1) // always true here: individuals with indv1 == -1 were skipped above
795
+ indv_id = indv[indv1];
796
+ else
797
+ indv_id = diff_vcf_file.indv[indv2];
798
+ switcherror << CHROM << "\t" << POS << "\t" << indv_id << endl;
799
+ }
800
+ prev_geno_file1[indv_count] = genotype1; // remember this site as the reference for the next phased het site
801
+ prev_geno_file2[indv_count] = genotype2;
802
+ }
803
+ }
804
+ }
805
+ }
806
+ }
807
+ }
808
+ switcherror.close();
809
+
810
+ output_file = output_file_prefix + ".diff.indv.switch";
811
+ ofstream idiscord(output_file.c_str());
812
+ if (!idiscord.is_open())
813
+ error("Could not open Individual Discordance File: " + output_file, 3);
814
+
815
+ idiscord << "INDV\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl;
816
+ unsigned int indv_count=0; // walks the counters in the same map order used while tallying
817
+ double switch_error;
818
+ string indv_id;
819
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
820
+ {
821
+ indv1 = combined_individuals_it->second.first;
822
+ indv2 = combined_individuals_it->second.second;
823
+
824
+ if (indv1 != -1) // name the individual from this file when present, otherwise from diff_vcf_file
825
+ indv_id = indv[indv1];
826
+ else
827
+ indv_id = diff_vcf_file.indv[indv2];
828
+
829
+ if (N_phased_het_sites[indv_count] > 0)
830
+ switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count];
831
+ else
832
+ switch_error = 0; // no phased het sites: report a rate of 0 instead of dividing by zero
833
+ idiscord << indv_id << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl;
834
+
835
+ indv_count++;
836
+ }
837
+ idiscord.close();
838
+ }
839
+
840
+ /*
841
+ void vcf_file::output_concensus_statistics(const string &output_file_prefix, vcf_file &diff_vcf_file)
842
+ {
843
+ printLOG("Outputting Consensus Statistics... \n");
844
+ unsigned int ui;
845
+
846
+ string output_file;
847
+ string vcf_line;
848
+
849
+ // Build a list of individuals contained in each file
850
+ map<string, pair< int, int> > combined_individuals;
851
+ map<string, pair< int, int> >::iterator combined_individuals_it;
852
+ for (ui=0; ui<N_indv; ui++)
853
+ if (include_indv[ui] == true)
854
+ combined_individuals[indv[ui]] = make_pair<int,int>(ui, -1);
855
+
856
+ for (ui=0; ui<diff_vcf_file.N_indv; ui++)
857
+ if (diff_vcf_file.include_indv[ui] == true)
858
+ {
859
+ if (combined_individuals.find(diff_vcf_file.indv[ui]) != combined_individuals.end())
860
+ combined_individuals[diff_vcf_file.indv[ui]].second = ui;
861
+ else
862
+ combined_individuals[diff_vcf_file.indv[ui]] = make_pair<int,int>(-1, ui);
863
+ }
864
+
865
+ unsigned int N_combined_indv = combined_individuals.size();
866
+ unsigned int N[3]={0,0,0};
867
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
868
+ {
869
+ if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1))
870
+ N[0]++;
871
+ else if (combined_individuals_it->second.first != -1)
872
+ N[1]++;
873
+ else
874
+ N[2]++;
875
+ }
876
+
877
+ vector<int> indv_N_discord(N_combined_indv, 0);
878
+ vector<int> indv_N_called_sites(N_combined_indv, 0);
879
+
880
+ printLOG("N_combined_individuals:\t" + int2str(N_combined_indv) + "\n");
881
+ printLOG("N_individuals_common_to_both_files:\t" + int2str(N[0]) + "\n");
882
+ printLOG("N_individuals_unique_to_file1:\t" + int2str(N[1]) + "\n");
883
+ printLOG("N_individuals_unique_to_file2:\t" + int2str(N[2]) + "\n");
884
+
885
+ // Build a table of included entries in both files
886
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
887
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
888
+ string CHROM;
889
+ int POS;
890
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
891
+
892
+ output_file = output_file_prefix + ".diff.sites_in_files";
893
+ ofstream sites_in_files(output_file.c_str());
894
+ sites_in_files << "CHROM\tPOS\tIN_FILE" << endl;
895
+
896
+ int s1, s2;
897
+ int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
898
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it!=CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
899
+ {
900
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
901
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
902
+
903
+ sites_in_files << CHROMPOS_to_filepos_pair_it->first.first << "\t" << CHROMPOS_to_filepos_pair_it->first.second << "\t";
904
+ if ((s1 != -1) && (s2 != -1))
905
+ {
906
+ N_common_SNPs++;
907
+ sites_in_files << "B" << endl;
908
+ }
909
+ else if ((s1 != -1) && (s2 == -1))
910
+ {
911
+ N_SNPs_file1_only++;
912
+ sites_in_files << "1" << endl;
913
+ }
914
+ else if ((s1 == -1) && (s2 != -1))
915
+ {
916
+ N_SNPs_file2_only++;
917
+ sites_in_files << "2" << endl;
918
+ }
919
+ else
920
+ error("SNP in neither file!?");
921
+ }
922
+
923
+ sites_in_files.close();
924
+
925
+ printLOG("Found " + int2str(N_common_SNPs) + " SNPs common to both files.\n");
926
+ printLOG("Found " + int2str(N_SNPs_file1_only) + " SNPs only in main file.\n");
927
+ printLOG("Found " + int2str(N_SNPs_file2_only) + " SNPs only in second file.\n");
928
+
929
+ output_file = output_file_prefix + ".diff.sites";
930
+ ofstream diffsites(output_file.c_str());
931
+ if (!diffsites.is_open())
932
+ error("Could not open Sites Differences File: " + output_file, 3);
933
+ diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALT\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE\tN_FILE1_NONREF_GENOTYPES\tNON_REF_DISCORDANCE" << endl;
934
+
935
+ output_file = output_file_prefix + ".diff.switch";
936
+ ofstream switcherror(output_file.c_str());
937
+ if (!switcherror.is_open())
938
+ error("Could not open Switch Error file: " + output_file, 4);
939
+ switcherror << "CHROM\tPOS\tINDV" << endl;
940
+
941
+ // Now try and merge the entries.
942
+ unsigned int N_common_genotypes = 0;
943
+ unsigned int N_common_discordant_genotypes = 0;
944
+ unsigned int N_sites_with_mismatching_ALT = 0;
945
+ unsigned int N_non_ref_genotypes = 0;
946
+ unsigned int N_discordant_non_ref_genotypes = 0;
947
+
948
+ pair<string, string> genotype1, genotype2;
949
+ pair<int,int> geno_ids1, geno_ids2;
950
+ pair<string, string> missing_genotype(".",".");
951
+ pair<int, int> missing_HQUAL(0,0);
952
+
953
+ pair<int, int> homo_ref(0, 0);
954
+ vector<pair<string, string> > prev_geno_file1(N_combined_indv, missing_genotype);
955
+ vector<pair<string, string> > prev_geno_file2(N_combined_indv, missing_genotype);
956
+ pair<string, string> file1_hap1, file1_hap2, file2_hap1;
957
+
958
+ vector<int> N_phased_het_sites(N_combined_indv, 0);
959
+ vector<int> N_switch_errors(N_combined_indv, 0);
960
+ vector<pair<int,int> > indv_depth_at_common_sites(N_combined_indv, make_pair(0,0));
961
+ vector<pair<int,int> > indv_count_at_common_sites(N_combined_indv, make_pair(0,0));
962
+
963
+ vector<vector<int> > genotype_concord_matrix(4, vector<int>(4, 0));
964
+
965
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
966
+ {
967
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
968
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
969
+
970
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
971
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
972
+
973
+ vcf_entry e1(N_indv);
974
+ vcf_entry e2(diff_vcf_file.N_indv);
975
+
976
+ bool data_in_both = true;
977
+ // Read entries from file (if available)
978
+ if (s1 != -1)
979
+ {
980
+ get_vcf_entry(s1, vcf_line);
981
+ e1.reset(vcf_line);
982
+ }
983
+ else
984
+ data_in_both = false;
985
+
986
+ if (s2 != -1)
987
+ {
988
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
989
+ e2.reset(vcf_line);
990
+ }
991
+ else
992
+ data_in_both = false;
993
+
994
+ e1.parse_basic_entry(true, true, true);
995
+ e2.parse_basic_entry(true, true, true);
996
+
997
+ // Set the reference to the non-missing entry (if available)
998
+ string REF = e1.get_REF();
999
+ string REF2 = e2.get_REF();
1000
+ if (REF == "N")
1001
+ REF = REF2;
1002
+ if (REF2 == "N")
1003
+ REF2 = REF;
1004
+
1005
+ if (REF.size() != REF2.size())
1006
+ {
1007
+ warning("REF sequences at " + CHROM + ":" + int2str(POS) + " are not comparable. Skipping site");
1008
+ continue;
1009
+ }
1010
+
1011
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N"))
1012
+ warning("Non-matching REF " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2);
1013
+
1014
+ // Do the alternative alleles match?
1015
+ set<string> ALT1, ALT2;
1016
+ for (ui=0; ui<(e1.get_N_alleles()-1); ui++)
1017
+ ALT1.insert(e1.get_ALT_allele(ui));
1018
+
1019
+ for (ui=0; ui<(e2.get_N_alleles()-1); ui++)
1020
+ ALT2.insert(e2.get_ALT_allele(ui));
1021
+
1022
+ bool matching_ALT=true;
1023
+ if ((data_in_both) && (ALT1 != ALT2) && (ALT1.size() > 0) && (ALT2.size() > 0))
1024
+ {
1025
+ N_sites_with_mismatching_ALT++;
1026
+ matching_ALT = false;
1027
+ }
1028
+
1029
+ if (data_in_both)
1030
+ {
1031
+ diffsites << CHROM << "\t" << POS << "\t";
1032
+ diffsites << "B\t" << matching_ALT << "\t";
1033
+ }
1034
+ else
1035
+ {
1036
+ continue;
1037
+ }
1038
+
1039
+ if (s1 != -1)
1040
+ {
1041
+ e1.parse_full_entry(true);
1042
+ e1.parse_genotype_entries(true, true, true, true, true);
1043
+ }
1044
+
1045
+ if (s2 != -1)
1046
+ {
1047
+ e2.parse_full_entry(true);
1048
+ e2.parse_genotype_entries(true, true, true, true, true);
1049
+ }
1050
+
1051
+ // Now merge the genotypes.
1052
+ unsigned int indv_count=0;
1053
+ int indv1, indv2;
1054
+ unsigned int N_discordant_site_counter=0;
1055
+ unsigned int N_indvs_with_data=0;
1056
+ unsigned int site_N_non_ref_genotypes=0;
1057
+ unsigned int site_N_discordant_non_ref_genotypes = 0;
1058
+ int depth;
1059
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1060
+ {
1061
+ indv1 = combined_individuals_it->second.first;
1062
+ indv2 = combined_individuals_it->second.second;
1063
+
1064
+ if ((indv1 == -1) && (indv2 == -1))
1065
+ { // Genotype is completely missing... should never happen
1066
+ error("Missing genotype!?", 83);
1067
+ }
1068
+ else if ((indv1 == -1) && (indv2 != -1))
1069
+ { // Data is missing from first file, so just use second file.
1070
+
1071
+ }
1072
+ else if ((indv1 != -1) && (indv2 == -1))
1073
+ { // Data is missing from second file, so just use first file.
1074
+
1075
+ }
1076
+ else
1077
+ { // Data from both files, so figure out what to do
1078
+ bool non_ref_genotype = false;
1079
+ if (data_in_both)
1080
+ {
1081
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
1082
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
1083
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
1084
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
1085
+
1086
+ N_common_genotypes++;
1087
+ N_indvs_with_data++;
1088
+
1089
+ if (geno_ids1 != homo_ref)
1090
+ { // First file is not a hom ref
1091
+ N_non_ref_genotypes++;
1092
+ site_N_non_ref_genotypes++;
1093
+ non_ref_genotype = true;
1094
+ }
1095
+
1096
+ depth = e1.get_indv_DEPTH(indv1);
1097
+ if (depth >= 0)
1098
+ {
1099
+ indv_depth_at_common_sites[indv_count].first += depth;
1100
+ indv_count_at_common_sites[indv_count].first++;
1101
+ }
1102
+ depth = e2.get_indv_DEPTH(indv2);
1103
+ if (depth >= 0)
1104
+ {
1105
+ indv_depth_at_common_sites[indv_count].second += depth;
1106
+ indv_count_at_common_sites[indv_count].second++;
1107
+ }
1108
+ }
1109
+
1110
+ if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
1111
+ {
1112
+ genotype_concord_matrix[3][3]++;
1113
+ }
1114
+
1115
+ if ((genotype1 == missing_genotype) && (genotype2 != missing_genotype))
1116
+ { // Missing data, Favour second file
1117
+ if (matching_ALT && (ALT2.size() <= 1))
1118
+ {
1119
+ unsigned int idx2 = geno_ids2.first + geno_ids2.second;
1120
+ genotype_concord_matrix[3][idx2]++;
1121
+ }
1122
+ }
1123
+
1124
+ if ((genotype2 == missing_genotype) && (genotype1 != missing_genotype))
1125
+ { // Favour first file
1126
+ if (matching_ALT && (ALT1.size() <= 1))
1127
+ {
1128
+ unsigned int idx1 = geno_ids1.first + geno_ids1.second;
1129
+ genotype_concord_matrix[idx1][3]++;
1130
+ }
1131
+ }
1132
+
1133
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
1134
+ {
1135
+ if (data_in_both)
1136
+ {
1137
+ if (matching_ALT && (ALT1.size() <= 1) && (ALT2.size() <= 1))
1138
+ {
1139
+ unsigned int idx1 = geno_ids1.first + geno_ids1.second;
1140
+ unsigned int idx2 = geno_ids2.first + geno_ids2.second;
1141
+ genotype_concord_matrix[idx1][idx2]++;
1142
+ }
1143
+
1144
+ indv_N_called_sites[indv_count]++;
1145
+ if (!vcf_entry::genotypes_equal(genotype1, genotype2))
1146
+ {
1147
+ N_common_discordant_genotypes++;
1148
+ N_discordant_site_counter++;
1149
+ indv_N_discord[indv_count]++;
1150
+
1151
+ if (non_ref_genotype)
1152
+ {
1153
+ N_discordant_non_ref_genotypes++;
1154
+ site_N_discordant_non_ref_genotypes++;
1155
+ }
1156
+ }
1157
+ else
1158
+ { // Have a matching genotype in files 1 and 2
1159
+ if (geno_ids1.first != geno_ids1.second)
1160
+ { // It's a heterozygote
1161
+ char phase1, phase2;
1162
+ phase1 = e1.get_indv_PHASE(indv1);
1163
+ phase2 = e2.get_indv_PHASE(indv2);
1164
+ if ((phase1 == '|') && (phase2 == '|'))
1165
+ { // Calculate Phasing error (switch error)
1166
+ N_phased_het_sites[indv_count]++;
1167
+ file1_hap1 = make_pair<string,string>(prev_geno_file1[indv_count].first, genotype1.first);
1168
+ file1_hap2 = make_pair<string,string>(prev_geno_file1[indv_count].second, genotype1.second);
1169
+ file2_hap1 = make_pair<string,string>(prev_geno_file2[indv_count].first, genotype2.first);
1170
+
1171
+ if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2))
1172
+ { // Must be a switch error
1173
+ string indv_id;
1174
+ N_switch_errors[indv_count]++;
1175
+ if (indv1 != -1)
1176
+ indv_id = indv[indv1];
1177
+ else
1178
+ indv_id = diff_vcf_file.indv[indv2];
1179
+ switcherror << CHROM << "\t" << POS << "\t" << indv_id << endl;
1180
+ }
1181
+ prev_geno_file1[indv_count] = genotype1;
1182
+ prev_geno_file2[indv_count] = genotype2;
1183
+ }
1184
+ }
1185
+ }
1186
+ }
1187
+ }
1188
+ }
1189
+
1190
+ indv_count++;
1191
+ }
1192
+ double discordance = 0.0;
1193
+ if (N_indvs_with_data > 0)
1194
+ discordance = double(N_discordant_site_counter) / N_indvs_with_data;
1195
+ double non_ref_discordance = 0.0;
1196
+ if (site_N_non_ref_genotypes > 0)
1197
+ non_ref_discordance = double(site_N_discordant_non_ref_genotypes) / site_N_non_ref_genotypes;
1198
+ diffsites << N_indvs_with_data << "\t" << N_discordant_site_counter << "\t" << discordance;
1199
+ diffsites << "\t" << site_N_non_ref_genotypes << "\t" << non_ref_discordance;
1200
+ diffsites << endl;
1201
+ }
1202
+
1203
+ output_file = output_file_prefix + ".diff.4x4";
1204
+ ofstream four_by_four(output_file.c_str());
1205
+ if (!four_by_four.is_open())
1206
+ error("Could not open 3x3 File: " + output_file, 3);
1207
+
1208
+ four_by_four << "-\tN00_file1\tN01_file1\tN11_file1\tN.._file1" << endl;
1209
+
1210
+ four_by_four << "N00_file2\t" << genotype_concord_matrix[0][0] << "\t" << genotype_concord_matrix[1][0] << "\t" << genotype_concord_matrix[2][0] << "\t" << genotype_concord_matrix[3][0] << endl;
1211
+ four_by_four << "N01_file2\t" << genotype_concord_matrix[0][1] << "\t" << genotype_concord_matrix[1][1] << "\t" << genotype_concord_matrix[2][1] << "\t" << genotype_concord_matrix[3][1] << endl;
1212
+ four_by_four << "N11_file2\t" << genotype_concord_matrix[0][2] << "\t" << genotype_concord_matrix[1][2] << "\t" << genotype_concord_matrix[2][2] << "\t" << genotype_concord_matrix[3][2] << endl;
1213
+ four_by_four << "N.._file2\t" << genotype_concord_matrix[0][3] << "\t" << genotype_concord_matrix[1][3] << "\t" << genotype_concord_matrix[2][3] << "\t" << genotype_concord_matrix[3][3] << endl;
1214
+ four_by_four.close();
1215
+
1216
+
1217
+ output_file = output_file_prefix + ".diff.indv.discord";
1218
+ ofstream idiscord(output_file.c_str());
1219
+ if (!idiscord.is_open())
1220
+ error("Could not open Individual Discordance File: " + output_file, 3);
1221
+
1222
+ idiscord << "INDV\tMEAN_DP_1\tMEAN_DP_2\tN_COMMON_CALLED\tN_DISCORD\tDISCORD\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl;
1223
+ unsigned int indv_count=0;
1224
+ double discordance, switch_error;
1225
+ int indv1, indv2;
1226
+ string indv_id;
1227
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1228
+ {
1229
+ indv1 = combined_individuals_it->second.first;
1230
+ indv2 = combined_individuals_it->second.second;
1231
+
1232
+ if (indv1 != -1)
1233
+ indv_id = indv[indv1];
1234
+ else
1235
+ indv_id = diff_vcf_file.indv[indv2];
1236
+
1237
+ if (indv_N_called_sites[indv_count] > 0)
1238
+ discordance = double(indv_N_discord[indv_count]) / indv_N_called_sites[indv_count];
1239
+ else
1240
+ discordance = 0.0;
1241
+ idiscord << indv_id;
1242
+
1243
+ double mean_depth1 = 0, mean_depth2=0;
1244
+ if (indv_count_at_common_sites[indv_count].first > 0)
1245
+ {
1246
+ mean_depth1 = double(indv_depth_at_common_sites[indv_count].first) / indv_count_at_common_sites[indv_count].first;
1247
+ }
1248
+
1249
+ if (indv_count_at_common_sites[indv_count].second > 0)
1250
+ {
1251
+ mean_depth2 = double(indv_depth_at_common_sites[indv_count].second) / indv_count_at_common_sites[indv_count].second;
1252
+ }
1253
+ idiscord << "\t" << mean_depth1 << "\t" << mean_depth2;
1254
+
1255
+ idiscord << "\t" << indv_N_called_sites[indv_count] << "\t" << indv_N_discord[indv_count] << "\t" << discordance;
1256
+ if (N_phased_het_sites[indv_count] > 0)
1257
+ switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count];
1258
+ else
1259
+ switch_error = 0;
1260
+ idiscord << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl;
1261
+
1262
+ indv_count++;
1263
+ }
1264
+ idiscord.close();
1265
+
1266
+ printLOG("Found " + int2str(N_sites_with_mismatching_ALT) + " sites with mismatching ALT alleles.\n");
1267
+
1268
+ printLOG("Found " + int2str(N_non_ref_genotypes) + " non-reference genotypes called in both files.\n");
1269
+ printLOG("Found " + int2str(N_discordant_non_ref_genotypes) + " discordant non-reference genotypes.\n");
1270
+ double concordance = 1.0 - (double(N_discordant_non_ref_genotypes)) / N_non_ref_genotypes;
1271
+ printLOG("Concordance rate: " + dbl2str_fixed(concordance * 100,2) + "%\n");
1272
+
1273
+ printLOG("Found " + int2str(N_common_genotypes) + " genotypes called in both files.\n");
1274
+ printLOG("Found " + int2str(N_common_discordant_genotypes) + " discordant genotypes.\n");
1275
+ concordance = 1.0 - (double(N_common_discordant_genotypes)) / N_common_genotypes;
1276
+ printLOG("Overall Concordance rate: " + dbl2str_fixed(concordance * 100,2) + "%\n");
1277
+
1278
+ diffsites.close();
1279
+ switcherror.close();
1280
+ printLOG("Done\n");
1281
+ }
1282
+ */