ngs_server 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1282 @@
1
+ /*
2
+ * vcf_file_diff.cpp
3
+ *
4
+ * Created on: Oct 30, 2009
5
+ * Author: Adam Auton
6
+ * ($Revision: 230 $)
7
+ */
8
+
9
+ #include "vcf_file.h"
10
+
11
+ void vcf_file::return_site_union(vcf_file &file2, map<pair<string, int>, pair<int, int> > &CHROMPOS_to_filepos_pair)
12
+ {
13
+ unsigned int s;
14
+ int POS;
15
+ string CHROM;
16
+ string vcf_line;
17
+ for (s=0; s<N_entries; s++)
18
+ {
19
+ if (include_entry[s] == true)
20
+ {
21
+ get_vcf_entry(s, vcf_line);
22
+ vcf_entry e(N_indv, vcf_line);
23
+ e.parse_basic_entry();
24
+
25
+ CHROM = e.get_CHROM();
26
+ POS = e.get_POS();
27
+
28
+ CHROMPOS_to_filepos_pair[make_pair<string,int>(CHROM, POS)] = make_pair<int,int>(s, -1);
29
+ }
30
+ }
31
+ for (s=0; s<file2.N_entries; s++)
32
+ {
33
+ if (file2.include_entry[s] == true)
34
+ {
35
+ file2.get_vcf_entry(s, vcf_line);
36
+ vcf_entry e(file2.N_indv, vcf_line);
37
+ e.parse_basic_entry();
38
+
39
+ CHROM = e.get_CHROM();
40
+ POS = e.get_POS();
41
+
42
+ if (CHROMPOS_to_filepos_pair.find(make_pair<string,int>(CHROM, POS)) != CHROMPOS_to_filepos_pair.end())
43
+ {
44
+ CHROMPOS_to_filepos_pair[make_pair<string,int>(CHROM, POS)].second = s;
45
+ }
46
+ else
47
+ {
48
+ CHROMPOS_to_filepos_pair[make_pair<string,int>(CHROM, POS)] = make_pair<int,int>(-1, s);
49
+ }
50
+ }
51
+ }
52
+ }
53
+
54
+
55
+ void vcf_file::return_indv_union(vcf_file &file2, map<string, pair< int, int> > &combined_individuals)
56
+ {
57
+ for (unsigned int ui=0; ui<N_indv; ui++)
58
+ if (include_indv[ui] == true)
59
+ combined_individuals[indv[ui]] = make_pair<int,int>(ui, -1);
60
+
61
+ for (unsigned int ui=0; ui<file2.N_indv; ui++)
62
+ if (file2.include_indv[ui] == true)
63
+ {
64
+ if (combined_individuals.find(file2.indv[ui]) != combined_individuals.end())
65
+ combined_individuals[file2.indv[ui]].second = ui;
66
+ else
67
+ combined_individuals[file2.indv[ui]] = make_pair<int,int>(-1, ui);
68
+ }
69
+ }
70
+
71
+ void vcf_file::output_sites_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file)
72
+ {
73
+ printLOG("Comparing sites in VCF files...\n");
74
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
75
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
76
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
77
+
78
+ string vcf_line;
79
+ string CHROM;
80
+ int POS;
81
+
82
+ string output_file = output_file_prefix + ".diff.sites_in_files";
83
+ ofstream sites_in_files(output_file.c_str());
84
+ sites_in_files << "CHROM\tPOS\tIN_FILE\tREF\tALT1\tALT2" << endl;
85
+
86
+ int s1, s2;
87
+ int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
88
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it!=CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
89
+ {
90
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
91
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
92
+
93
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
94
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
95
+
96
+ vcf_entry e1(N_indv);
97
+ vcf_entry e2(diff_vcf_file.N_indv);
98
+
99
+ // Read entries from file (if available)
100
+ if (s1 != -1)
101
+ {
102
+ get_vcf_entry(s1, vcf_line);
103
+ e1.reset(vcf_line);
104
+ }
105
+
106
+ if (s2 != -1)
107
+ {
108
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
109
+ e2.reset(vcf_line);
110
+ }
111
+
112
+ e1.parse_basic_entry(true);
113
+ e2.parse_basic_entry(true);
114
+
115
+ // Set the reference to the non-missing entry (if available)
116
+ string REF = e1.get_REF();
117
+ string REF2 = e2.get_REF();
118
+ if ((REF == "N") || (REF == "."))
119
+ REF = REF2;
120
+ if ((REF2 == "N") || (REF2 == "."))
121
+ REF2 = REF;
122
+
123
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N") && (REF != ".") && (REF2 != "."))
124
+ warning("Non-matching REF at " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2 + ". Diff results may be unreliable.");
125
+
126
+ sites_in_files << CHROM << "\t" << POS << "\t";
127
+ if ((s1 != -1) && (s2 != -1))
128
+ {
129
+ N_common_SNPs++;
130
+ sites_in_files << "B";
131
+ }
132
+ else if ((s1 != -1) && (s2 == -1))
133
+ {
134
+ N_SNPs_file1_only++;
135
+ sites_in_files << "1";
136
+ }
137
+ else if ((s1 == -1) && (s2 != -1))
138
+ {
139
+ N_SNPs_file2_only++;
140
+ sites_in_files << "2";
141
+ }
142
+ else
143
+ error("SNP in neither file!?");
144
+
145
+ sites_in_files << "\t" << REF << "\t" << e1.get_ALT() << "\t" << e2.get_ALT() << endl;
146
+ }
147
+
148
+ sites_in_files.close();
149
+
150
+ printLOG("Found " + int2str(N_common_SNPs) + " SNPs common to both files.\n");
151
+ printLOG("Found " + int2str(N_SNPs_file1_only) + " SNPs only in main file.\n");
152
+ printLOG("Found " + int2str(N_SNPs_file2_only) + " SNPs only in second file.\n");
153
+ }
154
+
155
+ void vcf_file::output_indv_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file)
156
+ {
157
+ printLOG("Comparing individuals in VCF files...\n");
158
+
159
+ string output_file = output_file_prefix + ".diff.indv_in_files";
160
+
161
+ ofstream out(output_file.c_str());
162
+ if (!out.is_open())
163
+ error("Could not open Indv Differences File: " + output_file, 3);
164
+ out << "INDV\tFILES" << endl;
165
+
166
+ // Build a list of individuals contained in each file
167
+ map<string, pair< int, int> > combined_individuals;
168
+ map<string, pair< int, int> >::iterator combined_individuals_it;
169
+ return_indv_union(diff_vcf_file, combined_individuals);
170
+
171
+ unsigned int N_combined_indv = combined_individuals.size();
172
+ unsigned int N[3]={0,0,0};
173
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
174
+ {
175
+ if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1))
176
+ {
177
+ N[0]++;
178
+ out << combined_individuals_it->first << "\tB" << endl;
179
+ }
180
+ else if (combined_individuals_it->second.first != -1)
181
+ {
182
+ N[1]++;
183
+ out << combined_individuals_it->first << "\t1" << endl;
184
+ }
185
+ else if (combined_individuals_it->second.second != -1)
186
+ {
187
+ N[2]++;
188
+ out << combined_individuals_it->first << "\t2" << endl;
189
+ }
190
+ else
191
+ error("Unhandled case");
192
+ }
193
+ out.close();
194
+
195
+ printLOG("N_combined_individuals:\t" + int2str(N_combined_indv) + "\n");
196
+ printLOG("N_individuals_common_to_both_files:\t" + int2str(N[0]) + "\n");
197
+ printLOG("N_individuals_unique_to_file1:\t" + int2str(N[1]) + "\n");
198
+ printLOG("N_individuals_unique_to_file2:\t" + int2str(N[2]) + "\n");
199
+ }
200
+
201
+ void vcf_file::output_discordance_by_indv(const string &output_file_prefix, vcf_file &diff_vcf_file)
202
+ {
203
+ printLOG("Outputting Discordance By Individual...\n");
204
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
205
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
206
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
207
+
208
+ map<string, pair< int, int> > combined_individuals;
209
+ map<string, pair< int, int> >::iterator combined_individuals_it;
210
+ return_indv_union(diff_vcf_file, combined_individuals);
211
+
212
+ map<string, pair<int, int> > indv_sums;
213
+
214
+ string vcf_line, CHROM;
215
+ int POS;
216
+ int s1, s2, indv1, indv2;
217
+
218
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
219
+ {
220
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
221
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
222
+
223
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
224
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
225
+
226
+ vcf_entry e1(N_indv);
227
+ vcf_entry e2(diff_vcf_file.N_indv);
228
+
229
+ // Read entries from file (if available)
230
+ if (s1 != -1)
231
+ {
232
+ get_vcf_entry(s1, vcf_line);
233
+ e1.reset(vcf_line);
234
+ }
235
+
236
+ if (s2 != -1)
237
+ {
238
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
239
+ e2.reset(vcf_line);
240
+ }
241
+
242
+ e1.parse_basic_entry(true);
243
+ e2.parse_basic_entry(true);
244
+
245
+ // Set the reference to the non-missing entry (if available)
246
+ string REF = e1.get_REF();
247
+ string REF2 = e2.get_REF();
248
+ if (REF == "N")
249
+ REF = REF2;
250
+ if (REF2 == "N")
251
+ REF2 = REF;
252
+
253
+ if (REF.size() != REF2.size())
254
+ {
255
+ warning("REF sequences at " + CHROM + ":" + int2str(POS) + " are not comparable. Skipping site");
256
+ continue;
257
+ }
258
+
259
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N"))
260
+ warning("Non-matching REF " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2);
261
+
262
+ // Do the alternative alleles match?
263
+ string ALT, ALT2;
264
+ ALT = e1.get_ALT();
265
+ ALT2 = e2.get_ALT();
266
+
267
+ bool alleles_match = (ALT == ALT2) && (REF == REF2);
268
+ e1.parse_full_entry(true);
269
+ e1.parse_genotype_entries(true);
270
+
271
+ e2.parse_full_entry(true);
272
+ e2.parse_genotype_entries(true);
273
+
274
+ pair<string, string> genotype1, genotype2;
275
+ pair<int,int> geno_ids1, geno_ids2;
276
+ pair<string, string> missing_genotype(".",".");
277
+ pair<int, int> missing_id(-1,-1);
278
+
279
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
280
+ {
281
+ indv1 = combined_individuals_it->second.first;
282
+ indv2 = combined_individuals_it->second.second;
283
+
284
+ if ((indv1 == -1) || (indv2 == -1))
285
+ continue; // Individual not found in one of the files
286
+
287
+ if (alleles_match)
288
+ { // Alleles match, so can compare ids instead of strings
289
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
290
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
291
+
292
+ if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
293
+ {
294
+ indv_sums[combined_individuals_it->first].first++;
295
+ if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
296
+ ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
297
+ { // Match
298
+ // Don't do anything
299
+ }
300
+ else
301
+ { // Mismatch
302
+ indv_sums[combined_individuals_it->first].second++;
303
+ }
304
+ }
305
+ else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
306
+ { // Both missing
307
+ // Don't do anything.
308
+ }
309
+ else if (geno_ids1 != missing_id)
310
+ { // Genotype 1 is not missing, genotype 2 is.
311
+ // Don't do anything.
312
+ }
313
+ else if (geno_ids2 != missing_id)
314
+ { // Genotype 2 is not missing, genotype 1 is.
315
+ // Don't do anything.
316
+ }
317
+ else
318
+ error("Unknown condition");
319
+ }
320
+ else
321
+ { // Alleles don't match, so need to be more careful and compare strings
322
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
323
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
324
+
325
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
326
+ { // No missing data
327
+ indv_sums[combined_individuals_it->first].first++;
328
+ if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
329
+ ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
330
+ { // Match
331
+ // Don't do anything
332
+ }
333
+ else
334
+ { // Mismatch
335
+ indv_sums[combined_individuals_it->first].second++;
336
+ }
337
+ }
338
+ else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
339
+ { // Both missing
340
+ // Don't do anything
341
+ }
342
+ else if (genotype1 != missing_genotype)
343
+ { // Genotype 1 is not missing, genotype 2 is.
344
+ // Don't do anything
345
+ }
346
+ else if (genotype2 != missing_genotype)
347
+ { // Genotype 2 is not missing, genotype 1 is.
348
+ // Don't do anything
349
+ }
350
+ else
351
+ error("Unknown condition");
352
+ }
353
+ }
354
+ }
355
+
356
+ string output_file = output_file_prefix + ".diff.indv";
357
+ ofstream out(output_file.c_str());
358
+ if (!out.is_open())
359
+ error("Could not open Sites Differences File: " + output_file, 3);
360
+ out << "INDV\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
361
+
362
+ int N, N_discord;
363
+ double discordance;
364
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
365
+ {
366
+ out << combined_individuals_it->first;
367
+ N = indv_sums[combined_individuals_it->first].first;
368
+ N_discord = indv_sums[combined_individuals_it->first].second;
369
+ discordance = N_discord / double(N);
370
+ out << "\t" << N << "\t" << N_discord << "\t" << discordance << endl;
371
+ }
372
+
373
+ out.close();
374
+ }
375
+
376
+ void vcf_file::output_discordance_by_site(const string &output_file_prefix, vcf_file &diff_vcf_file)
377
+ {
378
+ printLOG("Outputting Discordance By Site...\n");
379
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
380
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
381
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
382
+
383
+ map<string, pair< int, int> > combined_individuals;
384
+ map<string, pair< int, int> >::iterator combined_individuals_it;
385
+ return_indv_union(diff_vcf_file, combined_individuals);
386
+
387
+ string CHROM, vcf_line;
388
+ int POS;
389
+ int s1, s2, indv1, indv2;
390
+
391
+ string output_file = output_file_prefix + ".diff.sites";
392
+ ofstream diffsites(output_file.c_str());
393
+ if (!diffsites.is_open())
394
+ error("Could not open Sites Differences File: " + output_file, 3);
395
+ //diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALT\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE\tN_FILE1_NONREF_GENOTYPES\tNON_REF_DISCORDANCE" << endl;
396
+ diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALLELES\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
397
+
398
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
399
+ {
400
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
401
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
402
+
403
+ diffsites << CHROM << "\t" << POS;
404
+
405
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
406
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
407
+
408
+ vcf_entry e1(N_indv);
409
+ vcf_entry e2(diff_vcf_file.N_indv);
410
+
411
+ bool data_in_both = true;
412
+ // Read entries from file (if available)
413
+ if (s1 != -1)
414
+ {
415
+ get_vcf_entry(s1, vcf_line);
416
+ e1.reset(vcf_line);
417
+ }
418
+ else
419
+ data_in_both = false;
420
+
421
+ if (s2 != -1)
422
+ {
423
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
424
+ e2.reset(vcf_line);
425
+ }
426
+ else
427
+ data_in_both = false;
428
+
429
+ if (data_in_both)
430
+ diffsites << "\tB";
431
+ else if ((s1 != -1) && (s2 == -1))
432
+ diffsites << "\t1";
433
+ else if ((s1 == -1) && (s2 != -1))
434
+ diffsites << "\t2";
435
+ else
436
+ error("Unhandled condition");
437
+
438
+ e1.parse_basic_entry(true);
439
+ e2.parse_basic_entry(true);
440
+
441
+ // Set the reference to the non-missing entry (if available)
442
+ string REF = e1.get_REF();
443
+ string REF2 = e2.get_REF();
444
+ if (REF == "N")
445
+ REF = REF2;
446
+ if (REF2 == "N")
447
+ REF2 = REF;
448
+
449
+ if (REF.size() != REF2.size())
450
+ {
451
+ warning("REF sequences at " + CHROM + ":" + int2str(POS) + " are not comparable. Skipping site");
452
+ continue;
453
+ }
454
+
455
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N"))
456
+ warning("Non-matching REF " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2);
457
+
458
+ // Do the alternative alleles match?
459
+ string ALT, ALT2;
460
+ ALT = e1.get_ALT();
461
+ ALT2 = e2.get_ALT();
462
+
463
+ bool alleles_match = ((ALT == ALT2) && (REF == REF2));
464
+ diffsites << "\t" << alleles_match;
465
+
466
+ e1.parse_full_entry(true);
467
+ e1.parse_genotype_entries(true);
468
+
469
+ e2.parse_full_entry(true);
470
+ e2.parse_genotype_entries(true);
471
+
472
+ pair<string, string> genotype1, genotype2;
473
+ pair<int,int> geno_ids1, geno_ids2;
474
+ pair<string, string> missing_genotype(".",".");
475
+ pair<int, int> missing_id(-1,-1);
476
+
477
+ unsigned int N_common_called=0; // Number of genotypes called in both files
478
+ unsigned int N_missing_1=0, N_missing_2=0;
479
+ unsigned int N_discord=0;
480
+ unsigned int N_concord_non_missing=0;
481
+
482
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
483
+ {
484
+ indv1 = combined_individuals_it->second.first;
485
+ indv2 = combined_individuals_it->second.second;
486
+
487
+ if ((indv1 == -1) || (indv2 == -1))
488
+ continue; // Individual not found in one of the files
489
+
490
+ if (alleles_match)
491
+ { // Alleles match, so can compare ids instead of strings
492
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
493
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
494
+
495
+ if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
496
+ {
497
+ N_common_called++;
498
+ if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
499
+ ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
500
+ { // Match
501
+ N_concord_non_missing++;
502
+ }
503
+ else
504
+ { // Mismatch
505
+ N_discord++;
506
+ }
507
+ }
508
+ else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
509
+ { // Both missing
510
+ N_missing_1++; N_missing_2++;
511
+ }
512
+ else if (geno_ids1 != missing_id)
513
+ { // Genotype 1 is not missing, genotype 2 is.
514
+ N_missing_2++;
515
+ }
516
+ else if (geno_ids2 != missing_id)
517
+ { // Genotype 2 is not missing, genotype 1 is.
518
+ N_missing_1++;
519
+ }
520
+ else
521
+ error("Unknown condition");
522
+ }
523
+ else
524
+ { // Alleles don't match, so need to be more careful and compare strings
525
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
526
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
527
+
528
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
529
+ { // No missing data
530
+ N_common_called++;
531
+ if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
532
+ ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
533
+ { // Match
534
+ N_concord_non_missing++;
535
+ }
536
+ else
537
+ { // Mismatch
538
+ N_discord++;
539
+ }
540
+ }
541
+ else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
542
+ { // Both missing
543
+ N_missing_1++; N_missing_2++;
544
+ }
545
+ else if (genotype1 != missing_genotype)
546
+ { // Genotype 1 is not missing, genotype 2 is.
547
+ N_missing_2++;
548
+ }
549
+ else if (genotype2 != missing_genotype)
550
+ { // Genotype 2 is not missing, genotype 1 is.
551
+ N_missing_1++;
552
+ }
553
+ else
554
+ error("Unknown condition");
555
+ }
556
+ }
557
+ double discordance = N_discord / double(N_common_called);
558
+ diffsites << "\t" << N_common_called << "\t" << N_discord << "\t" << discordance;
559
+ diffsites << endl;
560
+ }
561
+ diffsites.close();
562
+ }
563
+
564
+ void vcf_file::output_discordance_matrix(const string &output_file_prefix, vcf_file &diff_vcf_file)
565
+ {
566
+ printLOG("Outputting Discordance Matrix\n\tFor bi-allelic loci, called in both files, with matching alleles only...\n");
567
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
568
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
569
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
570
+
571
+ map<string, pair< int, int> > combined_individuals;
572
+ map<string, pair< int, int> >::iterator combined_individuals_it;
573
+ return_indv_union(diff_vcf_file, combined_individuals);
574
+
575
+ string vcf_line;
576
+ int s1, s2, indv1, indv2;
577
+
578
+ vector<vector<int> > discordance_matrix(4, vector<int>(4, 0));
579
+
580
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
581
+ {
582
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
583
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
584
+
585
+ vcf_entry e1(N_indv);
586
+ vcf_entry e2(diff_vcf_file.N_indv);
587
+
588
+ // Read entries from file (if available)
589
+ if (s1 != -1)
590
+ {
591
+ get_vcf_entry(s1, vcf_line);
592
+ e1.reset(vcf_line);
593
+ }
594
+
595
+ if (s2 != -1)
596
+ {
597
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
598
+ e2.reset(vcf_line);
599
+ }
600
+
601
+ e1.parse_basic_entry(true);
602
+ e2.parse_basic_entry(true);
603
+
604
+ if ((e1.get_N_alleles() != 2) || (e2.get_N_alleles() != 2))
605
+ continue;
606
+
607
+ // Set the reference to the non-missing entry (if available)
608
+ string REF = e1.get_REF();
609
+ string REF2 = e2.get_REF();
610
+ if (REF == "N")
611
+ REF = REF2;
612
+ if (REF2 == "N")
613
+ REF2 = REF;
614
+
615
+ if (REF.size() != REF2.size())
616
+ continue;
617
+
618
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N"))
619
+ continue;
620
+
621
+ // Do the alternative alleles match?
622
+ string ALT, ALT2;
623
+ ALT = e1.get_ALT();
624
+ ALT2 = e2.get_ALT();
625
+
626
+ bool alleles_match = (ALT == ALT2) && (REF == REF2);
627
+ if (alleles_match == false)
628
+ continue;
629
+
630
+ e1.parse_full_entry(true);
631
+ e1.parse_genotype_entries(true);
632
+
633
+ e2.parse_full_entry(true);
634
+ e2.parse_genotype_entries(true);
635
+
636
+ pair<int,int> geno_ids1, geno_ids2;
637
+ int N1, N2;
638
+
639
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
640
+ {
641
+ indv1 = combined_individuals_it->second.first;
642
+ indv2 = combined_individuals_it->second.second;
643
+
644
+ if ((indv1 == -1) || (indv2 == -1))
645
+ continue; // Individual not found in one of the files
646
+
647
+ // Alleles match, so can compare ids instead of strings
648
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
649
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
650
+
651
+ if (((geno_ids1.first != -1) && (geno_ids1.second == -1)) ||
652
+ ((geno_ids2.first != -1) && (geno_ids2.second == -1)))
653
+ { // Haploid
654
+ one_off_warning("***Warning: Haploid chromosomes not counted!***");
655
+ continue;
656
+ }
657
+
658
+ N1 = geno_ids1.first + geno_ids1.second;
659
+ N2 = geno_ids2.first + geno_ids2.second;
660
+
661
+ if ((N1 == -1) || (N1 < -2) || (N1 > 2))
662
+ error("Unhandled case");
663
+ if ((N2 == -1) || (N2 < -2) || (N2 > 2))
664
+ error("Unhandled case");
665
+
666
+ if (N1 == -2)
667
+ N1 = 3;
668
+
669
+ if (N2 == -2)
670
+ N2 = 3;
671
+
672
+ discordance_matrix[N1][N2]++;
673
+ }
674
+ }
675
+
676
+ string output_file = output_file_prefix + ".diff.discordance_matrix";
677
+ ofstream out(output_file.c_str());
678
+ if (!out.is_open())
679
+ error("Could not open Discordance Matrix File: " + output_file, 3);
680
+
681
+ out << "-\tN_0/0_file1\tN_0/1_file1\tN_1/1_file1\tN_./._file1" << endl;
682
+ out << "N_0/0_file2\t" << discordance_matrix[0][0] << "\t" << discordance_matrix[1][0] << "\t" << discordance_matrix[2][0] << "\t" << discordance_matrix[3][0] << endl;
683
+ out << "N_0/1_file2\t" << discordance_matrix[0][1] << "\t" << discordance_matrix[1][1] << "\t" << discordance_matrix[2][1] << "\t" << discordance_matrix[3][1] << endl;
684
+ out << "N_1/1_file2\t" << discordance_matrix[0][2] << "\t" << discordance_matrix[1][2] << "\t" << discordance_matrix[2][2] << "\t" << discordance_matrix[3][2] << endl;
685
+ out << "N_./._file2\t" << discordance_matrix[0][3] << "\t" << discordance_matrix[1][3] << "\t" << discordance_matrix[2][3] << "\t" << discordance_matrix[3][3] << endl;
686
+ out.close();
687
+ }
688
+
689
+ void vcf_file::output_switch_error(const string &output_file_prefix, vcf_file &diff_vcf_file)
690
+ {
691
+ printLOG("Outputting Phase Switch Errors...\n");
692
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
693
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
694
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
695
+
696
+ map<string, pair< int, int> > combined_individuals;
697
+ map<string, pair< int, int> >::iterator combined_individuals_it;
698
+ return_indv_union(diff_vcf_file, combined_individuals);
699
+
700
+ string CHROM, vcf_line;
701
+ int POS;
702
+ int s1, s2, indv1, indv2;
703
+
704
+ string output_file = output_file_prefix + ".diff.switch";
705
+ ofstream switcherror(output_file.c_str());
706
+ if (!switcherror.is_open())
707
+ error("Could not open Switch Error file: " + output_file, 4);
708
+ switcherror << "CHROM\tPOS\tINDV" << endl;
709
+
710
+ unsigned int N_combined_indv = combined_individuals.size();
711
+ vector<int> N_phased_het_sites(N_combined_indv, 0);
712
+ vector<int> N_switch_errors(N_combined_indv, 0);
713
+
714
+ pair<string, string> missing_genotype(".",".");
715
+ vector<pair<string, string> > prev_geno_file1(N_combined_indv, missing_genotype);
716
+ vector<pair<string, string> > prev_geno_file2(N_combined_indv, missing_genotype);
717
+ pair<string, string> file1_hap1, file1_hap2, file2_hap1;
718
+
719
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
720
+ {
721
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
722
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
723
+
724
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
725
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
726
+
727
+ vcf_entry e1(N_indv);
728
+ vcf_entry e2(diff_vcf_file.N_indv);
729
+
730
+ // Read entries from file (if available)
731
+ if (s1 != -1)
732
+ {
733
+ get_vcf_entry(s1, vcf_line);
734
+ e1.reset(vcf_line);
735
+ }
736
+
737
+ if (s2 != -1)
738
+ {
739
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
740
+ e2.reset(vcf_line);
741
+ }
742
+
743
+ e1.parse_basic_entry(true);
744
+ e2.parse_basic_entry(true);
745
+
746
+ e1.parse_full_entry(true);
747
+ e1.parse_genotype_entries(true);
748
+
749
+ e2.parse_full_entry(true);
750
+ e2.parse_genotype_entries(true);
751
+
752
+ pair<string, string> genotype1, genotype2;
753
+ pair<string, string> missing_genotype(".",".");
754
+
755
+ unsigned int N_common_called=0; // Number of genotypes called in both files
756
+ unsigned int indv_count=0;
757
+
758
+ // Bug fix applied (#3354189) - July 5th 2011
759
+ for (combined_individuals_it=combined_individuals.begin();
760
+ combined_individuals_it!=combined_individuals.end();
761
+ ++combined_individuals_it, indv_count++)
762
+ {
763
+ indv1 = combined_individuals_it->second.first;
764
+ indv2 = combined_individuals_it->second.second;
765
+
766
+ if ((indv1 == -1) || (indv2 == -1))
767
+ continue; // Individual not found in one of the files
768
+
769
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
770
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
771
+
772
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
773
+ { // No missing data
774
+ N_common_called++;
775
+ if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
776
+ ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
777
+ { // Have a matching genotypes in files 1 and 2
778
+ if (genotype1.first != genotype1.second)
779
+ { // It's a heterozgote
780
+ char phase1, phase2;
781
+ phase1 = e1.get_indv_PHASE(indv1);
782
+ phase2 = e2.get_indv_PHASE(indv2);
783
+ if ((phase1 == '|') && (phase2 == '|'))
784
+ { // Calculate Phasing error (switch error)
785
+ N_phased_het_sites[indv_count]++;
786
+ file1_hap1 = make_pair<string,string>(prev_geno_file1[indv_count].first, genotype1.first);
787
+ file1_hap2 = make_pair<string,string>(prev_geno_file1[indv_count].second, genotype1.second);
788
+ file2_hap1 = make_pair<string,string>(prev_geno_file2[indv_count].first, genotype2.first);
789
+
790
+ if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2))
791
+ { // Must be a switch error
792
+ string indv_id;
793
+ N_switch_errors[indv_count]++;
794
+ if (indv1 != -1)
795
+ indv_id = indv[indv1];
796
+ else
797
+ indv_id = diff_vcf_file.indv[indv2];
798
+ switcherror << CHROM << "\t" << POS << "\t" << indv_id << endl;
799
+ }
800
+ prev_geno_file1[indv_count] = genotype1;
801
+ prev_geno_file2[indv_count] = genotype2;
802
+ }
803
+ }
804
+ }
805
+ }
806
+ }
807
+ }
808
+ switcherror.close();
809
+
810
+ output_file = output_file_prefix + ".diff.indv.switch";
811
+ ofstream idiscord(output_file.c_str());
812
+ if (!idiscord.is_open())
813
+ error("Could not open Individual Discordance File: " + output_file, 3);
814
+
815
+ idiscord << "INDV\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl;
816
+ unsigned int indv_count=0;
817
+ double switch_error;
818
+ string indv_id;
819
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
820
+ {
821
+ indv1 = combined_individuals_it->second.first;
822
+ indv2 = combined_individuals_it->second.second;
823
+
824
+ if (indv1 != -1)
825
+ indv_id = indv[indv1];
826
+ else
827
+ indv_id = diff_vcf_file.indv[indv2];
828
+
829
+ if (N_phased_het_sites[indv_count] > 0)
830
+ switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count];
831
+ else
832
+ switch_error = 0;
833
+ idiscord << indv_id << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl;
834
+
835
+ indv_count++;
836
+ }
837
+ idiscord.close();
838
+ }
839
+
840
+ /*
841
+ void vcf_file::output_concensus_statistics(const string &output_file_prefix, vcf_file &diff_vcf_file)
842
+ {
843
+ printLOG("Outputting Consensus Statistics... \n");
844
+ unsigned int ui;
845
+
846
+ string output_file;
847
+ string vcf_line;
848
+
849
+ // Build a list of individuals contained in each file
850
+ map<string, pair< int, int> > combined_individuals;
851
+ map<string, pair< int, int> >::iterator combined_individuals_it;
852
+ for (ui=0; ui<N_indv; ui++)
853
+ if (include_indv[ui] == true)
854
+ combined_individuals[indv[ui]] = make_pair<int,int>(ui, -1);
855
+
856
+ for (ui=0; ui<diff_vcf_file.N_indv; ui++)
857
+ if (diff_vcf_file.include_indv[ui] == true)
858
+ {
859
+ if (combined_individuals.find(diff_vcf_file.indv[ui]) != combined_individuals.end())
860
+ combined_individuals[diff_vcf_file.indv[ui]].second = ui;
861
+ else
862
+ combined_individuals[diff_vcf_file.indv[ui]] = make_pair<int,int>(-1, ui);
863
+ }
864
+
865
+ unsigned int N_combined_indv = combined_individuals.size();
866
+ unsigned int N[3]={0,0,0};
867
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
868
+ {
869
+ if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1))
870
+ N[0]++;
871
+ else if (combined_individuals_it->second.first != -1)
872
+ N[1]++;
873
+ else
874
+ N[2]++;
875
+ }
876
+
877
+ vector<int> indv_N_discord(N_combined_indv, 0);
878
+ vector<int> indv_N_called_sites(N_combined_indv, 0);
879
+
880
+ printLOG("N_combined_individuals:\t" + int2str(N_combined_indv) + "\n");
881
+ printLOG("N_individuals_common_to_both_files:\t" + int2str(N[0]) + "\n");
882
+ printLOG("N_individuals_unique_to_file1:\t" + int2str(N[1]) + "\n");
883
+ printLOG("N_individuals_unique_to_file2:\t" + int2str(N[2]) + "\n");
884
+
885
+ // Build a table of included entries in both files
886
+ map<pair<string, int>, pair<int, int> > CHROMPOS_to_filepos_pair;
887
+ map<pair<string, int>, pair<int, int> >::iterator CHROMPOS_to_filepos_pair_it;
888
+ string CHROM;
889
+ int POS;
890
+ return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair);
891
+
892
+ output_file = output_file_prefix + ".diff.sites_in_files";
893
+ ofstream sites_in_files(output_file.c_str());
894
+ sites_in_files << "CHROM\tPOS\tIN_FILE" << endl;
895
+
896
+ int s1, s2;
897
+ int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
898
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it!=CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
899
+ {
900
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
901
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
902
+
903
+ sites_in_files << CHROMPOS_to_filepos_pair_it->first.first << "\t" << CHROMPOS_to_filepos_pair_it->first.second << "\t";
904
+ if ((s1 != -1) && (s2 != -1))
905
+ {
906
+ N_common_SNPs++;
907
+ sites_in_files << "B" << endl;
908
+ }
909
+ else if ((s1 != -1) && (s2 == -1))
910
+ {
911
+ N_SNPs_file1_only++;
912
+ sites_in_files << "1" << endl;
913
+ }
914
+ else if ((s1 == -1) && (s2 != -1))
915
+ {
916
+ N_SNPs_file2_only++;
917
+ sites_in_files << "2" << endl;
918
+ }
919
+ else
920
+ error("SNP in neither file!?");
921
+ }
922
+
923
+ sites_in_files.close();
924
+
925
+ printLOG("Found " + int2str(N_common_SNPs) + " SNPs common to both files.\n");
926
+ printLOG("Found " + int2str(N_SNPs_file1_only) + " SNPs only in main file.\n");
927
+ printLOG("Found " + int2str(N_SNPs_file2_only) + " SNPs only in second file.\n");
928
+
929
+ output_file = output_file_prefix + ".diff.sites";
930
+ ofstream diffsites(output_file.c_str());
931
+ if (!diffsites.is_open())
932
+ error("Could not open Sites Differences File: " + output_file, 3);
933
+ diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALT\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE\tN_FILE1_NONREF_GENOTYPES\tNON_REF_DISCORDANCE" << endl;
934
+
935
+ output_file = output_file_prefix + ".diff.switch";
936
+ ofstream switcherror(output_file.c_str());
937
+ if (!switcherror.is_open())
938
+ error("Could not open Switch Error file: " + output_file, 4);
939
+ switcherror << "CHROM\tPOS\tINDV" << endl;
940
+
941
+ // Now try and merge the entries.
942
+ unsigned int N_common_genotypes = 0;
943
+ unsigned int N_common_discordant_genotypes = 0;
944
+ unsigned int N_sites_with_mismatching_ALT = 0;
945
+ unsigned int N_non_ref_genotypes = 0;
946
+ unsigned int N_discordant_non_ref_genotypes = 0;
947
+
948
+ pair<string, string> genotype1, genotype2;
949
+ pair<int,int> geno_ids1, geno_ids2;
950
+ pair<string, string> missing_genotype(".",".");
951
+ pair<int, int> missing_HQUAL(0,0);
952
+
953
+ pair<int, int> homo_ref(0, 0);
954
+ vector<pair<string, string> > prev_geno_file1(N_combined_indv, missing_genotype);
955
+ vector<pair<string, string> > prev_geno_file2(N_combined_indv, missing_genotype);
956
+ pair<string, string> file1_hap1, file1_hap2, file2_hap1;
957
+
958
+ vector<int> N_phased_het_sites(N_combined_indv, 0);
959
+ vector<int> N_switch_errors(N_combined_indv, 0);
960
+ vector<pair<int,int> > indv_depth_at_common_sites(N_combined_indv, make_pair(0,0));
961
+ vector<pair<int,int> > indv_count_at_common_sites(N_combined_indv, make_pair(0,0));
962
+
963
+ vector<vector<int> > genotype_concord_matrix(4, vector<int>(4, 0));
964
+
965
+ for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it)
966
+ {
967
+ CHROM = CHROMPOS_to_filepos_pair_it->first.first;
968
+ POS = CHROMPOS_to_filepos_pair_it->first.second;
969
+
970
+ s1 = CHROMPOS_to_filepos_pair_it->second.first;
971
+ s2 = CHROMPOS_to_filepos_pair_it->second.second;
972
+
973
+ vcf_entry e1(N_indv);
974
+ vcf_entry e2(diff_vcf_file.N_indv);
975
+
976
+ bool data_in_both = true;
977
+ // Read entries from file (if available)
978
+ if (s1 != -1)
979
+ {
980
+ get_vcf_entry(s1, vcf_line);
981
+ e1.reset(vcf_line);
982
+ }
983
+ else
984
+ data_in_both = false;
985
+
986
+ if (s2 != -1)
987
+ {
988
+ diff_vcf_file.get_vcf_entry(s2, vcf_line);
989
+ e2.reset(vcf_line);
990
+ }
991
+ else
992
+ data_in_both = false;
993
+
994
+ e1.parse_basic_entry(true, true, true);
995
+ e2.parse_basic_entry(true, true, true);
996
+
997
+ // Set the reference to the non-missing entry (if available)
998
+ string REF = e1.get_REF();
999
+ string REF2 = e2.get_REF();
1000
+ if (REF == "N")
1001
+ REF = REF2;
1002
+ if (REF2 == "N")
1003
+ REF2 = REF;
1004
+
1005
+ if (REF.size() != REF2.size())
1006
+ {
1007
+ warning("REF sequences at " + CHROM + ":" + int2str(POS) + " are not comparable. Skipping site");
1008
+ continue;
1009
+ }
1010
+
1011
+ if ((REF != REF2) && (REF2 != "N") && (REF != "N"))
1012
+ warning("Non-matching REF " + CHROM + ":" + int2str(POS) + " " + REF + "/" + REF2);
1013
+
1014
+ // Do the alternative alleles match?
1015
+ set<string> ALT1, ALT2;
1016
+ for (ui=0; ui<(e1.get_N_alleles()-1); ui++)
1017
+ ALT1.insert(e1.get_ALT_allele(ui));
1018
+
1019
+ for (ui=0; ui<(e2.get_N_alleles()-1); ui++)
1020
+ ALT2.insert(e2.get_ALT_allele(ui));
1021
+
1022
+ bool matching_ALT=true;
1023
+ if ((data_in_both) && (ALT1 != ALT2) && (ALT1.size() > 0) && (ALT2.size() > 0))
1024
+ {
1025
+ N_sites_with_mismatching_ALT++;
1026
+ matching_ALT = false;
1027
+ }
1028
+
1029
+ if (data_in_both)
1030
+ {
1031
+ diffsites << CHROM << "\t" << POS << "\t";
1032
+ diffsites << "B\t" << matching_ALT << "\t";
1033
+ }
1034
+ else
1035
+ {
1036
+ continue;
1037
+ }
1038
+
1039
+ if (s1 != -1)
1040
+ {
1041
+ e1.parse_full_entry(true);
1042
+ e1.parse_genotype_entries(true, true, true, true, true);
1043
+ }
1044
+
1045
+ if (s2 != -1)
1046
+ {
1047
+ e2.parse_full_entry(true);
1048
+ e2.parse_genotype_entries(true, true, true, true, true);
1049
+ }
1050
+
1051
+ // Now merge the genotypes.
1052
+ unsigned int indv_count=0;
1053
+ int indv1, indv2;
1054
+ unsigned int N_discordant_site_counter=0;
1055
+ unsigned int N_indvs_with_data=0;
1056
+ unsigned int site_N_non_ref_genotypes=0;
1057
+ unsigned int site_N_discordant_non_ref_genotypes = 0;
1058
+ int depth;
1059
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1060
+ {
1061
+ indv1 = combined_individuals_it->second.first;
1062
+ indv2 = combined_individuals_it->second.second;
1063
+
1064
+ if ((indv1 == -1) && (indv2 == -1))
1065
+ { // Genotype is completely missing... should never happen
1066
+ error("Missing genotype!?", 83);
1067
+ }
1068
+ else if ((indv1 == -1) && (indv2 != -1))
1069
+ { // Data is missing from first file, so just use second file.
1070
+
1071
+ }
1072
+ else if ((indv1 != -1) && (indv2 == -1))
1073
+ { // Data is missing from second file, so just use first file.
1074
+
1075
+ }
1076
+ else
1077
+ { // Data from both files, so figure out what to do
1078
+ bool non_ref_genotype = false;
1079
+ if (data_in_both)
1080
+ {
1081
+ e1.get_indv_GENOTYPE_strings(indv1, genotype1);
1082
+ e2.get_indv_GENOTYPE_strings(indv2, genotype2);
1083
+ e1.get_indv_GENOTYPE_ids(indv1, geno_ids1);
1084
+ e2.get_indv_GENOTYPE_ids(indv2, geno_ids2);
1085
+
1086
+ N_common_genotypes++;
1087
+ N_indvs_with_data++;
1088
+
1089
+ if (geno_ids1 != homo_ref)
1090
+ { // First file is not a hom ref
1091
+ N_non_ref_genotypes++;
1092
+ site_N_non_ref_genotypes++;
1093
+ non_ref_genotype = true;
1094
+ }
1095
+
1096
+ depth = e1.get_indv_DEPTH(indv1);
1097
+ if (depth >= 0)
1098
+ {
1099
+ indv_depth_at_common_sites[indv_count].first += depth;
1100
+ indv_count_at_common_sites[indv_count].first++;
1101
+ }
1102
+ depth = e2.get_indv_DEPTH(indv2);
1103
+ if (depth >= 0)
1104
+ {
1105
+ indv_depth_at_common_sites[indv_count].second += depth;
1106
+ indv_count_at_common_sites[indv_count].second++;
1107
+ }
1108
+ }
1109
+
1110
+ if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
1111
+ {
1112
+ genotype_concord_matrix[3][3]++;
1113
+ }
1114
+
1115
+ if ((genotype1 == missing_genotype) && (genotype2 != missing_genotype))
1116
+ { // Missing data, Favour second file
1117
+ if (matching_ALT && (ALT2.size() <= 1))
1118
+ {
1119
+ unsigned int idx2 = geno_ids2.first + geno_ids2.second;
1120
+ genotype_concord_matrix[3][idx2]++;
1121
+ }
1122
+ }
1123
+
1124
+ if ((genotype2 == missing_genotype) && (genotype1 != missing_genotype))
1125
+ { // Favour first file
1126
+ if (matching_ALT && (ALT1.size() <= 1))
1127
+ {
1128
+ unsigned int idx1 = geno_ids1.first + geno_ids1.second;
1129
+ genotype_concord_matrix[idx1][3]++;
1130
+ }
1131
+ }
1132
+
1133
+ if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
1134
+ {
1135
+ if (data_in_both)
1136
+ {
1137
+ if (matching_ALT && (ALT1.size() <= 1) && (ALT2.size() <= 1))
1138
+ {
1139
+ unsigned int idx1 = geno_ids1.first + geno_ids1.second;
1140
+ unsigned int idx2 = geno_ids2.first + geno_ids2.second;
1141
+ genotype_concord_matrix[idx1][idx2]++;
1142
+ }
1143
+
1144
+ indv_N_called_sites[indv_count]++;
1145
+ if (!vcf_entry::genotypes_equal(genotype1, genotype2))
1146
+ {
1147
+ N_common_discordant_genotypes++;
1148
+ N_discordant_site_counter++;
1149
+ indv_N_discord[indv_count]++;
1150
+
1151
+ if (non_ref_genotype)
1152
+ {
1153
+ N_discordant_non_ref_genotypes++;
1154
+ site_N_discordant_non_ref_genotypes++;
1155
+ }
1156
+ }
1157
+ else
1158
+ { // Have a matching genotype in files 1 and 2
1159
+ if (geno_ids1.first != geno_ids1.second)
1160
+ { // It's a heterozgote
1161
+ char phase1, phase2;
1162
+ phase1 = e1.get_indv_PHASE(indv1);
1163
+ phase2 = e2.get_indv_PHASE(indv2);
1164
+ if ((phase1 == '|') && (phase2 == '|'))
1165
+ { // Calculate Phasing error (switch error)
1166
+ N_phased_het_sites[indv_count]++;
1167
+ file1_hap1 = make_pair<string,string>(prev_geno_file1[indv_count].first, genotype1.first);
1168
+ file1_hap2 = make_pair<string,string>(prev_geno_file1[indv_count].second, genotype1.second);
1169
+ file2_hap1 = make_pair<string,string>(prev_geno_file2[indv_count].first, genotype2.first);
1170
+
1171
+ if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2))
1172
+ { // Must be a switch error
1173
+ string indv_id;
1174
+ N_switch_errors[indv_count]++;
1175
+ if (indv1 != -1)
1176
+ indv_id = indv[indv1];
1177
+ else
1178
+ indv_id = diff_vcf_file.indv[indv2];
1179
+ switcherror << CHROM << "\t" << POS << "\t" << indv_id << endl;
1180
+ }
1181
+ prev_geno_file1[indv_count] = genotype1;
1182
+ prev_geno_file2[indv_count] = genotype2;
1183
+ }
1184
+ }
1185
+ }
1186
+ }
1187
+ }
1188
+ }
1189
+
1190
+ indv_count++;
1191
+ }
1192
+ double discordance = 0.0;
1193
+ if (N_indvs_with_data > 0)
1194
+ discordance = double(N_discordant_site_counter) / N_indvs_with_data;
1195
+ double non_ref_discordance = 0.0;
1196
+ if (site_N_non_ref_genotypes > 0)
1197
+ non_ref_discordance = double(site_N_discordant_non_ref_genotypes) / site_N_non_ref_genotypes;
1198
+ diffsites << N_indvs_with_data << "\t" << N_discordant_site_counter << "\t" << discordance;
1199
+ diffsites << "\t" << site_N_non_ref_genotypes << "\t" << non_ref_discordance;
1200
+ diffsites << endl;
1201
+ }
1202
+
1203
+ output_file = output_file_prefix + ".diff.4x4";
1204
+ ofstream four_by_four(output_file.c_str());
1205
+ if (!four_by_four.is_open())
1206
+ error("Could not open 3x3 File: " + output_file, 3);
1207
+
1208
+ four_by_four << "-\tN00_file1\tN01_file1\tN11_file1\tN.._file1" << endl;
1209
+
1210
+ four_by_four << "N00_file2\t" << genotype_concord_matrix[0][0] << "\t" << genotype_concord_matrix[1][0] << "\t" << genotype_concord_matrix[2][0] << "\t" << genotype_concord_matrix[3][0] << endl;
1211
+ four_by_four << "N01_file2\t" << genotype_concord_matrix[0][1] << "\t" << genotype_concord_matrix[1][1] << "\t" << genotype_concord_matrix[2][1] << "\t" << genotype_concord_matrix[3][1] << endl;
1212
+ four_by_four << "N11_file2\t" << genotype_concord_matrix[0][2] << "\t" << genotype_concord_matrix[1][2] << "\t" << genotype_concord_matrix[2][2] << "\t" << genotype_concord_matrix[3][2] << endl;
1213
+ four_by_four << "N.._file2\t" << genotype_concord_matrix[0][3] << "\t" << genotype_concord_matrix[1][3] << "\t" << genotype_concord_matrix[2][3] << "\t" << genotype_concord_matrix[3][3] << endl;
1214
+ four_by_four.close();
1215
+
1216
+
1217
+ output_file = output_file_prefix + ".diff.indv.discord";
1218
+ ofstream idiscord(output_file.c_str());
1219
+ if (!idiscord.is_open())
1220
+ error("Could not open Individual Discordance File: " + output_file, 3);
1221
+
1222
+ idiscord << "INDV\tMEAN_DP_1\tMEAN_DP_2\tN_COMMON_CALLED\tN_DISCORD\tDISCORD\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl;
1223
+ unsigned int indv_count=0;
1224
+ double discordance, switch_error;
1225
+ int indv1, indv2;
1226
+ string indv_id;
1227
+ for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1228
+ {
1229
+ indv1 = combined_individuals_it->second.first;
1230
+ indv2 = combined_individuals_it->second.second;
1231
+
1232
+ if (indv1 != -1)
1233
+ indv_id = indv[indv1];
1234
+ else
1235
+ indv_id = diff_vcf_file.indv[indv2];
1236
+
1237
+ if (indv_N_called_sites[indv_count] > 0)
1238
+ discordance = double(indv_N_discord[indv_count]) / indv_N_called_sites[indv_count];
1239
+ else
1240
+ discordance = 0.0;
1241
+ idiscord << indv_id;
1242
+
1243
+ double mean_depth1 = 0, mean_depth2=0;
1244
+ if (indv_count_at_common_sites[indv_count].first > 0)
1245
+ {
1246
+ mean_depth1 = double(indv_depth_at_common_sites[indv_count].first) / indv_count_at_common_sites[indv_count].first;
1247
+ }
1248
+
1249
+ if (indv_count_at_common_sites[indv_count].second > 0)
1250
+ {
1251
+ mean_depth2 = double(indv_depth_at_common_sites[indv_count].second) / indv_count_at_common_sites[indv_count].second;
1252
+ }
1253
+ idiscord << "\t" << mean_depth1 << "\t" << mean_depth2;
1254
+
1255
+ idiscord << "\t" << indv_N_called_sites[indv_count] << "\t" << indv_N_discord[indv_count] << "\t" << discordance;
1256
+ if (N_phased_het_sites[indv_count] > 0)
1257
+ switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count];
1258
+ else
1259
+ switch_error = 0;
1260
+ idiscord << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl;
1261
+
1262
+ indv_count++;
1263
+ }
1264
+ idiscord.close();
1265
+
1266
+ printLOG("Found " + int2str(N_sites_with_mismatching_ALT) + " sites with mismatching ALT alleles.\n");
1267
+
1268
+ printLOG("Found " + int2str(N_non_ref_genotypes) + " non-reference genotypes called in both files.\n");
1269
+ printLOG("Found " + int2str(N_discordant_non_ref_genotypes) + " discordant non-reference genotypes.\n");
1270
+ double concordance = 1.0 - (double(N_discordant_non_ref_genotypes)) / N_non_ref_genotypes;
1271
+ printLOG("Concordance rate: " + dbl2str_fixed(concordance * 100,2) + "%\n");
1272
+
1273
+ printLOG("Found " + int2str(N_common_genotypes) + " genotypes called in both files.\n");
1274
+ printLOG("Found " + int2str(N_common_discordant_genotypes) + " discordant genotypes.\n");
1275
+ concordance = 1.0 - (double(N_common_discordant_genotypes)) / N_common_genotypes;
1276
+ printLOG("Overall Concordance rate: " + dbl2str_fixed(concordance * 100,2) + "%\n");
1277
+
1278
+ diffsites.close();
1279
+ switcherror.close();
1280
+ printLOG("Done\n");
1281
+ }
1282
+ */