ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1193 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # Author: petr.danecek@sanger
4
+ #
5
+
6
+ use strict;
7
+ use warnings;
8
+ use Carp;
9
+ use Vcf;
10
+ use FaSlice;
11
+
12
+ my $opts = parse_params();
13
+ if ( exists($$opts{plot}) )
14
+ {
15
+ plot_stats($opts);
16
+ }
17
+ else
18
+ {
19
+ compare_vcfs($opts);
20
+ }
21
+
22
+ exit;
23
+
24
+ #--------------------------------
25
+
26
+ sub error
27
+ {
28
+ my (@msg) = @_;
29
+ if ( scalar @msg )
30
+ {
31
+ croak @msg;
32
+ }
33
+ die
34
+ "About: Compare bgzipped and tabix indexed VCF files. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n",
35
+ "Usage: vcf-compare [OPTIONS] file1.vcf file2.vcf ...\n",
36
+ " vcf-compare -p plots chr1.cmp chr2.cmp ...\n",
37
+ "Options:\n",
38
+ " -c, --chromosomes <list|file> Same as -r, left for backward compatibility. Please do not use as it will be dropped in the future.\n",
39
+ " -d, --debug Debugging information. Giving the option multiple times increases verbosity\n",
40
+ " -g, --cmp-genotypes Compare genotypes, not only positions\n",
41
+ " -m, --name-mapping <list|file> Use with -g when comparing files with differing column names. The argument to this options is a\n",
42
+ " comma-separated list or one mapping per line in a file. The names are colon separated and must\n",
43
+ " appear in the same order as the files on the command line.\n",
44
+ " -p, --plot <prefix> Create plots. Multiple files (e.g. per-chromosome outputs from vcf-compare) can be given.\n",
45
+ " -R, --refseq <file> Compare the actual sequence, not just positions. Use with -w to compare indels.\n",
46
+ " -r, --regions <list|file> Process the given regions (comma-separated list or one region per line in a file).\n",
47
+ " -s, --samples <list|file> Process only the listed samples. Excluding unwanted samples may increase performance considerably.\n",
48
+ " -t, --title <string> Title for graphs (see also -p)\n",
49
+ " -w, --win <int> In repetitive sequences, the same indel can be called at different positions. Consider\n",
50
+ " records this far apart as matching (be it a SNP or an indel).\n",
51
+ " -h, -?, --help This help message.\n",
52
+ "\n";
53
+ }
54
+
55
+
56
+ sub parse_params
57
+ {
58
+ my $opts = { positions=>0 };
59
+ while (my $arg=shift(@ARGV))
60
+ {
61
+ if ( $arg eq '--all-samples-af' ) { $$opts{all_samples_af}=1; next; }
62
+ if ( $arg eq '--INFO/AF1-af' ) { $$opts{INFO_AF1_af}=1; next; }
63
+ if ( $arg eq '--ignore-indels' ) { $$opts{ignore_indels}=1; next; }
64
+ if ( $arg eq '--high-conf-gls' ) { $$opts{high_confidence_gls}=shift(@ARGV); next; }
65
+ if ( $arg eq '-m' || $arg eq '--name-mapping' ) { $$opts{mappings_list}=shift(@ARGV); next; }
66
+ if ( $arg eq '-R' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; }
67
+ if ( $arg eq '-c' || $arg eq '--chromosomes' ) { $$opts{regions_list}=shift(@ARGV); next; }
68
+ if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; }
69
+ if ( $arg eq '-g' || $arg eq '--cmp-genotypes' ) { $$opts{cmp_genotypes}=1; next; }
70
+ if ( $arg eq '-s' || $arg eq '--samples' )
71
+ {
72
+ my $samples = shift(@ARGV);
73
+ my @samples = ( -e $samples ) ? read_list($samples) : split(/,/,$samples);
74
+ $$opts{samples} = \@samples;
75
+ next;
76
+ }
77
+ if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}++; next; }
78
+ if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; }
79
+ if ( $arg eq '-p' || $arg eq '--plot' ) { $$opts{plot}=shift(@ARGV); next; }
80
+ if ( $arg eq '-t' || $arg eq '--title' ) { $$opts{title}=shift(@ARGV); next; }
81
+ if ( -e $arg ) { push @{$$opts{files}}, $arg; next }
82
+ if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
83
+ error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
84
+ }
85
+ if ( !exists($$opts{files}) ) { error("What files should be compared?\n") }
86
+ return $opts;
87
+ }
88
+
89
+ sub read_list
90
+ {
91
+ my ($fname) = @_;
92
+ my @regions;
93
+ if ( -e $fname )
94
+ {
95
+ open(my $rgs,'<',$fname) or error("$fname: $!");
96
+ while (my $line=<$rgs>)
97
+ {
98
+ chomp($line);
99
+ push @regions, $line;
100
+ }
101
+ close($rgs);
102
+ }
103
+ else
104
+ {
105
+ @regions = split(/,/,$fname);
106
+ }
107
+ return (@regions);
108
+ }
109
+
110
+ sub read_mappings_list
111
+ {
112
+ my ($fname,$files) = @_;
113
+ my @maps = read_list($fname);
114
+ my %mapping;
115
+ for my $map (@maps)
116
+ {
117
+ my @items = split(/:/,$map);
118
+ if ( scalar @items != scalar @$files ) { error(sprintf "Expected %d column names, found [$map].\n", scalar @$files); }
119
+ for (my $i=1; $i<@$files; $i++)
120
+ {
121
+ $mapping{$$files[$i]}{$items[$i]} = $items[0];
122
+ warn("Using column name '$items[0]' for $$files[$i]:$items[$i]\n");
123
+ }
124
+ }
125
+ return \%mapping;
126
+ }
127
+
128
+ sub compare_vcfs
129
+ {
130
+ my ($opts) = @_;
131
+
132
+ $$opts{match} = {};
133
+ $$opts{hapls} = {};
134
+
135
+ # Open the VCF files and initialize the list of chromosomes
136
+ my @vcfs;
137
+ my (@regions,%has_chrom,$mappings);
138
+ if ( exists($$opts{regions_list}) ) { @regions = read_list($$opts{regions_list}); }
139
+ if ( exists($$opts{mappings_list}) ) { $mappings = read_mappings_list($$opts{mappings_list},$$opts{files}); }
140
+
141
+ print "# This file was generated by vcf-compare.\n#\n";
142
+ if ( $$opts{debug} )
143
+ {
144
+ print
145
+ "#SD Site discordance. Use `grep ^SD | cut -f 2-` to extract this part.\n",
146
+ "#SD The columns are: \n",
147
+ "#SD 1 .. chromosome\n",
148
+ "#SD 2 .. position\n",
149
+ "#SD 3 .. indicates matching (+) or mismatching (-) site\n",
150
+ "#SD 4 .. number of Hom_RR mismatches\n",
151
+ "#SD 5 .. number of Het_RA mismatches\n",
152
+ "#SD 6 .. number of Hom_AA mismatches\n";
153
+ }
154
+
155
+ my $ifile = 0;
156
+ for my $file (@{$$opts{files}})
157
+ {
158
+ my $vcf = Vcf->new(file=>$file);
159
+ $$vcf{vcf_compare_ID} = $ifile++;
160
+ $vcf->parse_header();
161
+ $vcf->close();
162
+ $$vcf{nread} = 0;
163
+ push @vcfs, $vcf;
164
+
165
+ # Update the list of known chromosomes
166
+ if ( !exists($$opts{regions_list}) )
167
+ {
168
+ my $chrms = $vcf->get_chromosomes();
169
+ for my $chr (@$chrms)
170
+ {
171
+ if ( exists($has_chrom{$chr}) ) { next; }
172
+ $has_chrom{$chr} = 1;
173
+ push @regions, $chr;
174
+ }
175
+ }
176
+
177
+ # Check if column names need to be renamed
178
+ if ( defined $mappings && exists($$mappings{$$vcf{file}}) )
179
+ {
180
+ $$vcf{_col_mapping} = $$mappings{$$vcf{file}};
181
+ for my $name (keys %{$$vcf{_col_mapping}})
182
+ {
183
+ if ( !exists($$vcf{has_column}{$name}) ) { error("No such column [$name] in the file $$vcf{file}\n"); }
184
+ my $new_name = $$vcf{_col_mapping}{$name};
185
+ $$vcf{_col_mapping_rev}{$new_name} = $name;
186
+ }
187
+ }
188
+ }
189
+
190
+ # Include only matching samples in haplotype comparison
191
+ if ( $$opts{cmp_genotypes} )
192
+ {
193
+ my %all_samples;
194
+ for my $vcf (@vcfs)
195
+ {
196
+ if ( exists $$opts{samples} )
197
+ {
198
+ for my $sample (@{$$opts{samples}})
199
+ {
200
+ if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
201
+ if ( exists($$vcf{has_column}{$sample}) ) { $all_samples{$sample}++ }
202
+ }
203
+ }
204
+ else
205
+ {
206
+ my @samples = $vcf->get_samples();
207
+ for my $sample (@samples)
208
+ {
209
+ if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
210
+ $all_samples{$sample}++
211
+ }
212
+ }
213
+ }
214
+ my @include_samples;
215
+ while (my ($sample,$count)=each %all_samples)
216
+ {
217
+ if ( $count != scalar @vcfs ) { next; }
218
+ push @include_samples, $sample;
219
+ }
220
+ if ( !@include_samples )
221
+ {
222
+ error("Error: There is no overlap between any of the samples, yet haplotype comparison was requested.\n");
223
+ }
224
+ $$opts{gt_samples_compared} = scalar @include_samples;
225
+ for my $vcf (@vcfs)
226
+ {
227
+ my @include;
228
+ if ( !exists($$vcf{_col_mapping}) ) { @include=@include_samples; }
229
+ else
230
+ {
231
+ for my $sample (@include_samples)
232
+ {
233
+ push @include, exists($$vcf{_col_mapping_rev}{$sample}) ? $$vcf{_col_mapping_rev}{$sample} : $sample
234
+ }
235
+ }
236
+ $vcf->set_samples(include=>\@include);
237
+ }
238
+ }
239
+
240
+ # Go through all the files simultaneously and get the stats.
241
+ for my $region (@regions)
242
+ {
243
+ # Open files
244
+ for my $vcf (@vcfs)
245
+ {
246
+ delete($$vcf{last_line});
247
+ $vcf->open(region=>$region,parse_header=>1);
248
+ delete($$vcf{eof});
249
+ }
250
+ do_region_stats($opts,\@vcfs);
251
+ }
252
+
253
+ report_stats($opts,\@vcfs);
254
+
255
+ for my $vcf (@vcfs)
256
+ {
257
+ if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); }
258
+ }
259
+ }
260
+
261
+ sub report_stats
262
+ {
263
+ my ($opts,$vcfs) = @_;
264
+
265
+ # if ( $$opts{debug} )
266
+ # {
267
+ # use Data::Dumper; print Dumper($opts);
268
+ # }
269
+
270
+ my (@counts,%totals);
271
+ while (my ($key,$num) = each %{$$opts{match}})
272
+ {
273
+ my @files = split(/\s+/,$key);
274
+ for my $file (@files)
275
+ {
276
+ $totals{$file} += $num;
277
+ }
278
+ push @counts, {count=>$num, files=>[@files]};
279
+ }
280
+
281
+ print
282
+ "#VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part.\n",
283
+ "#VN The columns are: \n",
284
+ "#VN 1 .. number of sites unique to this particular combination of files\n",
285
+ "#VN 2- .. combination of files and space-separated number, a fraction of sites in the file\n";
286
+ for my $rec (sort {$$a{count}<=>$$b{count}} @counts)
287
+ {
288
+ my $num = $$rec{count};
289
+ my $files = $$rec{files};
290
+
291
+ print "VN\t$num";
292
+ for my $file (@$files)
293
+ {
294
+ printf "\t$file (%.1f%%)", $num*100./$totals{$file};
295
+ }
296
+ print "\n";
297
+ }
298
+
299
+ if ( $$opts{refseq} && $$opts{indels} )
300
+ {
301
+ print
302
+ "#IN Indel Numbers. Use `grep ^IN | cut -f 2-` to extract this part.\n",
303
+ "#IN .. todo\n",
304
+ "#IN Number of matching indel haplotypes shared across:\n";
305
+ while (my ($file,$stat) = each %{$$opts{indels}})
306
+ {
307
+ print "IN\t$file\n";
308
+ my $match = $$stat{match} ? $$stat{match} : 0;
309
+ my $mismatch = $$stat{mismatch} ? $$stat{mismatch} : 0;
310
+ printf "\t\tNumber of matches: %d\n", $match;
311
+ printf "\t\t mismatches: %d\n", $mismatch;
312
+ printf "\t\t error rate: %.1f%%\n", 100*$mismatch/($match+$mismatch);
313
+ }
314
+ }
315
+
316
+ print "#SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n";
317
+ printf "SN\tNumber of REF matches:\t%d\n", exists($$opts{ref_match}) ? $$opts{ref_match} : 0;
318
+ printf "SN\tNumber of ALT matches:\t%d\n", exists($$opts{alt_match}) ? $$opts{alt_match} : 0;
319
+ printf "SN\tNumber of REF mismatches:\t%d\n", exists($$opts{ref_mismatch}) ? $$opts{ref_mismatch} : 0;
320
+ printf "SN\tNumber of ALT mismatches:\t%d\n", exists($$opts{alt_mismatch}) ? $$opts{alt_mismatch} : 0;
321
+ printf "SN\tNumber of samples in GT comparison:\t%d\n", $$opts{gt_samples_compared} ? $$opts{gt_samples_compared} : 0;
322
+
323
+ my $out;
324
+ for my $vcf (@$vcfs)
325
+ {
326
+ if ( !exists($totals{$$vcf{file}}) ) { $totals{$$vcf{file}}=0; }
327
+ if ( $totals{$$vcf{file}} == $$vcf{nread} ) { next; }
328
+
329
+ my $diff = $$vcf{nread}-$totals{$$vcf{file}};
330
+ my $reported = $totals{$$vcf{file}};
331
+ my $total = $$vcf{nread};
332
+ $out .= sprintf "SN\t%d (%.1f%%) .. read %d, reported %d\t%s\n", $diff,$diff*100./$total,$total,$reported,$$vcf{file};
333
+ }
334
+ if ( $out )
335
+ {
336
+ print "SN\tNumber of sites lost due to grouping (e.g. duplicate sites)\n";
337
+ print $out;
338
+ }
339
+
340
+
341
+ if ( !$$opts{cmp_genotypes} ) { return; }
342
+
343
+ my %summary;
344
+ for my $id (keys %{$$opts{hapls}})
345
+ {
346
+ for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
347
+ {
348
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_mismatch'}) ) { $$opts{hapls}{$id}{$key.'gtype_mismatch'}=0; }
349
+ $$opts{hapls}{$id}{total_gtype_mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
350
+
351
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_match'}) ) { $$opts{hapls}{$id}{$key.'gtype_match'}=0; }
352
+ $$opts{hapls}{$id}{total_gtype_match} += $$opts{hapls}{$id}{$key.'gtype_match'};
353
+
354
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_lost'}) ) { $$opts{hapls}{$id}{$key.'gtype_lost'}=0; }
355
+ $$opts{hapls}{$id}{total_gtype_lost} += $$opts{hapls}{$id}{$key.'gtype_lost'};
356
+
357
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_gained'}) ) { $$opts{hapls}{$id}{$key.'gtype_gained'}=0; }
358
+ $$opts{hapls}{$id}{total_gtype_gained} += $$opts{hapls}{$id}{$key.'gtype_gained'};
359
+
360
+ $summary{$key}{match} += $$opts{hapls}{$id}{$key.'gtype_match'};
361
+ $summary{$key}{mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
362
+ }
363
+ for my $key qw(het_RA_ het_AA_)
364
+ {
365
+ if ( !exists($$opts{hapls}{$id}{$key.'phase_match'}) ) { $$opts{hapls}{$id}{$key.'phase_match'}=0; }
366
+ $$opts{hapls}{$id}{total_phase_match} += $$opts{hapls}{$id}{$key.'phase_match'};
367
+
368
+ if ( !exists($$opts{hapls}{$id}{$key.'phase_mismatch'}) ) { $$opts{hapls}{$id}{$key.'phase_mismatch'}=0; }
369
+ $$opts{hapls}{$id}{total_phase_mismatch} += $$opts{hapls}{$id}{$key.'phase_mismatch'};
370
+
371
+ if ( !exists($$opts{hapls}{$id}{$key.'phase_lost'}) ) { $$opts{hapls}{$id}{$key.'phase_lost'}=0; }
372
+ $$opts{hapls}{$id}{total_phase_lost} += $$opts{hapls}{$id}{$key.'phase_lost'};
373
+ }
374
+ }
375
+ print
376
+ "#GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part.\n",
377
+ "#GS The columns are:\n",
378
+ "#GS 1 .. variant type\n",
379
+ "#GS 2 .. number of mismatches\n",
380
+ "#GS 3 .. number of matches\n",
381
+ "#GS 4 .. discordance\n";
382
+ print_gs($opts,\%summary);
383
+
384
+ print
385
+ "\n",
386
+ "#GC Genotype Comparison. Use `grep ^GC | cut -f 2-` to extract this part.\n",
387
+ "#GC The columns are:\n",
388
+ "#GC 1 .. Sample\n",
389
+ "#GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA \n",
390
+ "#GC 7-9 .. Gtype lost: total het_RA het_AA \n",
391
+ "#GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA \n",
392
+ "#GC 15-17 .. Phase lost: total het_RA het_AA \n",
393
+ "#GC 18 .. Phase gained\n",
394
+ "#GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA \n",
395
+ "#GC 24-26 .. Phased matches: total het_RA het_AA \n",
396
+ "#GC 27-29 .. Misphased matches: total het_RA het_AA \n";
397
+
398
+ for my $id (keys %{$$opts{hapls}})
399
+ {
400
+ print "GC\t$id";
401
+ for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_mismatch'}; }
402
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_lost'}; }
403
+ for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_gained'}; }
404
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_lost'}; }
405
+ if ( !exists($$opts{hapls}{$id}{phase_gained}) ) { $$opts{hapls}{$id}{phase_gained}=0; }
406
+ print "\t",$$opts{hapls}{$id}{phase_gained};
407
+ for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_match'}; }
408
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_match'}; }
409
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_mismatch'}; }
410
+ print "\n";
411
+ }
412
+
413
+ print
414
+ "#AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part.\n",
415
+ "#AF The columns are:\n",
416
+ "#AF 1 .. Non-ref allele count\n",
417
+ "#AF 2 .. Hom(RR) matches\n",
418
+ "#AF 3 .. Het(RA) matches\n",
419
+ "#AF 4 .. Hom(AA) matches\n",
420
+ "#AF 5 .. Het(AA) matches\n",
421
+ "#AF 6 .. Hom(RR) mismatches\n",
422
+ "#AF 7 .. Het(RA) mismatches\n",
423
+ "#AF 8 .. Hom(AA) mismatches\n",
424
+ "#AF 9 .. Het(AA) mismatches\n";
425
+ for my $ac (sort {$a<=>$b} keys %{$$opts{counts_by_af}})
426
+ {
427
+ print "AF\t$ac";
428
+ for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
429
+ {
430
+ print "\t", $$opts{counts_by_af}{$ac}{$key}{matches} ? $$opts{counts_by_af}{$ac}{$key}{matches} : 0;
431
+ }
432
+ for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
433
+ {
434
+ print "\t", $$opts{counts_by_af}{$ac}{$key}{mismatches} ? $$opts{counts_by_af}{$ac}{$key}{mismatches} : 0;
435
+ }
436
+ print "\n";
437
+ }
438
+
439
+ print "#DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part.\n";
440
+ print "#DP The columns are:\n";
441
+ print "#DP 1 .. depth\n";
442
+ print "#DP 2 .. RR matches\n";
443
+ print "#DP 3 .. RA matches\n";
444
+ print "#DP 4 .. AA matches\n";
445
+ print "#DP 5 .. RR -> RA mismatches\n";
446
+ print "#DP 6 .. RR -> AA mismatches\n";
447
+ print "#DP 7 .. RA -> RR mismatches\n";
448
+ print "#DP 8 .. RA -> AA mismatches\n";
449
+ print "#DP 9 .. AA -> RR mismatches\n";
450
+ print "#DP 10 .. AA -> RA mismatches\n";
451
+ for my $dp (sort {$a<=>$b} keys %{$$opts{counts_by_dp}})
452
+ {
453
+ print "DP\t$dp";
454
+ for my $type qw(hom_RR_-hom_RR_ het_RA_-het_RA_ hom_AA_-hom_AA_ hom_RR_-het_RA_ hom_RR_-hom_AA_ het_RA_-hom_RR_ het_RA_-hom_AA_ hom_AA_-hom_RR_ hom_AA_-het_RA_)
455
+ {
456
+ printf "\t%d", exists($$opts{counts_by_dp}{$dp}{$type}) ? $$opts{counts_by_dp}{$dp}{$type} : 0;
457
+ }
458
+ print "\n";
459
+ }
460
+
461
+ if ( $$opts{debug} )
462
+ {
463
+ print "#MT Mismatch Types\n";
464
+ for my $t1 (keys %{$$opts{mismatch_types}})
465
+ {
466
+ for my $t2 (keys %{$$opts{mismatch_types}{$t1}})
467
+ {
468
+ print "MT\t$t1\t$t2\t$$opts{mismatch_types}{$t1}{$t2}\n";
469
+ }
470
+ }
471
+ }
472
+ }
473
+
474
+ sub print_gs
475
+ {
476
+ my ($opts,$stats) = @_;
477
+ my ($ndr_ms,$ndr_m);
478
+ for my $key qw(hom_RR het_RA hom_AA het_AA)
479
+ {
480
+ my $m = $$stats{"${key}_"}{match};
481
+ my $ms = $$stats{"${key}_"}{mismatch};
482
+ if ( !$m ) { $m=0; }
483
+ if ( !$ms ) { $ms=0; }
484
+ printf "GS\t$key\t%d\t%d\t%.2f%%\n", $ms,$m,$m?$ms*100./($m+$ms):0;
485
+ $ndr_ms += $ms;
486
+ $ndr_m += $key eq 'hom_RR' ? 0 : $m;
487
+ }
488
+ printf
489
+ "SN\tNon-reference Discordance Rate (NDR):\t%.2f\n", $ndr_m+$ndr_ms ? $ndr_ms*100./($ndr_m+$ndr_ms) : 0;
490
+ }
491
+
492
+ sub read_stats
493
+ {
494
+ my ($stats,$file) = @_;
495
+ open(my $fh,'<',$file) or error("$file: $!");
496
+ while (my $line=<$fh>)
497
+ {
498
+ if ( $line=~/^#/ ) { next; }
499
+ my @items = split(/\t/,$line);
500
+ chomp($items[-1]);
501
+ if ( $items[0] eq 'DP' )
502
+ {
503
+ my $dp = $items[1];
504
+ $$stats{dp}{ndist}{$dp} += $items[2] + $items[3] + $items[4] + $items[5] + $items[6] + $items[7] + $items[8] + $items[9] + $items[10];
505
+ $$stats{dp}{RR}{RA}{$dp} += $items[5];
506
+ $$stats{dp}{n}{RR}{RA} += $items[5];
507
+ $$stats{dp}{RR}{AA}{$dp} += $items[6];
508
+ $$stats{dp}{n}{RR}{AA} += $items[6];
509
+ $$stats{dp}{RA}{RR}{$dp} += $items[7];
510
+ $$stats{dp}{n}{RA}{RR} += $items[7];
511
+ $$stats{dp}{RA}{AA}{$dp} += $items[8];
512
+ $$stats{dp}{n}{RA}{AA} += $items[8];
513
+ $$stats{dp}{AA}{RR}{$dp} += $items[9];
514
+ $$stats{dp}{n}{AA}{RR} += $items[9];
515
+ $$stats{dp}{AA}{RA}{$dp} += $items[10];
516
+ $$stats{dp}{n}{AA}{RA} += $items[10];
517
+ }
518
+ if ( $items[0] eq 'AF' )
519
+ {
520
+ my $af = $items[1];
521
+ $$stats{af}{RR}{$af}{matches} += $items[2];
522
+ $$stats{af}{RA}{$af}{matches} += $items[3];
523
+ $$stats{af}{AA}{$af}{matches} += $items[4];
524
+ $$stats{af}{RR}{$af}{mismatches} += $items[6];
525
+ $$stats{af}{RA}{$af}{mismatches} += $items[7];
526
+ $$stats{af}{AA}{$af}{mismatches} += $items[8];
527
+ }
528
+ if ( $items[0] eq 'GS' )
529
+ {
530
+ my $type = $items[1];
531
+ $$stats{gs}{$type.'_'}{mismatch} += $items[2];
532
+ $$stats{gs}{$type.'_'}{match} += $items[3];
533
+ }
534
+ }
535
+ close($fh);
536
+ }
537
+
538
+ sub plot_stats
539
+ {
540
+ my ($opts) = @_;
541
+ my $stats = {};
542
+ for my $file (@{$$opts{files}})
543
+ {
544
+ read_stats($stats,$file);
545
+ }
546
+ plot_dp($opts,$$stats{dp});
547
+ plot_af($opts,$$stats{af});
548
+ print_gs($opts,$$stats{gs});
549
+ }
550
+
551
+ sub plot
552
+ {
553
+ my ($file) = @_;
554
+ system("GDFONTPATH=/usr/share/fonts/truetype/ttf-dejavu/ gnuplot $file");
555
+ }
556
+
557
+ sub plot_dp
558
+ {
559
+ my ($opts,$stats) = @_;
560
+
561
+ my $out;
562
+ my @plots;
563
+ for my $agt (sort keys %$stats)
564
+ {
565
+ if ( $agt eq 'n' or $agt eq 'ndist' ) { next; }
566
+ for my $bgt (sort keys %{$$stats{$agt}})
567
+ {
568
+ if ( $bgt eq 'n' ) { next; }
569
+ for my $dp (sort {$a<=>$b} keys %{$$stats{$agt}{$bgt}})
570
+ {
571
+ $out .= $dp . "\t" . ($$stats{n}{$agt}{$bgt} ? $$stats{$agt}{$bgt}{$dp}*100./$$stats{n}{$agt}{$bgt} : 0) . "\n";
572
+ }
573
+ $out .= "end\n";
574
+ push @plots, qq["-" using 1:2 with linespoints pt 12 title "$agt -> $bgt"];
575
+ }
576
+ }
577
+
578
+ open(my $fh,'>',"$$opts{plot}-dp.gp") or error("$$opts{plot}-dp.gp: $!");
579
+ print $fh q[
580
+ set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
581
+ set output "] . "$$opts{plot}-dp.png" . q["
582
+ set ylabel 'Fraction of GTs [%]'
583
+ set y2label 'Number of GTs total'
584
+ set y2tics
585
+ set ytics nomirror
586
+ set xlabel 'Depth'
587
+ set xrange [:20]
588
+ ];
589
+ if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
590
+ print $fh "plot ", join(',',@plots), qq[, '-' using 1:2 axes x1y2 with lines lt 0 title "GTs total"\n];
591
+ print $fh $out;
592
+ for my $dp (sort {$a<=>$b} keys %{$$stats{ndist}})
593
+ {
594
+ print $fh "$dp\t$$stats{ndist}{$dp}\n";
595
+ }
596
+ print $fh "end\n";
597
+ close($fh);
598
+
599
+ plot("$$opts{plot}-dp.gp");
600
+ }
601
+
602
+ sub plot_af
603
+ {
604
+ my ($opts,$stats) = @_;
605
+
606
+ open(my $fh,'>',"$$opts{plot}-af.gp") or error("$$opts{plot}-af.gp: $!");
607
+ if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
608
+ print $fh q[
609
+ set terminal png size 550,400 truecolor font "DejaVuSansMono,9"
610
+ set output "] . "$$opts{plot}-af.png" . q["
611
+ set grid back lc rgb "#dddddd"
612
+ set xlabel "Non-reference allele frequency"
613
+ set ylabel "Concordance"
614
+ set y2label "Number of genotypes"
615
+ set yrange [0.0:1.0]
616
+ set y2tics
617
+ set key center
618
+
619
+ plot '-' axes x1y2 with lines lw 1 lc rgb "red" notitle, \
620
+ '-' axes x1y2 with lines lw 1 lc rgb "green" notitle, \
621
+ '-' axes x1y2 with lines lw 1 lc rgb "blue" notitle, \
622
+ '-' with points pt 20 lc rgb "red" title "HomRef", \
623
+ '-' with points pt 20 lc rgb "green" title "Het", \
624
+ '-' with points pt 20 lc rgb "blue" title "HomAlt"
625
+ ];
626
+
627
+ for my $type qw(RR RA AA)
628
+ {
629
+ for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
630
+ {
631
+ print $fh "$af\t" . ($$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}) . "\n";
632
+ }
633
+ print $fh "end\n";
634
+ }
635
+ for my $type qw(RR RA AA)
636
+ {
637
+ for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
638
+ {
639
+ my $n = $$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches};
640
+ print $fh "$af\t" . ($n ? 1-$$stats{$type}{$af}{mismatches}/$n : -1) . "\n";
641
+ }
642
+ print $fh "end\n";
643
+ }
644
+ close($fh);
645
+
646
+ plot("$$opts{plot}-af.gp");
647
+ }
648
+
649
+ sub do_region_stats
650
+ {
651
+ my ($opts,$vcfs) = @_;
652
+
653
+ my $refseq;
654
+ if ( $$opts{refseq} ) { $refseq = FaSlice->new(file=>$$opts{refseq}, size=>1_000_000); }
655
+
656
+ my $nvcfs = scalar @$vcfs;
657
+ my $debug = $$opts{debug} ? $$opts{debug} : 0;
658
+ my $match = $$opts{match};
659
+ my $win = $$opts{win} ? $$opts{win} : 0;
660
+
661
+ while (1)
662
+ {
663
+ my $grp = read_next_group($vcfs,$win);
664
+ if ( !$grp || !scalar @$grp ) { last }
665
+
666
+ if ( $debug>1 )
667
+ {
668
+ print "Group:\n";
669
+ for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; }
670
+ print "\n";
671
+ }
672
+
673
+ my %files;
674
+ for my $rec (@$grp)
675
+ {
676
+ $files{$$rec{vcf}{file}} = 1;
677
+ }
678
+ my $key = join(' ',sort(keys %files));
679
+ $$match{$key}++;
680
+
681
+ my $npresent = scalar keys %files;
682
+ if ( $npresent == $nvcfs )
683
+ {
684
+ ref_alt_stats($opts,$grp);
685
+ }
686
+
687
+ if ( $npresent>1 && defined $refseq )
688
+ {
689
+ cmp_sequence($opts,$grp,$refseq);
690
+ }
691
+
692
+ if ( $$opts{cmp_genotypes} )
693
+ {
694
+ # Check that in the group there is one record for each file
695
+ if ( $npresent==$nvcfs && scalar @$grp==$nvcfs )
696
+ {
697
+ cmp_genotypes($opts,$grp);
698
+ }
699
+ }
700
+ }
701
+ }
702
+
703
+ sub cmp_sequence
704
+ {
705
+ my ($opts,$grp,$fa_refseq) = @_;
706
+
707
+ # Detailed comparison will be performed only if there are indels or complex
708
+ # substitutions, SNPs are interesting only in their presence. There can be
709
+ # more events from the same file present simultaneously and at multiple
710
+ # positions. They all are treated as separate variants and if any of them
711
+ # yields a haplotype present in all files, match is reported.
712
+ # Note that the original version of the code expected all alternate
713
+ # variants to be present on a single VCF line and was able to compare
714
+ # consecutive non-overlapping events as one sequence. However, because the
715
+ # the major producer of indel calls (Dindel) does report one variant per
716
+ # line, this idea was abandoned.
717
+
718
+ # Check if there are any interesting events.
719
+ my %has_indels;
720
+ my %events_per_file;
721
+ my $vcf = $$grp[0]{vcf};
722
+ for (my $igrp=0; $igrp<@$grp; $igrp++)
723
+ {
724
+ my $rec = $$grp[$igrp];
725
+ my $ifile = $$rec{vcf}{vcf_compare_ID};
726
+
727
+ my $ref_len = length($$rec{ref});
728
+ my @alts = split(/,/,$$rec{alt});
729
+ for my $alt (@alts)
730
+ {
731
+ if ( $alt eq '.' ) { next; }
732
+ if ( $alt=~/^</ ) { next; }
733
+ my $alt_len = length($alt);
734
+ push @{$events_per_file{$ifile}}, { pos=>$$rec{pos}, alt=>$alt, ref_len=>$ref_len };
735
+
736
+ # Do complex checking of event type only if it is still not certain if this is waste of time or not
737
+ if ( exists($has_indels{$ifile}) ) { next; }
738
+
739
+ if ( $ref_len!=$alt_len ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
740
+ elsif ( $ref_len>1 )
741
+ {
742
+ my ($type,$len,$ht) = $vcf->event_type($$rec{ref},$alt);
743
+ if ( $type eq 'o' ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
744
+ }
745
+ }
746
+ }
747
+
748
+ # Return if there is nothing interesting
749
+ if ( scalar keys %has_indels < 2 ) { return; }
750
+
751
+ for my $ifile (keys %events_per_file)
752
+ {
753
+ if ( !exists($has_indels{$ifile}) ) { delete($events_per_file{$ifile}); }
754
+ }
755
+
756
+ # Cache the reference sequence chunk
757
+ my $ref_from = $$grp[0]{pos} - $$opts{win};
758
+ my $ref_to = $$grp[-1]{pos} + $$opts{win};
759
+ my $refseq = $fa_refseq->get_slice($$grp[0]{chr},$ref_from,$ref_to);
760
+
761
+ # For each file get all possible sequences
762
+ for my $events (values %events_per_file)
763
+ {
764
+ for my $variant (@$events)
765
+ {
766
+ my $pos = $$variant{pos};
767
+ my $len = $pos - $ref_from;
768
+ my $seq = $len>0 ? substr($refseq,0,$len) : '';
769
+ $seq .= $$variant{alt};
770
+
771
+ $pos += $$variant{ref_len};
772
+ if ( $pos<=$ref_to )
773
+ {
774
+ $seq .= substr($refseq,$pos-$ref_from);
775
+ }
776
+
777
+ $$variant{seq} = $seq;
778
+ $$variant{length} = length($seq);
779
+ }
780
+ }
781
+
782
+ # Now compare the variants: is there a sequence shared across all files?
783
+ my $match = 1;
784
+ my @keys = keys %events_per_file;
785
+ for (my $ikey=0; $ikey<@keys; $ikey++)
786
+ {
787
+ my $ivars = $events_per_file{$ikey};
788
+ for (my $jkey=0; $jkey<$ikey; $jkey++)
789
+ {
790
+ my $jvars = $events_per_file{$jkey};
791
+ my $found = 0;
792
+ for my $ivar (@$ivars)
793
+ {
794
+ for my $jvar (@$jvars)
795
+ {
796
+ if ( $$ivar{length} != $$jvar{length} ) { next; }
797
+ if ( $$ivar{seq} ne $$jvar{seq} ) { next; }
798
+ $found=1;
799
+ last;
800
+ }
801
+ }
802
+ if ( !$found ) { $match=0; last; }
803
+ }
804
+ if ( !$match ) { last; }
805
+ }
806
+
807
+ my $key = join(' ',sort(values %has_indels));
808
+ if ( $match )
809
+ {
810
+ $$opts{indels}{$key}{match}++;
811
+ }
812
+ else
813
+ {
814
+ $$opts{indels}{$key}{mismatch}++;
815
+ }
816
+ }
817
+
818
+ sub ref_alt_stats
819
+ {
820
+ my ($opts,$grp) = @_;
821
+
822
+ my $ref = $$grp[0]{ref};
823
+ my $alt = join(',',sort split(/,/,$$grp[0]{alt}));
824
+
825
+ my $alt_mismatch = 0;
826
+ for (my $i=1; $i<@$grp; $i++)
827
+ {
828
+ my $rec = $$grp[$i];
829
+
830
+ if ( $ref ne $$rec{ref} )
831
+ {
832
+ $$opts{ref_mismatch}++;
833
+ return;
834
+ }
835
+
836
+ my $tmp = join(',',sort split(/,/,$$rec{alt}));
837
+ if ( $alt ne $tmp )
838
+ {
839
+ $alt_mismatch = 1;
840
+ }
841
+ }
842
+ if ( $alt ne '.' )
843
+ {
844
+ if ( $alt_mismatch ) { $$opts{alt_mismatch}++; }
845
+ else { $$opts{alt_match}++; }
846
+ }
847
+ $$opts{ref_match}++;
848
+ }
849
+
850
+
851
+ sub snp_type
852
+ {
853
+ my ($als,$ref) = @_;
854
+
855
+ # Determine SNP type: hom(RR),het(RA),hom(AA) or het(AA)
856
+ if ( $$als[0] eq $$als[1] )
857
+ {
858
+ if ( $$als[0] eq $ref ) { return 'hom_RR_'; }
859
+ else { return 'hom_AA_'; }
860
+ }
861
+ else
862
+ {
863
+ if ( $$als[0] eq $ref or $$als[1] eq $ref ) { return 'het_RA_'; }
864
+ else { return 'het_AA_'; }
865
+ }
866
+ }
867
+
868
+ sub cmp_genotypes
869
+ {
870
+ my ($opts,$grp) = @_;
871
+ my $nrecs = @$grp;
872
+ my $hapls = $$opts{hapls};
873
+
874
+ # Break the VCF lines into hashes (required by parse_haplotype)
875
+ for my $grp_rec (@$grp)
876
+ {
877
+ $$grp_rec{rec} = $$grp_rec{vcf}->next_data_hash($$grp_rec{line});
878
+ if ( $$opts{ignore_indels} && exists($$grp_rec{rec}{INFO}{INDEL}) ) { return; }
879
+ if ( exists($$grp_rec{vcf}{_col_mapping}) )
880
+ {
881
+ my %new_cols;
882
+ while (my ($name_ori,$name_new) = each %{$$grp_rec{vcf}{_col_mapping}})
883
+ {
884
+ $new_cols{$name_new} = $$grp_rec{rec}{gtypes}{$name_ori};
885
+ delete($$grp_rec{rec}{gtypes}{$name_ori});
886
+ }
887
+ while (my ($name,$hash) = each %new_cols)
888
+ {
889
+ $$grp_rec{rec}{gtypes}{$name} = $hash;
890
+ }
891
+ }
892
+ }
893
+ if ( $$grp[0]{vcf}{vcf_compare_ID} != 0 ) { error("FIXME: different order than expected: $$grp[0]{vcf}{vcf_compare_ID}\n"); }
894
+ my $ref = $$grp[0]{rec}{REF};
895
+
896
+ my %gtype_matches = ();
897
+ my %gtype_mismatches = ();
898
+
899
+ my $min_dp;
900
+ my $ndp3 = 0;
901
+ for my $id (keys %{$$grp[0]{rec}{gtypes}})
902
+ {
903
+ my (@sorted_als1,$nploid,$type);
904
+
905
+ my ($als1,$seps1,$is_phased1,$is_empty1) = $$grp[0]{vcf}->parse_haplotype($$grp[0]{rec},$id);
906
+ if ( !$is_empty1 )
907
+ {
908
+ @sorted_als1 = sort @$als1;
909
+ $nploid = scalar @sorted_als1;
910
+ $type = snp_type($als1,$ref);
911
+ }
912
+
913
+ if ( exists($$opts{high_confidence_gls}) )
914
+ {
915
+ my @gls = split(/,/,$$grp[1]{rec}{gtypes}{$id}{GL});
916
+ if ( @gls!=3 or $gls[0] eq '.' ) { next; }
917
+ @gls = sort {$b<=>$a} @gls;
918
+ if ( abs($gls[0]-$gls[1])<$$opts{high_confidence_gls} ) { next; }
919
+ }
920
+
921
+ # There may be multiple files entering the comparison. Report match only if all are present and all match.
922
+ # Report mismatch if all are present and they do not match. Otherwise report lost/gained event.
923
+ my $phase_match = 1;
924
+ my $gtype_match = 1;
925
+ my $gtype_lost = 0;
926
+ my $gtype_gained = 0;
927
+ my $phase_lost = 0;
928
+ my $phase_gained = 0;
929
+ my $type2;
930
+ for (my $i=1; $i<$nrecs; $i++)
931
+ {
932
+ my ($als2,$seps2,$is_phased2,$is_empty2) = $$grp[$i]{vcf}->parse_haplotype($$grp[$i]{rec},$id);
933
+ if ( $is_empty1 )
934
+ {
935
+ $gtype_match = 0;
936
+ if ( !$is_empty2 )
937
+ {
938
+ $gtype_gained = 1;
939
+ $type = snp_type($als2,$ref);
940
+ }
941
+ if ( !$is_phased1 && $is_phased2 ) { $phase_gained = 1; }
942
+ last;
943
+ }
944
+ elsif ( $is_empty2 )
945
+ {
946
+ $gtype_match = 0;
947
+ $gtype_lost = 1;
948
+ last;
949
+ }
950
+ if ( $is_phased1 )
951
+ {
952
+ if ( !$is_phased2 )
953
+ {
954
+ $phase_lost = 1;
955
+ $phase_match = 0;
956
+ }
957
+ }
958
+ elsif ( $is_phased2 )
959
+ {
960
+ $phase_gained = 1;
961
+ $phase_match = 0;
962
+ }
963
+ else { $phase_match = 0; }
964
+
965
+ # Consider different number of alleles as mismatch (C vs C/C)
966
+ if ( scalar @$als1 != scalar @$als2 )
967
+ {
968
+ $gtype_match = 0;
969
+ if ( $$opts{debug} ) { $$opts{mismatch_types}{$type}{'Allele_Count'}++ }
970
+ last;
971
+ }
972
+
973
+ my @sorted_als2 = sort @$als2;
974
+ for (my $ial=0; $ial<$nploid; $ial++)
975
+ {
976
+ if ( $sorted_als1[$ial] ne $sorted_als2[$ial] )
977
+ {
978
+ $gtype_match = 0;
979
+ if ( $$opts{debug} )
980
+ {
981
+ my $type2 = snp_type($als2,$ref);
982
+ $$opts{mismatch_types}{$type}{$type2}++;
983
+ }
984
+ last;
985
+ }
986
+ }
987
+
988
+ if ( !$gtype_match )
989
+ {
990
+ if ( !defined $type2 && !$is_empty2 )
991
+ {
992
+ $type2 = snp_type($als2,$ref);
993
+ }
994
+ last;
995
+ }
996
+
997
+ # They match, check also if their phase agrees
998
+ if ( $phase_match && $is_phased1 && $is_phased2 )
999
+ {
1000
+ for (my $ial=0; $ial<$nploid; $ial++)
1001
+ {
1002
+ if ( $$als1[$ial] ne $$als2[$ial] ) { $phase_match=0; last; }
1003
+ }
1004
+ }
1005
+ }
1006
+ if ( $gtype_gained )
1007
+ {
1008
+ $$hapls{$id}{$type.'gtype_gained'}++;
1009
+ if ( $phase_gained ) { $$hapls{$id}{phased_gtype_gained}++ }
1010
+ next;
1011
+ }
1012
+ if ( $gtype_lost ) { $$hapls{$id}{$type.'gtype_lost'}++; next; }
1013
+
1014
+ if ( $phase_gained ) { $$hapls{$id}{phase_gained}++ }
1015
+ elsif ( $phase_lost ) { $$hapls{$id}{$type.'phase_lost'}++ }
1016
+
1017
+ my $dp = exists($$grp[1]{rec}{gtypes}{$id}{DP}) ? $$grp[1]{rec}{gtypes}{$id}{DP} : -1;
1018
+ if ( $gtype_match )
1019
+ {
1020
+ $$hapls{$id}{$type.'gtype_match'}++;
1021
+ if ( $phase_match ) { $$hapls{$id}{$type.'phase_match'}++ }
1022
+ $gtype_matches{$type}++;
1023
+ $$opts{counts_by_dp}{$dp}{$type.'-'.$type}++;
1024
+ }
1025
+ elsif ( defined $type )
1026
+ {
1027
+ $$hapls{$id}{$type.'gtype_mismatch'}++;
1028
+ $gtype_mismatches{$type}++;
1029
+ $$opts{counts_by_dp}{$dp}{$type.'-'.$type2}++;
1030
+ }
1031
+ }
1032
+ $$opts{hapls_ncmp}++;
1033
+
1034
+ # Store the number of matching types by AC
1035
+ my $af;
1036
+ if ( $$opts{INFO_AF1_af} && exists($$grp[1]{rec}{INFO}{AF1}) )
1037
+ {
1038
+ $af = sprintf "%.2f", $$grp[1]{rec}{INFO}{AF1};
1039
+ }
1040
+ elsif ( !$$opts{all_samples_af} )
1041
+ {
1042
+ my $ac = 0;
1043
+ my $an = 0;
1044
+ if ( exists($gtype_matches{hom_AA_}) )
1045
+ {
1046
+ $ac += 2*$gtype_matches{hom_AA_};
1047
+ $an += 2*$gtype_matches{hom_AA_};
1048
+ }
1049
+ if ( exists($gtype_mismatches{hom_AA_}) )
1050
+ {
1051
+ $ac += 2*$gtype_mismatches{hom_AA_};
1052
+ $an += 2*$gtype_mismatches{hom_AA_};
1053
+ }
1054
+ if ( exists($gtype_matches{het_RA_}) )
1055
+ {
1056
+ $ac += $gtype_matches{het_RA_};
1057
+ $an += 2*$gtype_matches{het_RA_};
1058
+ }
1059
+ if ( exists($gtype_mismatches{het_RA_}) )
1060
+ {
1061
+ $ac += $gtype_mismatches{het_RA_};
1062
+ $an += 2*$gtype_mismatches{het_RA_};
1063
+ }
1064
+ if ( exists($gtype_matches{hom_RR_}) ) { $an += 2*$gtype_matches{hom_RR_}; }
1065
+ if ( exists($gtype_mismatches{hom_RR_}) ) { $an += 2*$gtype_mismatches{hom_RR_}; }
1066
+ $af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
1067
+ }
1068
+ else
1069
+ {
1070
+ my ($an,$ac) = $$grp[0]{vcf}->calc_an_ac($$grp[0]{rec}{gtypes});
1071
+ $af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
1072
+ }
1073
+
1074
+ for my $type (keys %gtype_matches)
1075
+ {
1076
+ $$opts{counts_by_af}{$af}{$type}{matches} += $gtype_matches{$type};
1077
+ $$opts{gtypes_cmp_total} += $gtype_matches{$type};
1078
+ }
1079
+ for my $type (keys %gtype_mismatches)
1080
+ {
1081
+ $$opts{counts_by_af}{$af}{$type}{mismatches} += $gtype_mismatches{$type};
1082
+ $$opts{gtypes_cmp_total} += $gtype_mismatches{$type};
1083
+ }
1084
+
1085
+ if ( $$opts{debug} )
1086
+ {
1087
+ my $match = '?';
1088
+ if ( scalar keys %gtype_mismatches ) { $match = '-' }
1089
+ elsif ( scalar keys %gtype_matches ) { $match = '+' }
1090
+ my $hom_rr_mm = $gtype_mismatches{hom_RR_} ? $gtype_mismatches{hom_RR_} : 0;
1091
+ my $het_ra_mm = $gtype_mismatches{het_RA_} ? $gtype_mismatches{het_RA_} : 0;
1092
+ my $hom_aa_mm = $gtype_mismatches{hom_AA_} ? $gtype_mismatches{hom_AA_} : 0;
1093
+ my $hom_rr_m = $gtype_matches{hom_RR_} ? $gtype_matches{hom_RR_} : 0;
1094
+ my $het_ra_m = $gtype_matches{het_RA_} ? $gtype_matches{het_RA_} : 0;
1095
+ my $hom_aa_m = $gtype_matches{hom_AA_} ? $gtype_matches{hom_AA_} : 0;
1096
+ my $hom_rr_c = sprintf "%.2f", ($hom_rr_mm or $hom_rr_m) ? $hom_rr_m*1./($hom_rr_m + $hom_rr_mm) : -1;
1097
+ my $het_ra_c = sprintf "%.2f", ($het_ra_mm or $het_ra_m) ? $het_ra_m*1./($het_ra_m + $het_ra_mm) : -1;
1098
+ my $hom_aa_c = sprintf "%.2f", ($hom_aa_mm or $hom_aa_m) ? $hom_aa_m*1./($hom_aa_m + $hom_aa_mm) : -1;
1099
+ print "SD\t$$grp[0]{rec}{CHROM}\t$$grp[0]{rec}{POS}\t$match\t$hom_rr_mm\t$het_ra_mm\t$hom_aa_mm\t$hom_rr_c\t$het_ra_c\t$hom_aa_c\n";
1100
+ }
1101
+ }
1102
+
1103
+
1104
+ sub read_next_group
1105
+ {
1106
+ my ($vcfs,$win) = @_;
1107
+
1108
+ my @grp;
1109
+ my $prev_vcf;
1110
+ my $start;
1111
+
1112
+ while (1)
1113
+ {
1114
+ my $min_vcf = get_min_position($vcfs);
1115
+ if ( !$min_vcf ) { last; }
1116
+ if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; }
1117
+ $prev_vcf = $$min_vcf{buf}[0];
1118
+
1119
+ if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} )
1120
+ {
1121
+ my $rec = shift(@{$$min_vcf{buf}});
1122
+ push @grp,$rec;
1123
+
1124
+ $start = $$rec{pos};
1125
+ next;
1126
+ }
1127
+ }
1128
+ return \@grp;
1129
+ }
1130
+
1131
+ sub get_min_position
1132
+ {
1133
+ my ($vcfs) = @_;
1134
+
1135
+ my ($min_pos,$min_vcf);
1136
+ for my $vcf (@$vcfs)
1137
+ {
1138
+ # Check if there is a line in the buffer, if not, read. If still empty, the file reached eof
1139
+ if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($vcf); }
1140
+ if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; }
1141
+
1142
+ my $line = $$vcf{buf}[0];
1143
+
1144
+ # Designate this position as the minimum of all the files if:
1145
+ # .. is this the first file?
1146
+ if ( !$min_pos )
1147
+ {
1148
+ $min_pos = $$line{pos};
1149
+ $min_vcf = $vcf;
1150
+ next;
1151
+ }
1152
+
1153
+ # .. has this file lower position?
1154
+ if ( $min_pos>$$line{pos} )
1155
+ {
1156
+ $min_pos = $$line{pos};
1157
+ $min_vcf = $vcf;
1158
+ next;
1159
+ }
1160
+ }
1161
+ return $min_vcf;
1162
+ }
1163
+
1164
+ sub read_line
1165
+ {
1166
+ my ($vcf) = @_;
1167
+
1168
+ if ( $$vcf{eof} ) { return; }
1169
+
1170
+ my $line = $vcf->next_line();
1171
+ if ( !$line )
1172
+ {
1173
+ $$vcf{eof} = 1;
1174
+ return;
1175
+ }
1176
+
1177
+ $$vcf{nread}++;
1178
+
1179
+ if ( !($line=~/^(\S+)\t(\S+)\t\S+\t(\S+)\t(\S+)/) ) { error("Could not parse the line: [$line]\n"); }
1180
+ my $chr = $1;
1181
+ my $pos = $2;
1182
+ my $ref = uc($3);
1183
+ my $alt = uc($4);
1184
+ if ( $$vcf{buf} && @{$$vcf{buf}} )
1185
+ {
1186
+ my $prev = $$vcf{buf}[-1];
1187
+ if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); }
1188
+ }
1189
+
1190
+ push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf };
1191
+ return;
1192
+ }
1193
+