ngs_server 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1193 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # Author: petr.danecek@sanger
4
+ #
5
+
6
+ use strict;
7
+ use warnings;
8
+ use Carp;
9
+ use Vcf;
10
+ use FaSlice;
11
+
12
+ my $opts = parse_params();
13
+ if ( exists($$opts{plot}) )
14
+ {
15
+ plot_stats($opts);
16
+ }
17
+ else
18
+ {
19
+ compare_vcfs($opts);
20
+ }
21
+
22
+ exit;
23
+
24
+ #--------------------------------
25
+
26
+ sub error
27
+ {
28
+ my (@msg) = @_;
29
+ if ( scalar @msg )
30
+ {
31
+ croak @msg;
32
+ }
33
+ die
34
+ "About: Compare bgzipped and tabix indexed VCF files. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n",
35
+ "Usage: vcf-compare [OPTIONS] file1.vcf file2.vcf ...\n",
36
+ " vcf-compare -p plots chr1.cmp chr2.cmp ...\n",
37
+ "Options:\n",
38
+ " -c, --chromosomes <list|file> Same as -r, left for backward compatibility. Please do not use as it will be dropped in the future.\n",
39
+ " -d, --debug Debugging information. Giving the option multiple times increases verbosity\n",
40
+ " -g, --cmp-genotypes Compare genotypes, not only positions\n",
41
+ " -m, --name-mapping <list|file> Use with -g when comparing files with differing column names. The argument to this options is a\n",
42
+ " comma-separated list or one mapping per line in a file. The names are colon separated and must\n",
43
+ " appear in the same order as the files on the command line.\n",
44
+ " -p, --plot <prefix> Create plots. Multiple files (e.g. per-chromosome outputs from vcf-compare) can be given.\n",
45
+ " -R, --refseq <file> Compare the actual sequence, not just positions. Use with -w to compare indels.\n",
46
+ " -r, --regions <list|file> Process the given regions (comma-separated list or one region per line in a file).\n",
47
+ " -s, --samples <list|file> Process only the listed samples. Excluding unwanted samples may increase performance considerably.\n",
48
+ " -t, --title <string> Title for graphs (see also -p)\n",
49
+ " -w, --win <int> In repetitive sequences, the same indel can be called at different positions. Consider\n",
50
+ " records this far apart as matching (be it a SNP or an indel).\n",
51
+ " -h, -?, --help This help message.\n",
52
+ "\n";
53
+ }
54
+
55
+
56
+ sub parse_params
57
+ {
58
+ my $opts = { positions=>0 };
59
+ while (my $arg=shift(@ARGV))
60
+ {
61
+ if ( $arg eq '--all-samples-af' ) { $$opts{all_samples_af}=1; next; }
62
+ if ( $arg eq '--INFO/AF1-af' ) { $$opts{INFO_AF1_af}=1; next; }
63
+ if ( $arg eq '--ignore-indels' ) { $$opts{ignore_indels}=1; next; }
64
+ if ( $arg eq '--high-conf-gls' ) { $$opts{high_confidence_gls}=shift(@ARGV); next; }
65
+ if ( $arg eq '-m' || $arg eq '--name-mapping' ) { $$opts{mappings_list}=shift(@ARGV); next; }
66
+ if ( $arg eq '-R' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; }
67
+ if ( $arg eq '-c' || $arg eq '--chromosomes' ) { $$opts{regions_list}=shift(@ARGV); next; }
68
+ if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; }
69
+ if ( $arg eq '-g' || $arg eq '--cmp-genotypes' ) { $$opts{cmp_genotypes}=1; next; }
70
+ if ( $arg eq '-s' || $arg eq '--samples' )
71
+ {
72
+ my $samples = shift(@ARGV);
73
+ my @samples = ( -e $samples ) ? read_list($samples) : split(/,/,$samples);
74
+ $$opts{samples} = \@samples;
75
+ next;
76
+ }
77
+ if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}++; next; }
78
+ if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; }
79
+ if ( $arg eq '-p' || $arg eq '--plot' ) { $$opts{plot}=shift(@ARGV); next; }
80
+ if ( $arg eq '-t' || $arg eq '--title' ) { $$opts{title}=shift(@ARGV); next; }
81
+ if ( -e $arg ) { push @{$$opts{files}}, $arg; next }
82
+ if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
83
+ error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
84
+ }
85
+ if ( !exists($$opts{files}) ) { error("What files should be compared?\n") }
86
+ return $opts;
87
+ }
88
+
89
+ sub read_list
90
+ {
91
+ my ($fname) = @_;
92
+ my @regions;
93
+ if ( -e $fname )
94
+ {
95
+ open(my $rgs,'<',$fname) or error("$fname: $!");
96
+ while (my $line=<$rgs>)
97
+ {
98
+ chomp($line);
99
+ push @regions, $line;
100
+ }
101
+ close($rgs);
102
+ }
103
+ else
104
+ {
105
+ @regions = split(/,/,$fname);
106
+ }
107
+ return (@regions);
108
+ }
109
+
110
+ sub read_mappings_list
111
+ {
112
+ my ($fname,$files) = @_;
113
+ my @maps = read_list($fname);
114
+ my %mapping;
115
+ for my $map (@maps)
116
+ {
117
+ my @items = split(/:/,$map);
118
+ if ( scalar @items != scalar @$files ) { error(sprintf "Expected %d column names, found [$map].\n", scalar @$files); }
119
+ for (my $i=1; $i<@$files; $i++)
120
+ {
121
+ $mapping{$$files[$i]}{$items[$i]} = $items[0];
122
+ warn("Using column name '$items[0]' for $$files[$i]:$items[$i]\n");
123
+ }
124
+ }
125
+ return \%mapping;
126
+ }
127
+
128
+ sub compare_vcfs
129
+ {
130
+ my ($opts) = @_;
131
+
132
+ $$opts{match} = {};
133
+ $$opts{hapls} = {};
134
+
135
+ # Open the VCF files and initialize the list of chromosomes
136
+ my @vcfs;
137
+ my (@regions,%has_chrom,$mappings);
138
+ if ( exists($$opts{regions_list}) ) { @regions = read_list($$opts{regions_list}); }
139
+ if ( exists($$opts{mappings_list}) ) { $mappings = read_mappings_list($$opts{mappings_list},$$opts{files}); }
140
+
141
+ print "# This file was generated by vcf-compare.\n#\n";
142
+ if ( $$opts{debug} )
143
+ {
144
+ print
145
+ "#SD Site discordance. Use `grep ^SD | cut -f 2-` to extract this part.\n",
146
+ "#SD The columns are: \n",
147
+ "#SD 1 .. chromosome\n",
148
+ "#SD 2 .. position\n",
149
+ "#SD 3 .. indicates matching (+) or mismatching (-) site\n",
150
+ "#SD 4 .. number of Hom_RR mismatches\n",
151
+ "#SD 5 .. number of Het_RA mismatches\n",
152
+ "#SD 6 .. number of Hom_AA mismatches\n";
153
+ }
154
+
155
+ my $ifile = 0;
156
+ for my $file (@{$$opts{files}})
157
+ {
158
+ my $vcf = Vcf->new(file=>$file);
159
+ $$vcf{vcf_compare_ID} = $ifile++;
160
+ $vcf->parse_header();
161
+ $vcf->close();
162
+ $$vcf{nread} = 0;
163
+ push @vcfs, $vcf;
164
+
165
+ # Update the list of known chromosomes
166
+ if ( !exists($$opts{regions_list}) )
167
+ {
168
+ my $chrms = $vcf->get_chromosomes();
169
+ for my $chr (@$chrms)
170
+ {
171
+ if ( exists($has_chrom{$chr}) ) { next; }
172
+ $has_chrom{$chr} = 1;
173
+ push @regions, $chr;
174
+ }
175
+ }
176
+
177
+ # Check if column names need to be renamed
178
+ if ( defined $mappings && exists($$mappings{$$vcf{file}}) )
179
+ {
180
+ $$vcf{_col_mapping} = $$mappings{$$vcf{file}};
181
+ for my $name (keys %{$$vcf{_col_mapping}})
182
+ {
183
+ if ( !exists($$vcf{has_column}{$name}) ) { error("No such column [$name] in the file $$vcf{file}\n"); }
184
+ my $new_name = $$vcf{_col_mapping}{$name};
185
+ $$vcf{_col_mapping_rev}{$new_name} = $name;
186
+ }
187
+ }
188
+ }
189
+
190
+ # Include only matching samples in haplotype comparison
191
+ if ( $$opts{cmp_genotypes} )
192
+ {
193
+ my %all_samples;
194
+ for my $vcf (@vcfs)
195
+ {
196
+ if ( exists $$opts{samples} )
197
+ {
198
+ for my $sample (@{$$opts{samples}})
199
+ {
200
+ if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
201
+ if ( exists($$vcf{has_column}{$sample}) ) { $all_samples{$sample}++ }
202
+ }
203
+ }
204
+ else
205
+ {
206
+ my @samples = $vcf->get_samples();
207
+ for my $sample (@samples)
208
+ {
209
+ if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; }
210
+ $all_samples{$sample}++
211
+ }
212
+ }
213
+ }
214
+ my @include_samples;
215
+ while (my ($sample,$count)=each %all_samples)
216
+ {
217
+ if ( $count != scalar @vcfs ) { next; }
218
+ push @include_samples, $sample;
219
+ }
220
+ if ( !@include_samples )
221
+ {
222
+ error("Error: There is no overlap between any of the samples, yet haplotype comparison was requested.\n");
223
+ }
224
+ $$opts{gt_samples_compared} = scalar @include_samples;
225
+ for my $vcf (@vcfs)
226
+ {
227
+ my @include;
228
+ if ( !exists($$vcf{_col_mapping}) ) { @include=@include_samples; }
229
+ else
230
+ {
231
+ for my $sample (@include_samples)
232
+ {
233
+ push @include, exists($$vcf{_col_mapping_rev}{$sample}) ? $$vcf{_col_mapping_rev}{$sample} : $sample
234
+ }
235
+ }
236
+ $vcf->set_samples(include=>\@include);
237
+ }
238
+ }
239
+
240
+ # Go through all the files simultaneously and get the stats.
241
+ for my $region (@regions)
242
+ {
243
+ # Open files
244
+ for my $vcf (@vcfs)
245
+ {
246
+ delete($$vcf{last_line});
247
+ $vcf->open(region=>$region,parse_header=>1);
248
+ delete($$vcf{eof});
249
+ }
250
+ do_region_stats($opts,\@vcfs);
251
+ }
252
+
253
+ report_stats($opts,\@vcfs);
254
+
255
+ for my $vcf (@vcfs)
256
+ {
257
+ if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); }
258
+ }
259
+ }
260
+
261
+ sub report_stats
262
+ {
263
+ my ($opts,$vcfs) = @_;
264
+
265
+ # if ( $$opts{debug} )
266
+ # {
267
+ # use Data::Dumper; print Dumper($opts);
268
+ # }
269
+
270
+ my (@counts,%totals);
271
+ while (my ($key,$num) = each %{$$opts{match}})
272
+ {
273
+ my @files = split(/\s+/,$key);
274
+ for my $file (@files)
275
+ {
276
+ $totals{$file} += $num;
277
+ }
278
+ push @counts, {count=>$num, files=>[@files]};
279
+ }
280
+
281
+ print
282
+ "#VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part.\n",
283
+ "#VN The columns are: \n",
284
+ "#VN 1 .. number of sites unique to this particular combination of files\n",
285
+ "#VN 2- .. combination of files and space-separated number, a fraction of sites in the file\n";
286
+ for my $rec (sort {$$a{count}<=>$$b{count}} @counts)
287
+ {
288
+ my $num = $$rec{count};
289
+ my $files = $$rec{files};
290
+
291
+ print "VN\t$num";
292
+ for my $file (@$files)
293
+ {
294
+ printf "\t$file (%.1f%%)", $num*100./$totals{$file};
295
+ }
296
+ print "\n";
297
+ }
298
+
299
+ if ( $$opts{refseq} && $$opts{indels} )
300
+ {
301
+ print
302
+ "#IN Indel Numbers. Use `grep ^IN | cut -f 2-` to extract this part.\n",
303
+ "#IN .. todo\n",
304
+ "#IN Number of matching indel haplotypes shared across:\n";
305
+ while (my ($file,$stat) = each %{$$opts{indels}})
306
+ {
307
+ print "IN\t$file\n";
308
+ my $match = $$stat{match} ? $$stat{match} : 0;
309
+ my $mismatch = $$stat{mismatch} ? $$stat{mismatch} : 0;
310
+ printf "\t\tNumber of matches: %d\n", $match;
311
+ printf "\t\t mismatches: %d\n", $mismatch;
312
+ printf "\t\t error rate: %.1f%%\n", 100*$mismatch/($match+$mismatch);
313
+ }
314
+ }
315
+
316
+ print "#SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n";
317
+ printf "SN\tNumber of REF matches:\t%d\n", exists($$opts{ref_match}) ? $$opts{ref_match} : 0;
318
+ printf "SN\tNumber of ALT matches:\t%d\n", exists($$opts{alt_match}) ? $$opts{alt_match} : 0;
319
+ printf "SN\tNumber of REF mismatches:\t%d\n", exists($$opts{ref_mismatch}) ? $$opts{ref_mismatch} : 0;
320
+ printf "SN\tNumber of ALT mismatches:\t%d\n", exists($$opts{alt_mismatch}) ? $$opts{alt_mismatch} : 0;
321
+ printf "SN\tNumber of samples in GT comparison:\t%d\n", $$opts{gt_samples_compared} ? $$opts{gt_samples_compared} : 0;
322
+
323
+ my $out;
324
+ for my $vcf (@$vcfs)
325
+ {
326
+ if ( !exists($totals{$$vcf{file}}) ) { $totals{$$vcf{file}}=0; }
327
+ if ( $totals{$$vcf{file}} == $$vcf{nread} ) { next; }
328
+
329
+ my $diff = $$vcf{nread}-$totals{$$vcf{file}};
330
+ my $reported = $totals{$$vcf{file}};
331
+ my $total = $$vcf{nread};
332
+ $out .= sprintf "SN\t%d (%.1f%%) .. read %d, reported %d\t%s\n", $diff,$diff*100./$total,$total,$reported,$$vcf{file};
333
+ }
334
+ if ( $out )
335
+ {
336
+ print "SN\tNumber of sites lost due to grouping (e.g. duplicate sites)\n";
337
+ print $out;
338
+ }
339
+
340
+
341
+ if ( !$$opts{cmp_genotypes} ) { return; }
342
+
343
+ my %summary;
344
+ for my $id (keys %{$$opts{hapls}})
345
+ {
346
+ for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
347
+ {
348
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_mismatch'}) ) { $$opts{hapls}{$id}{$key.'gtype_mismatch'}=0; }
349
+ $$opts{hapls}{$id}{total_gtype_mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
350
+
351
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_match'}) ) { $$opts{hapls}{$id}{$key.'gtype_match'}=0; }
352
+ $$opts{hapls}{$id}{total_gtype_match} += $$opts{hapls}{$id}{$key.'gtype_match'};
353
+
354
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_lost'}) ) { $$opts{hapls}{$id}{$key.'gtype_lost'}=0; }
355
+ $$opts{hapls}{$id}{total_gtype_lost} += $$opts{hapls}{$id}{$key.'gtype_lost'};
356
+
357
+ if ( !exists($$opts{hapls}{$id}{$key.'gtype_gained'}) ) { $$opts{hapls}{$id}{$key.'gtype_gained'}=0; }
358
+ $$opts{hapls}{$id}{total_gtype_gained} += $$opts{hapls}{$id}{$key.'gtype_gained'};
359
+
360
+ $summary{$key}{match} += $$opts{hapls}{$id}{$key.'gtype_match'};
361
+ $summary{$key}{mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'};
362
+ }
363
+ for my $key qw(het_RA_ het_AA_)
364
+ {
365
+ if ( !exists($$opts{hapls}{$id}{$key.'phase_match'}) ) { $$opts{hapls}{$id}{$key.'phase_match'}=0; }
366
+ $$opts{hapls}{$id}{total_phase_match} += $$opts{hapls}{$id}{$key.'phase_match'};
367
+
368
+ if ( !exists($$opts{hapls}{$id}{$key.'phase_mismatch'}) ) { $$opts{hapls}{$id}{$key.'phase_mismatch'}=0; }
369
+ $$opts{hapls}{$id}{total_phase_mismatch} += $$opts{hapls}{$id}{$key.'phase_mismatch'};
370
+
371
+ if ( !exists($$opts{hapls}{$id}{$key.'phase_lost'}) ) { $$opts{hapls}{$id}{$key.'phase_lost'}=0; }
372
+ $$opts{hapls}{$id}{total_phase_lost} += $$opts{hapls}{$id}{$key.'phase_lost'};
373
+ }
374
+ }
375
+ print
376
+ "#GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part.\n",
377
+ "#GS The columns are:\n",
378
+ "#GS 1 .. variant type\n",
379
+ "#GS 2 .. number of mismatches\n",
380
+ "#GS 3 .. number of matches\n",
381
+ "#GS 4 .. discordance\n";
382
+ print_gs($opts,\%summary);
383
+
384
+ print
385
+ "\n",
386
+ "#GC Genotype Comparison. Use `grep ^GC | cut -f 2-` to extract this part.\n",
387
+ "#GC The columns are:\n",
388
+ "#GC 1 .. Sample\n",
389
+ "#GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA \n",
390
+ "#GC 7-9 .. Gtype lost: total het_RA het_AA \n",
391
+ "#GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA \n",
392
+ "#GC 15-17 .. Phase lost: total het_RA het_AA \n",
393
+ "#GC 18 .. Phase gained\n",
394
+ "#GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA \n",
395
+ "#GC 24-26 .. Phased matches: total het_RA het_AA \n",
396
+ "#GC 27-29 .. Misphased matches: total het_RA het_AA \n";
397
+
398
+ for my $id (keys %{$$opts{hapls}})
399
+ {
400
+ print "GC\t$id";
401
+ for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_mismatch'}; }
402
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_lost'}; }
403
+ for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_gained'}; }
404
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_lost'}; }
405
+ if ( !exists($$opts{hapls}{$id}{phase_gained}) ) { $$opts{hapls}{$id}{phase_gained}=0; }
406
+ print "\t",$$opts{hapls}{$id}{phase_gained};
407
+ for my $key qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'gtype_match'}; }
408
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_match'}; }
409
+ for my $key qw(total_ het_RA_ het_AA_) { print "\t",$$opts{hapls}{$id}{$key.'phase_mismatch'}; }
410
+ print "\n";
411
+ }
412
+
413
+ print
414
+ "#AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part.\n",
415
+ "#AF The columns are:\n",
416
+ "#AF 1 .. Non-ref allele count\n",
417
+ "#AF 2 .. Hom(RR) matches\n",
418
+ "#AF 3 .. Het(RA) matches\n",
419
+ "#AF 4 .. Hom(AA) matches\n",
420
+ "#AF 5 .. Het(AA) matches\n",
421
+ "#AF 6 .. Hom(RR) mismatches\n",
422
+ "#AF 7 .. Het(RA) mismatches\n",
423
+ "#AF 8 .. Hom(AA) mismatches\n",
424
+ "#AF 9 .. Het(AA) mismatches\n";
425
+ for my $ac (sort {$a<=>$b} keys %{$$opts{counts_by_af}})
426
+ {
427
+ print "AF\t$ac";
428
+ for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
429
+ {
430
+ print "\t", $$opts{counts_by_af}{$ac}{$key}{matches} ? $$opts{counts_by_af}{$ac}{$key}{matches} : 0;
431
+ }
432
+ for my $key qw(hom_RR_ het_RA_ hom_AA_ het_AA_)
433
+ {
434
+ print "\t", $$opts{counts_by_af}{$ac}{$key}{mismatches} ? $$opts{counts_by_af}{$ac}{$key}{mismatches} : 0;
435
+ }
436
+ print "\n";
437
+ }
438
+
439
+ print "#DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part.\n";
440
+ print "#DP The columns are:\n";
441
+ print "#DP 1 .. depth\n";
442
+ print "#DP 2 .. RR matches\n";
443
+ print "#DP 3 .. RA matches\n";
444
+ print "#DP 4 .. AA matches\n";
445
+ print "#DP 5 .. RR -> RA mismatches\n";
446
+ print "#DP 6 .. RR -> AA mismatches\n";
447
+ print "#DP 7 .. RA -> RR mismatches\n";
448
+ print "#DP 8 .. RA -> AA mismatches\n";
449
+ print "#DP 9 .. AA -> RR mismatches\n";
450
+ print "#DP 10 .. AA -> RA mismatches\n";
451
+ for my $dp (sort {$a<=>$b} keys %{$$opts{counts_by_dp}})
452
+ {
453
+ print "DP\t$dp";
454
+ for my $type qw(hom_RR_-hom_RR_ het_RA_-het_RA_ hom_AA_-hom_AA_ hom_RR_-het_RA_ hom_RR_-hom_AA_ het_RA_-hom_RR_ het_RA_-hom_AA_ hom_AA_-hom_RR_ hom_AA_-het_RA_)
455
+ {
456
+ printf "\t%d", exists($$opts{counts_by_dp}{$dp}{$type}) ? $$opts{counts_by_dp}{$dp}{$type} : 0;
457
+ }
458
+ print "\n";
459
+ }
460
+
461
+ if ( $$opts{debug} )
462
+ {
463
+ print "#MT Mismatch Types\n";
464
+ for my $t1 (keys %{$$opts{mismatch_types}})
465
+ {
466
+ for my $t2 (keys %{$$opts{mismatch_types}{$t1}})
467
+ {
468
+ print "MT\t$t1\t$t2\t$$opts{mismatch_types}{$t1}{$t2}\n";
469
+ }
470
+ }
471
+ }
472
+ }
473
+
474
+ sub print_gs
475
+ {
476
+ my ($opts,$stats) = @_;
477
+ my ($ndr_ms,$ndr_m);
478
+ for my $key qw(hom_RR het_RA hom_AA het_AA)
479
+ {
480
+ my $m = $$stats{"${key}_"}{match};
481
+ my $ms = $$stats{"${key}_"}{mismatch};
482
+ if ( !$m ) { $m=0; }
483
+ if ( !$ms ) { $ms=0; }
484
+ printf "GS\t$key\t%d\t%d\t%.2f%%\n", $ms,$m,$m?$ms*100./($m+$ms):0;
485
+ $ndr_ms += $ms;
486
+ $ndr_m += $key eq 'hom_RR' ? 0 : $m;
487
+ }
488
+ printf
489
+ "SN\tNon-reference Discordance Rate (NDR):\t%.2f\n", $ndr_m+$ndr_ms ? $ndr_ms*100./($ndr_m+$ndr_ms) : 0;
490
+ }
491
+
492
+ sub read_stats
493
+ {
494
+ my ($stats,$file) = @_;
495
+ open(my $fh,'<',$file) or error("$file: $!");
496
+ while (my $line=<$fh>)
497
+ {
498
+ if ( $line=~/^#/ ) { next; }
499
+ my @items = split(/\t/,$line);
500
+ chomp($items[-1]);
501
+ if ( $items[0] eq 'DP' )
502
+ {
503
+ my $dp = $items[1];
504
+ $$stats{dp}{ndist}{$dp} += $items[2] + $items[3] + $items[4] + $items[5] + $items[6] + $items[7] + $items[8] + $items[9] + $items[10];
505
+ $$stats{dp}{RR}{RA}{$dp} += $items[5];
506
+ $$stats{dp}{n}{RR}{RA} += $items[5];
507
+ $$stats{dp}{RR}{AA}{$dp} += $items[6];
508
+ $$stats{dp}{n}{RR}{AA} += $items[6];
509
+ $$stats{dp}{RA}{RR}{$dp} += $items[7];
510
+ $$stats{dp}{n}{RA}{RR} += $items[7];
511
+ $$stats{dp}{RA}{AA}{$dp} += $items[8];
512
+ $$stats{dp}{n}{RA}{AA} += $items[8];
513
+ $$stats{dp}{AA}{RR}{$dp} += $items[9];
514
+ $$stats{dp}{n}{AA}{RR} += $items[9];
515
+ $$stats{dp}{AA}{RA}{$dp} += $items[10];
516
+ $$stats{dp}{n}{AA}{RA} += $items[10];
517
+ }
518
+ if ( $items[0] eq 'AF' )
519
+ {
520
+ my $af = $items[1];
521
+ $$stats{af}{RR}{$af}{matches} += $items[2];
522
+ $$stats{af}{RA}{$af}{matches} += $items[3];
523
+ $$stats{af}{AA}{$af}{matches} += $items[4];
524
+ $$stats{af}{RR}{$af}{mismatches} += $items[6];
525
+ $$stats{af}{RA}{$af}{mismatches} += $items[7];
526
+ $$stats{af}{AA}{$af}{mismatches} += $items[8];
527
+ }
528
+ if ( $items[0] eq 'GS' )
529
+ {
530
+ my $type = $items[1];
531
+ $$stats{gs}{$type.'_'}{mismatch} += $items[2];
532
+ $$stats{gs}{$type.'_'}{match} += $items[3];
533
+ }
534
+ }
535
+ close($fh);
536
+ }
537
+
538
+ sub plot_stats
539
+ {
540
+ my ($opts) = @_;
541
+ my $stats = {};
542
+ for my $file (@{$$opts{files}})
543
+ {
544
+ read_stats($stats,$file);
545
+ }
546
+ plot_dp($opts,$$stats{dp});
547
+ plot_af($opts,$$stats{af});
548
+ print_gs($opts,$$stats{gs});
549
+ }
550
+
551
+ sub plot
552
+ {
553
+ my ($file) = @_;
554
+ system("GDFONTPATH=/usr/share/fonts/truetype/ttf-dejavu/ gnuplot $file");
555
+ }
556
+
557
+ sub plot_dp
558
+ {
559
+ my ($opts,$stats) = @_;
560
+
561
+ my $out;
562
+ my @plots;
563
+ for my $agt (sort keys %$stats)
564
+ {
565
+ if ( $agt eq 'n' or $agt eq 'ndist' ) { next; }
566
+ for my $bgt (sort keys %{$$stats{$agt}})
567
+ {
568
+ if ( $bgt eq 'n' ) { next; }
569
+ for my $dp (sort {$a<=>$b} keys %{$$stats{$agt}{$bgt}})
570
+ {
571
+ $out .= $dp . "\t" . ($$stats{n}{$agt}{$bgt} ? $$stats{$agt}{$bgt}{$dp}*100./$$stats{n}{$agt}{$bgt} : 0) . "\n";
572
+ }
573
+ $out .= "end\n";
574
+ push @plots, qq["-" using 1:2 with linespoints pt 12 title "$agt -> $bgt"];
575
+ }
576
+ }
577
+
578
+ open(my $fh,'>',"$$opts{plot}-dp.gp") or error("$$opts{plot}-dp.gp: $!");
579
+ print $fh q[
580
+ set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
581
+ set output "] . "$$opts{plot}-dp.png" . q["
582
+ set ylabel 'Fraction of GTs [%]'
583
+ set y2label 'Number of GTs total'
584
+ set y2tics
585
+ set ytics nomirror
586
+ set xlabel 'Depth'
587
+ set xrange [:20]
588
+ ];
589
+ if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
590
+ print $fh "plot ", join(',',@plots), qq[, '-' using 1:2 axes x1y2 with lines lt 0 title "GTs total"\n];
591
+ print $fh $out;
592
+ for my $dp (sort {$a<=>$b} keys %{$$stats{ndist}})
593
+ {
594
+ print $fh "$dp\t$$stats{ndist}{$dp}\n";
595
+ }
596
+ print $fh "end\n";
597
+ close($fh);
598
+
599
+ plot("$$opts{plot}-dp.gp");
600
+ }
601
+
602
+ sub plot_af
603
+ {
604
+ my ($opts,$stats) = @_;
605
+
606
+ open(my $fh,'>',"$$opts{plot}-af.gp") or error("$$opts{plot}-af.gp: $!");
607
+ if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; }
608
+ print $fh q[
609
+ set terminal png size 550,400 truecolor font "DejaVuSansMono,9"
610
+ set output "] . "$$opts{plot}-af.png" . q["
611
+ set grid back lc rgb "#dddddd"
612
+ set xlabel "Non-reference allele frequency"
613
+ set ylabel "Concordance"
614
+ set y2label "Number of genotypes"
615
+ set yrange [0.0:1.0]
616
+ set y2tics
617
+ set key center
618
+
619
+ plot '-' axes x1y2 with lines lw 1 lc rgb "red" notitle, \
620
+ '-' axes x1y2 with lines lw 1 lc rgb "green" notitle, \
621
+ '-' axes x1y2 with lines lw 1 lc rgb "blue" notitle, \
622
+ '-' with points pt 20 lc rgb "red" title "HomRef", \
623
+ '-' with points pt 20 lc rgb "green" title "Het", \
624
+ '-' with points pt 20 lc rgb "blue" title "HomAlt"
625
+ ];
626
+
627
+ for my $type qw(RR RA AA)
628
+ {
629
+ for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
630
+ {
631
+ print $fh "$af\t" . ($$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}) . "\n";
632
+ }
633
+ print $fh "end\n";
634
+ }
635
+ for my $type qw(RR RA AA)
636
+ {
637
+ for my $af (sort {$a<=>$b} keys %{$$stats{$type}})
638
+ {
639
+ my $n = $$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches};
640
+ print $fh "$af\t" . ($n ? 1-$$stats{$type}{$af}{mismatches}/$n : -1) . "\n";
641
+ }
642
+ print $fh "end\n";
643
+ }
644
+ close($fh);
645
+
646
+ plot("$$opts{plot}-af.gp");
647
+ }
648
+
649
+ sub do_region_stats
650
+ {
651
+ my ($opts,$vcfs) = @_;
652
+
653
+ my $refseq;
654
+ if ( $$opts{refseq} ) { $refseq = FaSlice->new(file=>$$opts{refseq}, size=>1_000_000); }
655
+
656
+ my $nvcfs = scalar @$vcfs;
657
+ my $debug = $$opts{debug} ? $$opts{debug} : 0;
658
+ my $match = $$opts{match};
659
+ my $win = $$opts{win} ? $$opts{win} : 0;
660
+
661
+ while (1)
662
+ {
663
+ my $grp = read_next_group($vcfs,$win);
664
+ if ( !$grp || !scalar @$grp ) { last }
665
+
666
+ if ( $debug>1 )
667
+ {
668
+ print "Group:\n";
669
+ for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; }
670
+ print "\n";
671
+ }
672
+
673
+ my %files;
674
+ for my $rec (@$grp)
675
+ {
676
+ $files{$$rec{vcf}{file}} = 1;
677
+ }
678
+ my $key = join(' ',sort(keys %files));
679
+ $$match{$key}++;
680
+
681
+ my $npresent = scalar keys %files;
682
+ if ( $npresent == $nvcfs )
683
+ {
684
+ ref_alt_stats($opts,$grp);
685
+ }
686
+
687
+ if ( $npresent>1 && defined $refseq )
688
+ {
689
+ cmp_sequence($opts,$grp,$refseq);
690
+ }
691
+
692
+ if ( $$opts{cmp_genotypes} )
693
+ {
694
+ # Check that in the group there is one record for each file
695
+ if ( $npresent==$nvcfs && scalar @$grp==$nvcfs )
696
+ {
697
+ cmp_genotypes($opts,$grp);
698
+ }
699
+ }
700
+ }
701
+ }
702
+
703
+ sub cmp_sequence
704
+ {
705
+ my ($opts,$grp,$fa_refseq) = @_;
706
+
707
+ # Detailed comparison will be performed only if there are indels or complex
708
+ # substitutions, SNPs are interesting only in their presence. There can be
709
+ # more events from the same file present simultaneously and at multiple
710
+ # positions. They all are treated as separate variants and if any of them
711
+ # yields a haplotype present in all files, match is reported.
712
+ # Note that the original version of the code expected all alternate
713
+ # variants to be present on a single VCF line and was able to compare
714
+ # consecutive non-overlapping events as one sequence. However, because the
715
+ # the major producer of indel calls (Dindel) does report one variant per
716
+ # line, this idea was abandoned.
717
+
718
+ # Check if there are any interesting events.
719
+ my %has_indels;
720
+ my %events_per_file;
721
+ my $vcf = $$grp[0]{vcf};
722
+ for (my $igrp=0; $igrp<@$grp; $igrp++)
723
+ {
724
+ my $rec = $$grp[$igrp];
725
+ my $ifile = $$rec{vcf}{vcf_compare_ID};
726
+
727
+ my $ref_len = length($$rec{ref});
728
+ my @alts = split(/,/,$$rec{alt});
729
+ for my $alt (@alts)
730
+ {
731
+ if ( $alt eq '.' ) { next; }
732
+ if ( $alt=~/^</ ) { next; }
733
+ my $alt_len = length($alt);
734
+ push @{$events_per_file{$ifile}}, { pos=>$$rec{pos}, alt=>$alt, ref_len=>$ref_len };
735
+
736
+ # Do complex checking of event type only if it is still not certain if this is waste of time or not
737
+ if ( exists($has_indels{$ifile}) ) { next; }
738
+
739
+ if ( $ref_len!=$alt_len ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
740
+ elsif ( $ref_len>1 )
741
+ {
742
+ my ($type,$len,$ht) = $vcf->event_type($$rec{ref},$alt);
743
+ if ( $type eq 'o' ) { $has_indels{$ifile} = $$rec{vcf}{file}; }
744
+ }
745
+ }
746
+ }
747
+
748
+ # Return if there is nothing interesting
749
+ if ( scalar keys %has_indels < 2 ) { return; }
750
+
751
+ for my $ifile (keys %events_per_file)
752
+ {
753
+ if ( !exists($has_indels{$ifile}) ) { delete($events_per_file{$ifile}); }
754
+ }
755
+
756
+ # Cache the reference sequence chunk
757
+ my $ref_from = $$grp[0]{pos} - $$opts{win};
758
+ my $ref_to = $$grp[-1]{pos} + $$opts{win};
759
+ my $refseq = $fa_refseq->get_slice($$grp[0]{chr},$ref_from,$ref_to);
760
+
761
+ # For each file get all possible sequences
762
+ for my $events (values %events_per_file)
763
+ {
764
+ for my $variant (@$events)
765
+ {
766
+ my $pos = $$variant{pos};
767
+ my $len = $pos - $ref_from;
768
+ my $seq = $len>0 ? substr($refseq,0,$len) : '';
769
+ $seq .= $$variant{alt};
770
+
771
+ $pos += $$variant{ref_len};
772
+ if ( $pos<=$ref_to )
773
+ {
774
+ $seq .= substr($refseq,$pos-$ref_from);
775
+ }
776
+
777
+ $$variant{seq} = $seq;
778
+ $$variant{length} = length($seq);
779
+ }
780
+ }
781
+
782
+ # Now compare the variants: is there a sequence shared across all files?
783
+ my $match = 1;
784
+ my @keys = keys %events_per_file;
785
+ for (my $ikey=0; $ikey<@keys; $ikey++)
786
+ {
787
+ my $ivars = $events_per_file{$ikey};
788
+ for (my $jkey=0; $jkey<$ikey; $jkey++)
789
+ {
790
+ my $jvars = $events_per_file{$jkey};
791
+ my $found = 0;
792
+ for my $ivar (@$ivars)
793
+ {
794
+ for my $jvar (@$jvars)
795
+ {
796
+ if ( $$ivar{length} != $$jvar{length} ) { next; }
797
+ if ( $$ivar{seq} ne $$jvar{seq} ) { next; }
798
+ $found=1;
799
+ last;
800
+ }
801
+ }
802
+ if ( !$found ) { $match=0; last; }
803
+ }
804
+ if ( !$match ) { last; }
805
+ }
806
+
807
+ my $key = join(' ',sort(values %has_indels));
808
+ if ( $match )
809
+ {
810
+ $$opts{indels}{$key}{match}++;
811
+ }
812
+ else
813
+ {
814
+ $$opts{indels}{$key}{mismatch}++;
815
+ }
816
+ }
817
+
818
+ sub ref_alt_stats
819
+ {
820
+ my ($opts,$grp) = @_;
821
+
822
+ my $ref = $$grp[0]{ref};
823
+ my $alt = join(',',sort split(/,/,$$grp[0]{alt}));
824
+
825
+ my $alt_mismatch = 0;
826
+ for (my $i=1; $i<@$grp; $i++)
827
+ {
828
+ my $rec = $$grp[$i];
829
+
830
+ if ( $ref ne $$rec{ref} )
831
+ {
832
+ $$opts{ref_mismatch}++;
833
+ return;
834
+ }
835
+
836
+ my $tmp = join(',',sort split(/,/,$$rec{alt}));
837
+ if ( $alt ne $tmp )
838
+ {
839
+ $alt_mismatch = 1;
840
+ }
841
+ }
842
+ if ( $alt ne '.' )
843
+ {
844
+ if ( $alt_mismatch ) { $$opts{alt_mismatch}++; }
845
+ else { $$opts{alt_match}++; }
846
+ }
847
+ $$opts{ref_match}++;
848
+ }
849
+
850
+
851
+ sub snp_type
852
+ {
853
+ my ($als,$ref) = @_;
854
+
855
+ # Determine SNP type: hom(RR),het(RA),hom(AA) or het(AA)
856
+ if ( $$als[0] eq $$als[1] )
857
+ {
858
+ if ( $$als[0] eq $ref ) { return 'hom_RR_'; }
859
+ else { return 'hom_AA_'; }
860
+ }
861
+ else
862
+ {
863
+ if ( $$als[0] eq $ref or $$als[1] eq $ref ) { return 'het_RA_'; }
864
+ else { return 'het_AA_'; }
865
+ }
866
+ }
867
+
868
+ sub cmp_genotypes
869
+ {
870
+ my ($opts,$grp) = @_;
871
+ my $nrecs = @$grp;
872
+ my $hapls = $$opts{hapls};
873
+
874
+ # Break the VCF lines into hashes (required by parse_haplotype)
875
+ for my $grp_rec (@$grp)
876
+ {
877
+ $$grp_rec{rec} = $$grp_rec{vcf}->next_data_hash($$grp_rec{line});
878
+ if ( $$opts{ignore_indels} && exists($$grp_rec{rec}{INFO}{INDEL}) ) { return; }
879
+ if ( exists($$grp_rec{vcf}{_col_mapping}) )
880
+ {
881
+ my %new_cols;
882
+ while (my ($name_ori,$name_new) = each %{$$grp_rec{vcf}{_col_mapping}})
883
+ {
884
+ $new_cols{$name_new} = $$grp_rec{rec}{gtypes}{$name_ori};
885
+ delete($$grp_rec{rec}{gtypes}{$name_ori});
886
+ }
887
+ while (my ($name,$hash) = each %new_cols)
888
+ {
889
+ $$grp_rec{rec}{gtypes}{$name} = $hash;
890
+ }
891
+ }
892
+ }
893
+ if ( $$grp[0]{vcf}{vcf_compare_ID} != 0 ) { error("FIXME: different order than expected: $$grp[0]{vcf}{vcf_compare_ID}\n"); }
894
+ my $ref = $$grp[0]{rec}{REF};
895
+
896
+ my %gtype_matches = ();
897
+ my %gtype_mismatches = ();
898
+
899
+ my $min_dp;
900
+ my $ndp3 = 0;
901
+ for my $id (keys %{$$grp[0]{rec}{gtypes}})
902
+ {
903
+ my (@sorted_als1,$nploid,$type);
904
+
905
+ my ($als1,$seps1,$is_phased1,$is_empty1) = $$grp[0]{vcf}->parse_haplotype($$grp[0]{rec},$id);
906
+ if ( !$is_empty1 )
907
+ {
908
+ @sorted_als1 = sort @$als1;
909
+ $nploid = scalar @sorted_als1;
910
+ $type = snp_type($als1,$ref);
911
+ }
912
+
913
+ if ( exists($$opts{high_confidence_gls}) )
914
+ {
915
+ my @gls = split(/,/,$$grp[1]{rec}{gtypes}{$id}{GL});
916
+ if ( @gls!=3 or $gls[0] eq '.' ) { next; }
917
+ @gls = sort {$b<=>$a} @gls;
918
+ if ( abs($gls[0]-$gls[1])<$$opts{high_confidence_gls} ) { next; }
919
+ }
920
+
921
+ # There may be multiple files entering the comparison. Report match only if all are present and all match.
922
+ # Report mismatch if all are present and they do not match. Otherwise report lost/gained event.
923
+ my $phase_match = 1;
924
+ my $gtype_match = 1;
925
+ my $gtype_lost = 0;
926
+ my $gtype_gained = 0;
927
+ my $phase_lost = 0;
928
+ my $phase_gained = 0;
929
+ my $type2;
930
+ for (my $i=1; $i<$nrecs; $i++)
931
+ {
932
+ my ($als2,$seps2,$is_phased2,$is_empty2) = $$grp[$i]{vcf}->parse_haplotype($$grp[$i]{rec},$id);
933
+ if ( $is_empty1 )
934
+ {
935
+ $gtype_match = 0;
936
+ if ( !$is_empty2 )
937
+ {
938
+ $gtype_gained = 1;
939
+ $type = snp_type($als2,$ref);
940
+ }
941
+ if ( !$is_phased1 && $is_phased2 ) { $phase_gained = 1; }
942
+ last;
943
+ }
944
+ elsif ( $is_empty2 )
945
+ {
946
+ $gtype_match = 0;
947
+ $gtype_lost = 1;
948
+ last;
949
+ }
950
+ if ( $is_phased1 )
951
+ {
952
+ if ( !$is_phased2 )
953
+ {
954
+ $phase_lost = 1;
955
+ $phase_match = 0;
956
+ }
957
+ }
958
+ elsif ( $is_phased2 )
959
+ {
960
+ $phase_gained = 1;
961
+ $phase_match = 0;
962
+ }
963
+ else { $phase_match = 0; }
964
+
965
+ # Consider different number of alleles as mismatch (C vs C/C)
966
+ if ( scalar @$als1 != scalar @$als2 )
967
+ {
968
+ $gtype_match = 0;
969
+ if ( $$opts{debug} ) { $$opts{mismatch_types}{$type}{'Allele_Count'}++ }
970
+ last;
971
+ }
972
+
973
+ my @sorted_als2 = sort @$als2;
974
+ for (my $ial=0; $ial<$nploid; $ial++)
975
+ {
976
+ if ( $sorted_als1[$ial] ne $sorted_als2[$ial] )
977
+ {
978
+ $gtype_match = 0;
979
+ if ( $$opts{debug} )
980
+ {
981
+ my $type2 = snp_type($als2,$ref);
982
+ $$opts{mismatch_types}{$type}{$type2}++;
983
+ }
984
+ last;
985
+ }
986
+ }
987
+
988
+ if ( !$gtype_match )
989
+ {
990
+ if ( !defined $type2 && !$is_empty2 )
991
+ {
992
+ $type2 = snp_type($als2,$ref);
993
+ }
994
+ last;
995
+ }
996
+
997
+ # They match, check also if their phase agrees
998
+ if ( $phase_match && $is_phased1 && $is_phased2 )
999
+ {
1000
+ for (my $ial=0; $ial<$nploid; $ial++)
1001
+ {
1002
+ if ( $$als1[$ial] ne $$als2[$ial] ) { $phase_match=0; last; }
1003
+ }
1004
+ }
1005
+ }
1006
+ if ( $gtype_gained )
1007
+ {
1008
+ $$hapls{$id}{$type.'gtype_gained'}++;
1009
+ if ( $phase_gained ) { $$hapls{$id}{phased_gtype_gained}++ }
1010
+ next;
1011
+ }
1012
+ if ( $gtype_lost ) { $$hapls{$id}{$type.'gtype_lost'}++; next; }
1013
+
1014
+ if ( $phase_gained ) { $$hapls{$id}{phase_gained}++ }
1015
+ elsif ( $phase_lost ) { $$hapls{$id}{$type.'phase_lost'}++ }
1016
+
1017
+ my $dp = exists($$grp[1]{rec}{gtypes}{$id}{DP}) ? $$grp[1]{rec}{gtypes}{$id}{DP} : -1;
1018
+ if ( $gtype_match )
1019
+ {
1020
+ $$hapls{$id}{$type.'gtype_match'}++;
1021
+ if ( $phase_match ) { $$hapls{$id}{$type.'phase_match'}++ }
1022
+ $gtype_matches{$type}++;
1023
+ $$opts{counts_by_dp}{$dp}{$type.'-'.$type}++;
1024
+ }
1025
+ elsif ( defined $type )
1026
+ {
1027
+ $$hapls{$id}{$type.'gtype_mismatch'}++;
1028
+ $gtype_mismatches{$type}++;
1029
+ $$opts{counts_by_dp}{$dp}{$type.'-'.$type2}++;
1030
+ }
1031
+ }
1032
+ $$opts{hapls_ncmp}++;
1033
+
1034
+ # Store the number of matching types by AC
1035
+ my $af;
1036
+ if ( $$opts{INFO_AF1_af} && exists($$grp[1]{rec}{INFO}{AF1}) )
1037
+ {
1038
+ $af = sprintf "%.2f", $$grp[1]{rec}{INFO}{AF1};
1039
+ }
1040
+ elsif ( !$$opts{all_samples_af} )
1041
+ {
1042
+ my $ac = 0;
1043
+ my $an = 0;
1044
+ if ( exists($gtype_matches{hom_AA_}) )
1045
+ {
1046
+ $ac += 2*$gtype_matches{hom_AA_};
1047
+ $an += 2*$gtype_matches{hom_AA_};
1048
+ }
1049
+ if ( exists($gtype_mismatches{hom_AA_}) )
1050
+ {
1051
+ $ac += 2*$gtype_mismatches{hom_AA_};
1052
+ $an += 2*$gtype_mismatches{hom_AA_};
1053
+ }
1054
+ if ( exists($gtype_matches{het_RA_}) )
1055
+ {
1056
+ $ac += $gtype_matches{het_RA_};
1057
+ $an += 2*$gtype_matches{het_RA_};
1058
+ }
1059
+ if ( exists($gtype_mismatches{het_RA_}) )
1060
+ {
1061
+ $ac += $gtype_mismatches{het_RA_};
1062
+ $an += 2*$gtype_mismatches{het_RA_};
1063
+ }
1064
+ if ( exists($gtype_matches{hom_RR_}) ) { $an += 2*$gtype_matches{hom_RR_}; }
1065
+ if ( exists($gtype_mismatches{hom_RR_}) ) { $an += 2*$gtype_mismatches{hom_RR_}; }
1066
+ $af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
1067
+ }
1068
+ else
1069
+ {
1070
+ my ($an,$ac) = $$grp[0]{vcf}->calc_an_ac($$grp[0]{rec}{gtypes});
1071
+ $af = sprintf "%.2f", $an>0 ? $ac/$an : 0;
1072
+ }
1073
+
1074
+ for my $type (keys %gtype_matches)
1075
+ {
1076
+ $$opts{counts_by_af}{$af}{$type}{matches} += $gtype_matches{$type};
1077
+ $$opts{gtypes_cmp_total} += $gtype_matches{$type};
1078
+ }
1079
+ for my $type (keys %gtype_mismatches)
1080
+ {
1081
+ $$opts{counts_by_af}{$af}{$type}{mismatches} += $gtype_mismatches{$type};
1082
+ $$opts{gtypes_cmp_total} += $gtype_mismatches{$type};
1083
+ }
1084
+
1085
+ if ( $$opts{debug} )
1086
+ {
1087
+ my $match = '?';
1088
+ if ( scalar keys %gtype_mismatches ) { $match = '-' }
1089
+ elsif ( scalar keys %gtype_matches ) { $match = '+' }
1090
+ my $hom_rr_mm = $gtype_mismatches{hom_RR_} ? $gtype_mismatches{hom_RR_} : 0;
1091
+ my $het_ra_mm = $gtype_mismatches{het_RA_} ? $gtype_mismatches{het_RA_} : 0;
1092
+ my $hom_aa_mm = $gtype_mismatches{hom_AA_} ? $gtype_mismatches{hom_AA_} : 0;
1093
+ my $hom_rr_m = $gtype_matches{hom_RR_} ? $gtype_matches{hom_RR_} : 0;
1094
+ my $het_ra_m = $gtype_matches{het_RA_} ? $gtype_matches{het_RA_} : 0;
1095
+ my $hom_aa_m = $gtype_matches{hom_AA_} ? $gtype_matches{hom_AA_} : 0;
1096
+ my $hom_rr_c = sprintf "%.2f", ($hom_rr_mm or $hom_rr_m) ? $hom_rr_m*1./($hom_rr_m + $hom_rr_mm) : -1;
1097
+ my $het_ra_c = sprintf "%.2f", ($het_ra_mm or $het_ra_m) ? $het_ra_m*1./($het_ra_m + $het_ra_mm) : -1;
1098
+ my $hom_aa_c = sprintf "%.2f", ($hom_aa_mm or $hom_aa_m) ? $hom_aa_m*1./($hom_aa_m + $hom_aa_mm) : -1;
1099
+ print "SD\t$$grp[0]{rec}{CHROM}\t$$grp[0]{rec}{POS}\t$match\t$hom_rr_mm\t$het_ra_mm\t$hom_aa_mm\t$hom_rr_c\t$het_ra_c\t$hom_aa_c\n";
1100
+ }
1101
+ }
1102
+
1103
+
1104
+ sub read_next_group
1105
+ {
1106
+ my ($vcfs,$win) = @_;
1107
+
1108
+ my @grp;
1109
+ my $prev_vcf;
1110
+ my $start;
1111
+
1112
+ while (1)
1113
+ {
1114
+ my $min_vcf = get_min_position($vcfs);
1115
+ if ( !$min_vcf ) { last; }
1116
+ if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; }
1117
+ $prev_vcf = $$min_vcf{buf}[0];
1118
+
1119
+ if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} )
1120
+ {
1121
+ my $rec = shift(@{$$min_vcf{buf}});
1122
+ push @grp,$rec;
1123
+
1124
+ $start = $$rec{pos};
1125
+ next;
1126
+ }
1127
+ }
1128
+ return \@grp;
1129
+ }
1130
+
1131
+ sub get_min_position
1132
+ {
1133
+ my ($vcfs) = @_;
1134
+
1135
+ my ($min_pos,$min_vcf);
1136
+ for my $vcf (@$vcfs)
1137
+ {
1138
+ # Check if there is a line in the buffer, if not, read. If still empty, the file reached eof
1139
+ if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($vcf); }
1140
+ if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; }
1141
+
1142
+ my $line = $$vcf{buf}[0];
1143
+
1144
+ # Designate this position as the minimum of all the files if:
1145
+ # .. is this the first file?
1146
+ if ( !$min_pos )
1147
+ {
1148
+ $min_pos = $$line{pos};
1149
+ $min_vcf = $vcf;
1150
+ next;
1151
+ }
1152
+
1153
+ # .. has this file lower position?
1154
+ if ( $min_pos>$$line{pos} )
1155
+ {
1156
+ $min_pos = $$line{pos};
1157
+ $min_vcf = $vcf;
1158
+ next;
1159
+ }
1160
+ }
1161
+ return $min_vcf;
1162
+ }
1163
+
1164
+ sub read_line
1165
+ {
1166
+ my ($vcf) = @_;
1167
+
1168
+ if ( $$vcf{eof} ) { return; }
1169
+
1170
+ my $line = $vcf->next_line();
1171
+ if ( !$line )
1172
+ {
1173
+ $$vcf{eof} = 1;
1174
+ return;
1175
+ }
1176
+
1177
+ $$vcf{nread}++;
1178
+
1179
+ if ( !($line=~/^(\S+)\t(\S+)\t\S+\t(\S+)\t(\S+)/) ) { error("Could not parse the line: [$line]\n"); }
1180
+ my $chr = $1;
1181
+ my $pos = $2;
1182
+ my $ref = uc($3);
1183
+ my $alt = uc($4);
1184
+ if ( $$vcf{buf} && @{$$vcf{buf}} )
1185
+ {
1186
+ my $prev = $$vcf{buf}[-1];
1187
+ if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); }
1188
+ }
1189
+
1190
+ push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf };
1191
+ return;
1192
+ }
1193
+