ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,577 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # Author: petr.danecek@sanger
4
+ #
5
+
6
+ use strict;
7
+ use warnings;
8
+ use Carp;
9
+ use Vcf;
10
+
11
# Entry point: parse the command line into an options hash and hand it
# straight to the merge routine, which prints the merged VCF to STDOUT.
merge_vcf_files(parse_params());

exit;
15
+
16
+ #--------------------------------
17
+
18
# Abort the program.
#   - Called with arguments: the pieces are concatenated and reported via
#     croak(), i.e. from the caller's perspective.
#   - Called without arguments: the usage text is printed via die().
# Never returns.
sub error
{
    my @pieces = @_;

    croak join('', @pieces) if scalar @pieces;

    my @usage = (
        "About: Merge the bgzipped and tabix indexed VCF files. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n",
        "Usage: merge-vcf [OPTIONS] file1.vcf file2.vcf.gz ... > out.vcf\n",
        "Options:\n",
        " -c, --chromosomes <list|file> Same as -r, left for backward compatibility. Please do not use as it will be dropped in the future.\n",
        " -d, --remove-duplicates If there should be two consecutive rows with the same chr:pos, print only the first one.\n",
        " -H, --vcf-header <file> Use the VCF header\n",
        " -h, -?, --help This help message.\n",
        " -r, --regions <list|file> Do only the given regions (comma-separated list or one region per line in a file).\n",
        " -s, --silent Try to be a bit more silent, no warnings about duplicate lines.\n",
        "\n",
    );
    die @usage;
}
34
+
35
+
36
# Parse @ARGV (which is consumed) into an options hash reference.
#
# Keys set in the returned hash:
#   args         - [$0, original @ARGV], later recorded in the output header
#   rm_dups      - set by -d / --remove-duplicates
#   silent_dups  - set by -s / --silent
#   vcf_header   - value of -H / --vcf-header
#   regions_list - value of -r / --regions (or the legacy -c / --chromosomes)
#   files        - array of existing files given on the command line
#
# Calls error() (which does not return) for -h/-?/--help, for an unknown
# parameter or non-existent file, for a value-taking option with no value,
# and when no input file was given.
sub parse_params
{
    my $opts = { args=>[$0, @ARGV] };

    # Options that consume the following argument, mapped to their key in $opts.
    my %takes_value = (
        '-H' => 'vcf_header',   '--vcf-header'  => 'vcf_header',
        '-c' => 'regions_list', '--chromosomes' => 'regions_list',
        '-r' => 'regions_list', '--regions'     => 'regions_list',
    );

    # Test definedness, not truth: an argument such as a file named "0" is
    # false in Perl and would otherwise end the loop and drop what follows.
    while ( defined(my $arg = shift(@ARGV)) )
    {
        if ( $arg eq '-d' || $arg eq '--remove-duplicates' ) { $$opts{rm_dups}=1; next; }
        if ( $arg eq '-s' || $arg eq '--silent' ) { $$opts{silent_dups}=1; next; }
        if ( exists($takes_value{$arg}) )
        {
            # Fail loudly instead of silently storing undef for the option.
            if ( !@ARGV ) { error("Missing value for the option \"$arg\". Run -? for help.\n"); }
            $$opts{$takes_value{$arg}} = shift(@ARGV);
            next;
        }
        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
        if ( -e $arg ) { push @{$$opts{files}},$arg; next; }
        error("Unknown parameter or non-existent file \"$arg\". Run -? for help.\n");
    }
    if ( !exists($$opts{files}) ) { error() }
    return $opts;
}
53
+
54
+
55
# Returns the common leading path components of the files.
#
# Argument: reference to an array of file names.
# Returns:  the shared leading components joined with '/+', which is a regular
#           expression fragment: the caller interpolates it into a substitution
#           to strip the prefix from each file name. Each component is therefore
#           escaped with quotemeta() so that directory names containing regex
#           metacharacters (e.g. "c++", "a(b") match literally. An empty string
#           is returned when nothing is shared or the list is empty.
sub common_prefix
{
    my ($files) = @_;

    # Split every name into its path components and find the shortest path.
    my @paths;
    my $len = -1;
    for my $file (@$files)
    {
        my @path = split(m{/+},$file);
        if ( $len<0 || $len>scalar @path ) { $len=scalar @path; }
        push @paths, \@path;
    }

    # Walk the components left to right until the first one that differs.
    my @common;
    COMPONENT:
    for (my $i=0; $i<$len; $i++)
    {
        for (my $ifile=1; $ifile<scalar @paths; $ifile++)
        {
            if ( $paths[$ifile]->[$i] ne $paths[0]->[$i] ) { last COMPONENT; }
        }
        push @common, quotemeta($paths[0]->[$i]);
    }
    return join('/+',@common);
}
80
+
81
+
82
# Build the list of regions requested by the user.
#
# Argument: the options hash; only $$opts{regions_list} is read.
# Returns:  a list of region strings, empty when no regions were requested.
#
# When the value names an existing file, one region per line is read from it;
# otherwise the value is treated as a comma-separated list. Surrounding
# whitespace (including a CR from DOS line endings) is stripped and empty
# entries are skipped, so a trailing blank line or a doubled comma does not
# produce an empty region.
sub read_region_list
{
    my ($opts) = @_;

    my @regions = ();
    if ( exists($$opts{regions_list}) && defined($$opts{regions_list}) )
    {
        my $list = $$opts{regions_list};
        if ( -e $list )
        {
            open(my $rgs,'<',$list) or error("$list: $!");
            while (my $line=<$rgs>)
            {
                $line =~ s/^\s+|\s+$//g;
                if ( $line eq '' ) { next; }
                push @regions, $line;
            }
            close($rgs);
        }
        else
        {
            @regions = grep { $_ ne '' } split(/,/,$list);
        }
    }
    return (@regions);
}
106
+
107
# Warn (once) when a pre-4.1 VCF declares PL, GL, AC or AF with a Number other
# than '.'.
#
# Argument: a parsed VCF object; its {version} and {header}{INFO} are read.
# Side effects: on the first warning DONT_FIX_VCF40_AG_TAGS is set in %ENV,
#               which makes every later call return early, and a message is
#               printed to STDERR.
# NOTE(review): PL and GL are looked up under the INFO section of the header,
# exactly like AC and AF - confirm that this is the intended section.
sub check_AGtags_definition
{
    my ($vcf) = @_;

    return if $vcf->{version} >= 4.1;

    # Whatever the variable is set to, its mere presence means the user takes
    # responsibility for the merging strategy used.
    return if exists $ENV{DONT_FIX_VCF40_AG_TAGS};

    my @offending = grep {
        exists($vcf->{header}{INFO}{$_}) && $vcf->{header}{INFO}{$_}{Number} ne '.'
    } qw(PL GL AC AF);

    return unless @offending;

    $ENV{DONT_FIX_VCF40_AG_TAGS} = 1;
    my $tag_list = join(',', @offending);
    print STDERR
        "Warning: The $tag_list tag(s) will not be merged correctly for multiallelic sites.\n",
        " To be handled correctly, please redefine with Number=. or set the environment\n",
        " variable DONT_FIX_VCF40_AG_TAGS=0.\n";
}
130
+
131
# Read the headers of all input files and prepare everything the merge needs.
#
# Arguments:
#   $opts    - the options hash; reads {files}, {has_col_names}, {regions_list}
#   $vcf_out - the output VCF object; its {columns} supply the sample column
#              names when $$opts{has_col_names} is set
#
# Results are stored back into $opts:
#   {vcfs}    - the input VCF objects, each annotated with {qual_weight}
#               (its share of all sample columns) and {__col_names}
#               (the output column names of its samples)
#   {cols}    - all output sample column names, in input order
#   {regions} - the regions to process: the user supplied ones or, when none
#               were given, every chromosome reported by the input files
#
# Calls error() when a file has no header columns, or when the number of
# columns in the supplied header does not match the input files.
sub init_cols
{
    my ($opts,$vcf_out) = @_;

    my $prefix;
    my @regions = read_region_list($opts);
    my @vcfs;
    my @cols;
    my %has_chrom;
    my %col_names;
    my $icol = 9;
    my $ncols_total = 0;

    if ( !$$opts{has_col_names} ) { $prefix = common_prefix($$opts{files}); }

    # Go through all files and read header, obtain list of chromosomes. The file names will be used for columns, unless
    # they were read from the header.
    for my $file (@{$$opts{files}})
    {
        my $vcf = Vcf->new(file=>$file);
        $vcf->parse_header();

        # This check must come before the first dereference of {columns} below,
        # otherwise a missing header dies with an obscure "undefined value as an
        # ARRAY reference" instead of this message.
        if ( !exists($$vcf{columns}) ) { error("No header present? $file\n"); }

        check_AGtags_definition($vcf);
        $vcf->close();
        push @vcfs, $vcf;

        # Precompute the weighting factor for the QUAL column: the number of
        # sample columns (at least 1), normalised by the total further below.
        my $ncols = scalar @{$$vcf{columns}} - 9;
        if ( $ncols<=0 ) { $ncols = 1; }
        $$vcf{qual_weight} = 1.0*$ncols;
        $ncols_total += $ncols;

        # Update the list of known chromosomes
        if ( !exists($$opts{regions_list}) )
        {
            my $chrms = $vcf->get_chromosomes();
            for my $chr (@$chrms)
            {
                if ( exists($has_chrom{$chr}) ) { next; }
                $has_chrom{$chr} = 1;
                push @regions, $chr;
            }
        }

        my $col_prefix = '';
        if ( !$$opts{has_col_names} )
        {
            # Make the column names nice - strip common prefix and the suffix .vcf.gz
            $col_prefix = $file;
            $col_prefix =~ s{^/*$prefix/*}{};
            $col_prefix =~ s/\.gz$//i;
            $col_prefix =~ s/\.vcf$//i;
            $col_prefix .= '_';
        }

        # Create good names for the columns in the merged vcf file
        my @vcf_cols = @{$$vcf{columns}};
        $$vcf{__col_names} = [];
        for my $col (@vcf_cols[9..$#vcf_cols])
        {
            my $col_name = $col;
            if ( $$opts{has_col_names} )
            {
                # Names come from the user supplied header, consumed in order.
                if ( $icol >= @{$$vcf_out{columns}} ) { error("Fewer columns in the header than in the VCF files total.\n"); }
                $col_name = $$vcf_out{columns}[$icol];
                $icol++;

                if ( exists($col_names{$col_name}) ) { error("The column names not unique in the header: $col_name\n"); }
            }
            else
            {
                # Only a clashing sample name gets the file-derived prefix.
                if ( exists($col_names{$col_name}) ) { $col_name = $col_prefix.$col; }
                if ( exists($col_names{$col_name}) ) { warn("FIXME: the column name [$col_name] not unique.\n"); }
            }
            warn("Using column name '$col_name' for $file:$col\n");
            $col_names{$col_name} = 1;

            push @cols, $col_name;
            push @{$$vcf{__col_names}}, $col_name;
        }
    }

    if ( $$opts{has_col_names} && $icol!=@{$$vcf_out{columns}} ) { error("More columns in the header than in the VCF files total.\n"); }

    # QUAL weighting
    for my $vcf (@vcfs)
    {
        $$vcf{qual_weight} /= $ncols_total;
    }

    $$opts{vcfs} = \@vcfs;
    $$opts{cols} = \@cols;
    $$opts{regions} = \@regions;
}
226
+
227
+
228
# Merge all input VCF files and print the result to STDOUT.
#
# Argument: the options hash produced by parse_params(); init_cols() adds
#           {vcfs}, {cols} and {regions} to it.
#
# The output header is printed first. Then, for every region, all files are
# opened on that region and walked in parallel: at each step the smallest
# pending position is taken, the records of all files sitting at that
# position are combined into one output line, and those files are advanced.
#
# ALT (when there are no samples), FILTER and FORMAT are emitted in sorted
# order so that the output is the same from run to run instead of following
# Perl's unordered hash iteration.
sub merge_vcf_files
{
    my ($opts) = @_;

    # Create output VCF
    my $vcf_out;
    if ( $$opts{vcf_header} )
    {
        $vcf_out = Vcf->new(file=>$$opts{vcf_header});
        $vcf_out->parse_header();
        if ( $$vcf_out{columns} && @{$$vcf_out{columns}} ) { $$opts{has_col_names}=1; }
    }
    else
    {
        $vcf_out = Vcf->new();
    }

    init_cols($opts,$vcf_out);
    my @regions = @{$$opts{regions}};
    my @cols = @{$$opts{cols}};
    my @vcfs = @{$$opts{vcfs}};

    # Get the header of the output VCF ready
    $vcf_out->add_columns(@cols);
    if ( !$$vcf_out{has_header} )
    {
        for my $vcf (@vcfs)
        {
            # To get the missing fields filled by the default values
            for my $hline (@{$$vcf{header_lines}})
            {
                if ( $$hline{key} eq 'fileformat' ) { next; }
                $vcf_out->add_header_line($hline,silent=>1);
            }
        }
    }

    # List source files as "index:file" pairs; the SF INFO tag refers to these indexes.
    my $source;
    for (my $i=0; $i<@vcfs; $i++)
    {
        if ( $i ) { $source .= ','; }
        $source .= "$i:$vcfs[$i]{file}";
    }
    $vcf_out->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp');
    $vcf_out->add_header_line({key=>'sourceFiles',value=>$source},append=>'timestamp');
    $vcf_out->add_header_line({key=>'INFO',ID=>'SF',Number=>-1,Type=>'String',Description=>'Source File (index to sourceFiles, f when filtered)'});

    my $have_samples = @{$$vcf_out{columns}}>9 ? 1 : 0;

    $vcf_out->recalc_ac_an($have_samples ? 2 : 0);
    $vcf_out->add_header_line({key=>'INFO',ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'});
    $vcf_out->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'});
    print $vcf_out->format_header();

    # Go through all VCF files simultaneously and output each line, one region at a time.
    for my $region (@regions)
    {
        # Open files; last_line is cleared first so that each file starts the region afresh.
        for my $vcf (@vcfs)
        {
            delete($$vcf{last_line});
            $vcf->open(region=>$region,parse_header=>1);
            advance_position($vcf);
        }

        while (1)
        {
            my $pos = get_min_position(\@vcfs);
            if ( !defined $pos ) { last; }

            my %out;
            $out{POS} = $pos;
            $out{ID} = '.';
            $out{ALT} = [];
            $out{FORMAT} = [];
            my %format;
            my %info;
            my @src_files;
            my %filters;
            my (@quals,@qual_weights,$qual_weights_sum,%ac,$an);

            my %ref_alt_map = ();
            # Find out the REFs and ALTs: in VCFv4.0, the REFs can differ and ALTs must be converted
            for my $vcf (@vcfs)
            {
                my $line = $$vcf{last_line};
                if ( !$line or $pos ne $$line{POS} ) { next; }
                if ( !exists($out{CHROM}) ) { $out{CHROM} = $$line{CHROM}; }
                my $ref = $$line{REF};
                for my $alt (@{$$line{ALT}}) { $ref_alt_map{$ref}{$alt}=$alt; }
            }
            # Do the REF,ALT conversion only when necessary, i.e. when the files disagree on REF
            my $new_ref;
            if ( scalar keys %ref_alt_map > 1 )
            {
                $new_ref = $vcf_out->fill_ref_alt_mapping(\%ref_alt_map);
            }
            if ( !$have_samples )
            {
                # Do not lose information from the ALT column when samples are not present
                my %alts;
                for my $vcf (@vcfs)
                {
                    my $line = $$vcf{last_line};
                    if ( !$line or $pos ne $$line{POS} ) { next; }
                    my $ref = $$line{REF};
                    for my $alt (@{$$line{ALT}}) { $alts{$ref_alt_map{$ref}{$alt}}=1; }
                }
                # Built once after all files were seen; sorted for reproducible output.
                $out{ALT} = [ sort keys %alts ];
            }
            for (my $ivcf=0; $ivcf<@vcfs; $ivcf++)
            {
                my $vcf = $vcfs[$ivcf];
                my $line = $$vcf{last_line};

                # If this file does not have a record for this position, then for all its columns output undef gtype
                if ( !$line or $pos ne $$line{POS} )
                {
                    for (my $i=0; $i<@{$$vcf{__col_names}}; $i++)
                    {
                        my $name = $$vcf{__col_names}->[$i];
                        $out{gtypes}{$name}{GT} = $$vcf_out{defaults}{GT};
                    }
                    next;
                }

                # Check if the site has been filtered; filtered sources get an 'f' suffix in SF
                if ( scalar @{$$line{FILTER}}>1 or ($$line{FILTER}[0] ne $$vcf{filter_passed} && $$line{FILTER}[0] ne $$vcf{defaults}{default}) )
                {
                    push @src_files,$ivcf.'f';
                }
                else
                {
                    push @src_files,$ivcf;
                }

                # Collect information for the FILTER field
                for my $flt (@{$$line{FILTER}})
                {
                    if ( $flt eq $$vcf{filter_passed} )
                    {
                        $filters{$$vcf_out{filter_passed}} = 1;
                    }
                    elsif ( $flt ne $$vcf{defaults}{default} )
                    {
                        $filters{$flt} = 1;
                    }
                }

                # Collect information for the QUAL field
                if ( $$line{QUAL} ne $$vcf{defaults}{QUAL} && $$line{QUAL} ne $$vcf{defaults}{default} && $$line{QUAL}>0 )
                {
                    push @quals,$$line{QUAL};
                    push @qual_weights,$$vcf{qual_weight};
                    $qual_weights_sum += $$vcf{qual_weight};
                }

                # The first non-missing ID wins
                if ( $$line{ID} ne '.' && $out{ID} eq '.' ) { $out{ID}=$$line{ID}; }

                # Remember the FORMAT fields
                for my $field (@{$$line{FORMAT}}) { $format{$field} = 1; }

                # VCF without genotypes: calculate AC,AN if present
                if ( !$have_samples )
                {
                    if ( exists($$line{INFO}{AN}) ) { $an += $$line{INFO}{AN}; }
                    if ( exists($$line{INFO}{AC}) )
                    {
                        my (@acs) = split(/,/,$$line{INFO}{AC});
                        for (my $i=0; $i<@acs; $i++)
                        {
                            my $alt = $ref_alt_map{$$line{REF}}{$$line{ALT}[$i]};
                            $ac{$alt} += $acs[$i];
                        }
                    }
                }

                # Join the INFO field
                for my $inf (keys %{$$line{INFO}})
                {
                    # When conflicting INFO fields are present, use the first one
                    if ( exists($info{$inf}) ) { next; }
                    $info{$inf} = $$line{INFO}{$inf};
                }

                my $ref = $$line{REF};

                # The ALT column may change after the merge, take care of ALT dependent tags such as GL.
                if ( $have_samples )
                {
                    if ( defined $new_ref )
                    {
                        $vcf->parse_AGtags($line,\%ref_alt_map,$$line{REF});
                    }
                    else
                    {
                        $vcf->parse_AGtags($line);
                    }
                }

                # Now fill in the genotype information for each column
                for (my $i=0; $i<@{$$vcf{__col_names}}; $i++)
                {
                    my $ori_name = $$vcf{columns}->[$i+9];
                    my $out_name = $$vcf{__col_names}->[$i];

                    $out{gtypes}{$out_name} = $$line{gtypes}{$ori_name};

                    # This is to convert 0/1 to G/C
                    my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($line,$ori_name);
                    if ( defined $new_ref )
                    {
                        # Translate each allele to its representation relative to the new REF
                        my @als;
                        for my $al (@$alleles)
                        {
                            push @als, exists($ref_alt_map{$ref}{$al}) ? $ref_alt_map{$ref}{$al} : '.';
                        }
                        $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype(\@als,$seps);
                    }
                    else
                    {
                        $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype($alleles,$seps);
                    }
                }
                $out{REF} = defined $new_ref ? $new_ref : $ref;
                advance_position($vcf,$opts);
            }

            $out{INFO} = { %info };
            $out{INFO}{SF} = join(',',@src_files);

            # Output the QUAL information: average weighted by the number of sample columns
            my $qual;
            for (my $i=0; $i<@quals; $i++)
            {
                $qual += $quals[$i] * $qual_weights[$i] * (1.0 / $qual_weights_sum);
            }
            $out{QUAL} = defined $qual ? sprintf("%.2f",$qual) : $$vcf_out{defaults}{QUAL};

            # Output the FILTER information: remove PASS or missing value if some other information
            # is present.
            delete($filters{$$vcf_out{defaults}{default}});
            if ( exists($filters{$$vcf_out{filter_passed}}) && scalar keys %filters > 1 )
            {
                delete($filters{$$vcf_out{filter_passed}});
            }
            $out{FILTER} = [ sort keys %filters ];
            if ( !@{$out{FILTER}} ) { push @{$out{FILTER}},$$vcf_out{defaults}{default}; }

            # The GT field must come as first
            delete($format{GT});
            $out{FORMAT} = ['GT'];
            for my $key (sort keys %format) { push @{$out{FORMAT}},$key; }

            if ( $have_samples )
            {
                $vcf_out->format_genotype_strings(\%out);
            }
            else
            {
                if ( defined $an ) { $out{INFO}{AN}=$an; }
                if ( scalar keys %ac )
                {
                    my @acs;
                    for my $alt (@{$out{ALT}})
                    {
                        # Some of the files may not have AC, the AC count can be undefined in such a case.
                        push @acs, exists($ac{$alt}) ? $ac{$alt} : 0;
                    }
                    $out{INFO}{AC} = join(',',@acs);
                }
            }
            print $vcf_out->format_line(\%out);
        }
    }
}
505
+
506
+
507
# Move one input file forward to its next record and remember that record in
# $vcf->{last_line}.
#
# Arguments:
#   $vcf  - input VCF object; next_data_hash() is called on it
#   $opts - options hash; {silent_dups} and {rm_dups} are consulted
#
# Once the reader has returned a false value, that value is stored in
# {last_line} and every later call is a no-op. A record repeating the position
# of the previous one is reported on STDERR (unless silenced) and, with
# {rm_dups}, skipped. A record with a smaller position than the previous one is
# a fatal error.
sub advance_position
{
    my ($vcf,$opts) = @_;

    # The file was exhausted by an earlier call.
    return if exists($vcf->{last_line}) && !$vcf->{last_line};

    my $record;
    until ($record)
    {
        $record = $vcf->next_data_hash();
        unless ($record)
        {
            $vcf->{last_line} = $record;
            return;
        }

        # First record of the file or region: nothing to compare against.
        last unless $vcf->{last_line};

        if ( $vcf->{last_line}{POS} eq $record->{POS} )
        {
            if ( !$opts->{silent_dups} )
            {
                print STDERR "The position appeared twice: $vcf->{file} .. $record->{CHROM}:$record->{POS}\n";
            }

            # The only reason for looping: when duplicates are dropped, forget
            # this record and read the following one.
            undef($record) if $opts->{rm_dups};
        }
        elsif ( $vcf->{last_line}{POS} > $record->{POS} )
        {
            error("Wrong order: $vcf->{file} .. $record->{CHROM}:$record->{POS} comes after $vcf->{last_line}{CHROM}:$vcf->{last_line}{POS}\n");
        }
    }

    $vcf->{last_line} = $record;

    return;
}
544
+
545
+
546
# Find the smallest position among the records currently pending in the files.
#
# Argument: reference to an array of VCF objects; only {last_line}{POS} of
#           each is read. Files without a pending record are ignored.
# Returns:  the lowest POS (compared numerically), or undef when no file has
#           a pending record.
sub get_min_position
{
    my ($vcfs) = @_;

    my $lowest;
    for my $vcf (@$vcfs)
    {
        my $pending = $vcf->{last_line};
        next unless $pending;

        # Take the first candidate unconditionally, later ones only if lower.
        if ( !defined($lowest) || $lowest > $pending->{POS} )
        {
            $lowest = $pending->{POS};
        }
    }
    return $lowest;
}
576
+
577
+