ngs_server 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,2853 @@
1
+ package Vcf;
2
+
3
+ # http://vcftools.sourceforge.net/specs.html
4
+ # http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
5
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:variant_call_format
6
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf4.0
7
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf_4.0_sv
8
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
9
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2
10
+ #
11
+ # Authors: petr.danecek@sanger
12
+ # for VCF v3.2, v3.3, v4.0, v4.1
13
+ #
14
+
15
+ =head1 NAME
16
+
17
+ Vcf.pm. Module for validation, parsing and creating VCF files.
18
+ Supported versions: 3.2, 3.3, 4.0, 4.1
19
+
20
+ =head1 SYNOPSIS
21
+
22
+ From the command line:
23
+ perl -MVcf -e validate example.vcf
24
+ perl -I/path/to/the/module/ -MVcf -e validate_v32 example.vcf
25
+
26
+ From a script:
27
+ use Vcf;
28
+
29
+ my $vcf = Vcf->new(file=>'example.vcf.gz',region=>'1:1000-2000');
30
+ $vcf->parse_header();
31
+
32
+ # Do some simple parsing. Most thorough but slowest way how to get the data.
33
+ while (my $x=$vcf->next_data_hash())
34
+ {
35
+ for my $gt (keys %{$$x{gtypes}})
36
+ {
37
+ my ($al1,$sep,$al2) = $vcf->parse_alleles($x,$gt);
38
+ print "\t$gt: $al1$sep$al2\n";
39
+ }
40
+ print "\n";
41
+ }
42
+
43
+ # This will split the fields and print a list of CHR:POS
44
+ while (my $x=$vcf->next_data_array())
45
+ {
46
+ print "$$x[0]:$$x[1]\n";
47
+ }
48
+
49
+ # This will return the lines as they were read, including the newline at the end
50
+ while (my $x=$vcf->next_line())
51
+ {
52
+ print $x;
53
+ }
54
+
55
+ # Only the columns NA00001, NA00002 and NA00003 will be printed.
56
+ my @columns = qw(NA00001 NA00002 NA00003);
57
+ print $vcf->format_header(\@columns);
58
+ while (my $x=$vcf->next_data_array())
59
+ {
60
+ # this will recalculate AC and AN counts, unless $vcf->recalc_ac_an was set to 0
61
+ print $vcf->format_line($x,\@columns);
62
+ }
63
+
64
+ $vcf->close();
65
+
66
+ =cut
67
+
68
+
69
+ use strict;
70
+ use warnings;
71
+ use Carp;
72
+ use Exporter;
73
+ use Data::Dumper;
74
+ use POSIX ":sys_wait_h";
75
+
76
+ use vars qw/@ISA @EXPORT/;
77
+ @ISA = qw/Exporter/;
78
+ @EXPORT = qw/validate validate_v32/;
79
+
80
+ =head2 validate
81
+
82
+ About : Validates the VCF file.
83
+ Usage : perl -MVcf -e validate example.vcf.gz # (from the command line)
84
+ validate('example.vcf.gz'); # (from a script)
85
+ validate(\*STDIN);
86
+ Args : File name or file handle. When no argument given, the first command line
87
+ argument is interpreted as the file name.
88
+
89
+ =cut
90
+
91
+ sub validate
92
+ {
93
+ my ($fh) = @_;
94
+
95
+ if ( !$fh && @ARGV ) { $fh = $ARGV[0]; }
96
+
97
+ my $vcf;
98
+ if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(fh=>$fh) : Vcf->new(file=>$fh); }
99
+ else { $vcf = Vcf->new(fh=>\*STDIN); }
100
+
101
+ $vcf->run_validation();
102
+ }
103
+
104
+
105
+ =head2 validate_v32
106
+
107
+ About : Same as validate, but assumes v3.2 VCF version.
108
+ Usage : perl -MVcf -e validate_v32 example.vcf.gz # (from the command line)
109
+ Args : File name or file handle. When no argument given, the first command line
110
+ argument is interpreted as the file name.
111
+
112
+ =cut
113
+
114
+ sub validate_v32
115
+ {
116
+ my ($fh) = @_;
117
+
118
+ if ( !$fh && @ARGV && -e $ARGV[0] ) { $fh = $ARGV[0]; }
119
+
120
+ my %params = ( version=>'3.2' );
121
+
122
+ my $vcf;
123
+ if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(%params, fh=>$fh) : Vcf->new(%params, file=>$fh); }
124
+ else { $vcf = Vcf->new(%params, fh=>\*STDIN); }
125
+
126
+ $vcf->run_validation();
127
+ }
128
+
129
+
130
+ =head2 new
131
+
132
+ About : Creates new VCF reader/writer.
133
+ Usage : my $vcf = Vcf->new(file=>'my.vcf', version=>'3.2');
134
+ Args :
135
+ fh .. Open file handle. If neither file nor fh is given, open in write mode.
136
+ file .. The file name. If neither file nor fh is given, open in write mode.
137
+ region .. Optional region to parse (requires tabix indexed VCF file)
138
+ silent .. Unless set to 0, warning messages may be printed.
139
+ strict .. Unless set to 0, the reader will die when the file violates the specification.
140
+ version .. If not given, '4.0' is assumed. The header information overrides this setting.
141
+
142
+ =cut
143
+
144
+ sub new
145
+ {
146
+ my ($class,@args) = @_;
147
+ my $self = {@args};
148
+ bless $self, ref($class) || $class;
149
+
150
+ $$self{silent} = 0 unless exists($$self{silent});
151
+ $$self{strict} = 0 unless exists($$self{strict});
152
+ $$self{buffer} = []; # buffer stores the lines in the reverse order
153
+ $$self{columns} = undef; # column names
154
+ $$self{mandatory} = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] unless exists($$self{mandatory});
155
+ $$self{reserved}{cols} = {CHROM=>1,POS=>1,ID=>1,REF=>1,ALT=>1,QUAL=>1,FILTER=>1,INFO=>1,FORMAT=>1} unless exists($$self{reserved_cols});
156
+ $$self{recalc_ac_an} = 1;
157
+ $$self{has_header} = 0;
158
+ $$self{default_version} = '4.1';
159
+ $$self{versions} = [ qw(Vcf3_2 Vcf3_3 Vcf4_0 Vcf4_1) ];
160
+ if ( !exists($$self{max_line_len}) && exists($ENV{MAX_VCF_LINE_LEN}) ) { $$self{max_line_len} = $ENV{MAX_VCF_LINE_LEN} }
161
+ $$self{fix_v40_AGtags} = $ENV{DONT_FIX_VCF40_AG_TAGS} ? 0 : 1;
162
+ my %open_args = ();
163
+ if ( exists($$self{region}) ) { $open_args{region}=$$self{region}; }
164
+ if ( exists($$self{print_header}) ) { $open_args{print_header}=$$self{print_header}; }
165
+ return $self->_open(%open_args);
166
+ }
167
+
168
+ sub throw
169
+ {
170
+ my ($self,@msg) = @_;
171
+ confess @msg,"\n";
172
+ }
173
+
174
+ sub warn
175
+ {
176
+ my ($self,@msg) = @_;
177
+ if ( $$self{silent} ) { return; }
178
+ if ( $$self{strict} ) { $self->throw(@msg); }
179
+ warn @msg;
180
+ }
181
+
182
+ sub _open
183
+ {
184
+ my ($self,%args) = @_;
185
+
186
+ if ( !exists($$self{fh}) && !exists($$self{file}) )
187
+ {
188
+ # Write mode, the version must be supplied by the user
189
+ return $self->_set_version(exists($$self{version}) ? $$self{version} : $$self{default_version});
190
+ }
191
+
192
+ # Open the file unless filehandle is provided
193
+ if ( !exists($$self{fh}) )
194
+ {
195
+ my $cmd = "<$$self{file}";
196
+
197
+ my $tabix_args = '';
198
+ if ( exists($args{print_header}) && $args{print_header} ) { $tabix_args .= ' -h '; }
199
+ $tabix_args .= $$self{file};
200
+ if ( exists($args{region}) && defined($args{region}) ) { $tabix_args .= ' '.$args{region}; }
201
+
202
+ if ( -e $$self{file} && $$self{file}=~/\.gz/i )
203
+ {
204
+ if ( exists($args{region}) && defined($args{region}) )
205
+ {
206
+ $cmd = "tabix $tabix_args |";
207
+ }
208
+ else { $cmd = "gunzip -c $$self{file} |"; }
209
+ $$self{check_exit_status} = 1;
210
+ }
211
+ elsif ( $$self{file}=~m{^(?:http|ftp)://} )
212
+ {
213
+ if ( !exists($args{region}) ) { $tabix_args .= ' .'; }
214
+ $cmd = "tabix $tabix_args |";
215
+ $$self{check_exit_status} = 1;
216
+ }
217
+ open($$self{fh},$cmd) or $self->throw("$cmd: $!");
218
+ }
219
+
220
+ # Set the correct VCF version, but only when called for the first time
221
+ my $vcf = $self;
222
+ if ( !$$self{_version_set} )
223
+ {
224
+ my $first_line = $self->next_line();
225
+ $vcf = $self->_set_version($first_line);
226
+ $self->_unread_line($first_line);
227
+ }
228
+ return $vcf;
229
+ }
230
+
231
+
232
+
233
+ =head2 open
234
+
235
+ About : (Re)Open file. No need to call this explicitly unless reading from a different
236
+ region is requested.
237
+ Usage : $vcf->open(); # Read from the start
238
+ $vcf->open(region=>'1:12345-92345');
239
+ Args : region .. Supported only for tabix indexed files
240
+
241
+ =cut
242
+
243
+ sub open
244
+ {
245
+ my ($self,%args) = @_;
246
+ $self->close();
247
+ $self->_open(%args);
248
+ }
249
+
250
+
251
+ =head2 close
252
+
253
+ About : Close the filehandle
254
+ Usage : $vcf->close();
255
+ Args : none
256
+
257
+ =cut
258
+
259
+ sub close
260
+ {
261
+ my ($self) = @_;
262
+ if ( !$$self{fh} ) { return; }
263
+ close($$self{fh});
264
+ delete($$self{fh});
265
+ }
266
+
267
+
268
+ =head2 next_line
269
+
270
+ About : Reads next VCF line.
271
+ Usage : my $vcf = Vcf->new();
272
+ my $x = $vcf->next_line();
273
+ Args : none
274
+
275
+ =cut
276
+
277
+ sub next_line
278
+ {
279
+ my ($self) = @_;
280
+ if ( @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
281
+
282
+ my $line;
283
+ if ( !exists($$self{max_line_len}) )
284
+ {
285
+ $line = readline($$self{fh});
286
+ }
287
+ else
288
+ {
289
+ while (1)
290
+ {
291
+ $line = readline($$self{fh});
292
+ if ( !defined $line ) { last; }
293
+
294
+ my $len = length($line);
295
+ if ( $len>$$self{max_line_len} && !($line=~/^#/) )
296
+ {
297
+ if ( !($line=~/^([^\t]+)\t([^\t]+)/) ) { $self->throw("Could not parse the line: $line"); }
298
+ $self->warn("The VCF line too long, ignoring: $1 $2 .. len=$len\n");
299
+ next;
300
+ }
301
+ last;
302
+ }
303
+ }
304
+ if ( !defined $line && $$self{check_exit_status} )
305
+ {
306
+ my $pid = waitpid(-1, WNOHANG);
307
+ if ( $pid!=0 && $pid!=-1 && $? !=0 )
308
+ {
309
+ $self->throw("Error reading VCF file.\n");
310
+ }
311
+ }
312
+ return $line;
313
+ }
314
+
315
+ sub _unread_line
316
+ {
317
+ my ($self,$line) = @_;
318
+ unshift @{$$self{buffer}}, $line;
319
+ return;
320
+ }
321
+
322
+
323
+ =head2 next_data_array
324
+
325
+ About : Reads next VCF line and splits it into an array. The last element is chomped.
326
+ Usage : my $vcf = Vcf->new();
327
+ $vcf->parse_header();
328
+ my $x = $vcf->next_data_array();
329
+ Args : Optional line to parse
330
+
331
+ =cut
332
+
333
+ sub next_data_array
334
+ {
335
+ my ($self,$line) = @_;
336
+ if ( !$line ) { $line = $self->next_line(); }
337
+ if ( !$line ) { return undef; }
338
+ my @items = split(/\t/,$line);
339
+ chomp($items[-1]);
340
+ return \@items;
341
+ }
342
+
343
+
344
+ =head2 set_samples
345
+
346
+ About : Parsing big VCF files with many sample columns is slow, not parsing unwanted samples may speed things a bit.
347
+ Usage : my $vcf = Vcf->new();
348
+ $vcf->set_samples(include=>['NA0001']); # Exclude all but this sample. When the array is empty, all samples will be excluded.
349
+ $vcf->set_samples(exclude=>['NA0003']); # Include only this sample. When the array is empty, all samples will be included.
350
+ my $x = $vcf->next_data_hash();
351
+ Args : Optional line to parse
352
+
353
+ =cut
354
+
355
+ sub set_samples
356
+ {
357
+ my ($self,%args) = @_;
358
+
359
+ if ( exists($args{include}) )
360
+ {
361
+ for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 0; }
362
+ for my $sample (@{$args{include}})
363
+ {
364
+ if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
365
+ my $idx = $$self{has_column}{$sample} - 1;
366
+ $$self{samples_to_parse}[$idx] = 1;
367
+ }
368
+ }
369
+
370
+ if ( exists($args{exclude}) )
371
+ {
372
+ for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 1; }
373
+ for my $sample (@{$args{exclude}})
374
+ {
375
+ if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
376
+ my $idx = $$self{has_column}{$sample} - 1;
377
+ $$self{samples_to_parse}[$idx] = 0;
378
+ }
379
+ }
380
+ }
381
+
382
+
383
+ sub _set_version
384
+ {
385
+ my ($self,$version_line) = @_;
386
+
387
+ if ( $$self{_version_set} ) { return $self; }
388
+ $$self{_version_set} = 1;
389
+
390
+ $$self{version} = $$self{default_version};
391
+ if ( $version_line )
392
+ {
393
+ if ( $version_line=~/^(\d+(?:\.\d+)?)$/ )
394
+ {
395
+ $$self{version} = $1;
396
+ undef $version_line;
397
+ }
398
+ elsif ( !($version_line=~/^##fileformat=/i) or !($version_line=~/(\d+(?:\.\d+)?)\s*$/i) )
399
+ {
400
+ $self->warn("Could not parse the fileformat version string [$version_line], assuming VCFv$$self{default_version}\n");
401
+ undef $version_line;
402
+ }
403
+ else
404
+ {
405
+ $$self{version} = $1;
406
+ }
407
+ }
408
+
409
+ my $reader;
410
+ if ( $$self{version} eq '3.2' ) { $reader=Vcf3_2->new(%$self); }
411
+ elsif ( $$self{version} eq '3.3' ) { $reader=Vcf3_3->new(%$self); }
412
+ elsif ( $$self{version} eq '4.0' ) { $reader=Vcf4_0->new(%$self); }
413
+ elsif ( $$self{version} eq '4.1' ) { $reader=Vcf4_1->new(%$self); }
414
+ else
415
+ {
416
+ $self->warn(qq[The version "$$self{version}" not supported, assuming VCFv$$self{default_version}\n]);
417
+ $$self{version} = '4.1';
418
+ $reader = Vcf4_1->new(%$self);
419
+ }
420
+
421
+ $self = $reader;
422
+ # When changing version, change also the fileformat header line
423
+ if ( exists($$self{header_lines}) && exists($$self{header_lines}[0]{key}) && $$self{header_lines}[0]{key} eq 'fileformat' )
424
+ {
425
+ shift(@{$$self{header_lines}});
426
+ }
427
+
428
+ return $self;
429
+ }
430
+
431
+
432
+ #---------------------------------------
433
+
434
+ package VcfReader;
435
+ use base qw(Vcf);
436
+ use strict;
437
+ use warnings;
438
+ use Carp;
439
+ use Data::Dumper;
440
+
441
+ sub new
442
+ {
443
+ my ($class,@args) = @_;
444
+ my $self = {@args};
445
+ bless $self, ref($class) || $class;
446
+ return $self;
447
+ }
448
+
449
+
450
+ =head2 next_data_hash
451
+
452
+ About : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data.
453
+ Usage : my $vcf = Vcf->new();
454
+ $vcf->parse_header();
455
+ my $x = $vcf->next_data_hash();
456
+
457
+ # Or having a VCF data line $line
458
+ my $x = $vcf->next_data_hash($line);
459
+
460
+ Args : Optional line to parse.
461
+
462
+ =cut
463
+
464
+ sub next_data_hash
465
+ {
466
+ my ($self,$line) = @_;
467
+ if ( !$line ) { $line = $self->next_line(); }
468
+ if ( !$line ) { return undef; }
469
+ my @items;
470
+ if ( ref($line) eq 'ARRAY' ) { @items = @$line; }
471
+ else { @items = split(/\t/,$line); }
472
+ chomp($items[-1]);
473
+
474
+ my $cols = $$self{columns};
475
+ if ( !$cols )
476
+ {
477
+ $self->_fake_column_names(scalar @items - 9);
478
+ $cols = $$self{columns};
479
+ }
480
+
481
+ # Check the number of columns
482
+ if ( scalar @items != scalar @$cols )
483
+ {
484
+ if ( $line=~/^\s*$/ ) { $self->throw("Sorry, empty lines not allowed.\n"); }
485
+ if ( $line=~/^#/ ) { $self->throw("FIXME: parse_header must be called before next_data_hash.\n"); }
486
+
487
+ $self->warn("Different number of columns at $items[0]:$items[1] (expected ".scalar @$cols.", got ".scalar @items.")\n");
488
+ while ( $items[-1] eq '' ) { pop(@items); }
489
+ if ( scalar @items != scalar @$cols )
490
+ {
491
+ my @test = split(/\s+/,$line);
492
+ if ( scalar @test == scalar @$cols ) { $self->warn("(Have spaces been used instead of tabs?)\n\n"); }
493
+ else { $self->throw("Error not recoverable, exiting.\n"); }
494
+
495
+ @items = @test;
496
+ }
497
+ else { $self->warn("(Trailing tabs?)\n\n"); }
498
+ }
499
+ my %out;
500
+
501
+ # Mandatory fields
502
+ $out{CHROM} = $items[0];
503
+ $out{POS} = $items[1];
504
+ $out{ID} = $items[2];
505
+ $out{REF} = $items[3];
506
+ $out{ALT} = [ split(/,/,$items[4]) ];
507
+ $out{QUAL} = $items[5];
508
+ $out{FILTER} = [ split(/;/,$items[6]) ];
509
+
510
+ # INFO, e.g. NS=58;DP=258;AF=0.786;DB;H2
511
+ if ( defined $items[7] )
512
+ {
513
+ my %hash;
514
+ for my $info (split(/;/,$items[7]))
515
+ {
516
+ my ($key,$val) = split(/=/,$info);
517
+ if ( !defined $key )
518
+ {
519
+ $self->warn("Broken VCF file, empty INFO field at $items[0]:$items[1]\n");
520
+ next;
521
+ }
522
+ if ( defined $val )
523
+ {
524
+ $hash{$key} = $val;
525
+ }
526
+ elsif ( exists($$self{header}{INFO}{$key}) )
527
+ {
528
+ $hash{$key} = $$self{header}{INFO}{$key}{default};
529
+ }
530
+ else
531
+ {
532
+ $hash{$key} = undef;
533
+ }
534
+ }
535
+ $out{INFO} = \%hash;
536
+ }
537
+
538
+ # The FORMAT field may not be present. GT:GQ:DP:HQ
539
+ my $format;
540
+ if ( $$cols[8] || $items[8] )
541
+ {
542
+ $format = $out{FORMAT} = [ split(/:/,$items[8]) ];
543
+ if ( (!$$format[0] || $$format[0] ne 'GT') && !$$self{ignore_missing_GT} ) { $self->warn("Expected GT as the first genotype field at $items[0]:$items[1]\n"); }
544
+ }
545
+
546
+ # Genotype fields
547
+ my %gtypes;
548
+ my $check_nformat = $$self{drop_trailings} ? 0 : 1;
549
+ for (my $icol=9; $icol<@items; $icol++)
550
+ {
551
+ if ( $items[$icol] eq '' ) { $self->warn("Empty column $$cols[$icol] at $items[0]:$items[1]\n"); next; }
552
+ if ( exists($$self{samples_to_parse}) && !$$self{samples_to_parse}[$icol] ) { next; }
553
+
554
+ my @fields = split(/:/, $items[$icol]);
555
+ if ( $check_nformat && @fields != @$format )
556
+ {
557
+ $self->warn("Different number of fields in the format and the column $$cols[$icol] at $items[0]:$items[1] ("
558
+ .scalar @fields." vs ".scalar @$format.": [",join(',',@fields),"] vs [",join(',',@$format),"])\n");
559
+ }
560
+ my %hash;
561
+ for (my $ifield=0; $ifield<@fields; $ifield++)
562
+ {
563
+ $hash{$$format[$ifield]} = $fields[$ifield];
564
+ }
565
+ $gtypes{$$cols[$icol]} = \%hash;
566
+ }
567
+ $out{gtypes} = \%gtypes;
568
+
569
+ return \%out;
570
+ }
571
+
572
+
573
+ =head2 parse_header
574
+
575
+ About : Reads (and stores) the VCF header.
576
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header();
577
+ Args : silent .. do not warn about duplicate header lines
578
+
579
+ =cut
580
+
581
+ sub parse_header
582
+ {
583
+ my ($self,%args) = @_;
584
+
585
+ # First come the header lines prefixed by ##
586
+ while ($self->_next_header_line(%args)) { ; }
587
+
588
+ # Now comes the column names line prefixed by #
589
+ $self->_read_column_names();
590
+ }
591
+
592
+
593
+ =head2 _next_header_line
594
+
595
+ About : Stores the header lines and meta information, such as fields types, etc.
596
+ Args : silent .. do not warn about duplicate column names
597
+
598
+ =cut
599
+
600
+ sub _next_header_line
601
+ {
602
+ my ($self,%args) = @_;
603
+ my $line = $self->next_line();
604
+ if ( !defined $line ) { return undef; }
605
+ if ( substr($line,0,2) ne '##' )
606
+ {
607
+ $self->_unread_line($line);
608
+ return undef;
609
+ }
610
+
611
+ my $rec = $self->parse_header_line($line);
612
+ if ( $rec ) { $self->add_header_line($rec,%args); }
613
+
614
+ return $rec;
615
+ }
616
+
617
+ =head2 get_header_line
618
+
619
+ Usage : $vcf->get_header_line(key=>'INFO', ID=>'AC')
620
+ $vcf->get_header_line(key=>'FILTER', ID=>'q10')
621
+ $vcf->get_header_line(key=>'reference')
622
+ $vcf->get_header_line(key=>'contig',ID=>'20')
623
+ Args : Header line filter as in the example above
624
+ Returns : List ref of header line hashes matching the filter
625
+
626
+ =cut
627
+
628
+ sub get_header_line
629
+ {
630
+ my ($self,%filter) = @_;
631
+
632
+ my $key = $filter{key};
633
+ delete($filter{key});
634
+
635
+ my $id = $filter{ID};
636
+
637
+ my @out;
638
+ while (my ($hline_key,$hline_hash) = each %{$$self{header}})
639
+ {
640
+ if ( $key ne $hline_key ) { next; }
641
+
642
+ if ( defined $id )
643
+ {
644
+ if ( !exists($$hline_hash{$id}) ) { next; }
645
+ $hline_hash = $$hline_hash{$id};
646
+ }
647
+
648
+ my $match = 1;
649
+ while (my ($fkey,$fval) = each %filter)
650
+ {
651
+ if ( !exists($$hline_hash{$fkey}) or $$hline_hash{$fkey} ne $fval )
652
+ {
653
+ $match=0;
654
+ last;
655
+ }
656
+ }
657
+ if ( $match ) { push @out,$hline_hash }
658
+ }
659
+ return \@out;
660
+ }
661
+
662
+
663
+ =head2 add_header_line
664
+
665
+ Usage : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
666
+ $vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'})
667
+ Args : Header line hash as in the example above
668
+ Hash with additional parameters [optional]
669
+ silent .. do not warn about existing header keys
670
+ append .. append timestamp to the name of the new one
671
+ Returns :
672
+
673
+ =cut
674
+
675
+ sub add_header_line
676
+ {
677
+ my ($self,$rec,%args) = @_;
678
+
679
+ if ( !%args ) { $args{silent}=0; }
680
+
681
+ my $key = $$rec{key};
682
+ if ( !$key ) { $self->throw("Missing key: ",Dumper($rec)); }
683
+
684
+ if ( exists($$rec{Type}) )
685
+ {
686
+ if ( !exists($$rec{default}) )
687
+ {
688
+ my $type = $$rec{Type};
689
+ if ( exists($$self{defaults}{$type}) ) { $$rec{default}=$$self{defaults}{$type}; }
690
+ else { $$rec{default}=$$self{defaults}{default}; }
691
+ }
692
+ if ( !exists($$rec{handler}) )
693
+ {
694
+ my $type = $$rec{Type};
695
+ if ( !exists($$self{handlers}{$type}) )
696
+ {
697
+ $self->warn("Unknown type [$type]\n");
698
+ $type = 'String';
699
+ $$rec{Type} = $type;
700
+ }
701
+ if ( exists($$self{handlers}{$type}) ) { $$rec{handler}=$$self{handlers}{$type}; }
702
+ else { $self->throw("Unknown type [$type].\n"); }
703
+ }
704
+ }
705
+
706
+ if ( exists($$rec{ID}) )
707
+ {
708
+ my $id = $$rec{ID};
709
+ if ( exists($$self{header}{$key}{$id}) ) { $self->remove_header_line(%$rec); }
710
+ $$self{header}{$key}{$id} = $rec;
711
+ push @{$$self{header_lines}}, $rec;
712
+ return;
713
+ }
714
+
715
+ if ( $args{append} )
716
+ {
717
+ my @tm = gmtime(time);
718
+ $key = sprintf "%s_%d%.2d%.2d", $key,$tm[5]+1900,$tm[4],$tm[3];
719
+ my $i = 1;
720
+ while ( exists($$self{header}{$key.'.'.$i}) ) { $i++; }
721
+ $key = $key.'.'.$i;
722
+ $$rec{key} = $key;
723
+ }
724
+
725
+ if ( $self->_header_line_exists($key,$rec) ) { $self->remove_header_line(%$rec); }
726
+
727
+ push @{$$self{header}{$key}}, $rec;
728
+ if ( $$rec{key} eq 'fileformat' )
729
+ {
730
+ unshift @{$$self{header_lines}}, $rec;
731
+ }
732
+ else
733
+ {
734
+ push @{$$self{header_lines}}, $rec;
735
+ }
736
+ }
737
+
738
+ sub _header_line_exists
739
+ {
740
+ my ($self,$key,$rec) = @_;
741
+ if ( !exists($$self{header}{$key}) ) { return 0; }
742
+ if ( $key eq 'fileformat' ) { return 1; }
743
+ for my $hrec (@{$$self{header}{$key}})
744
+ {
745
+ my $differ = 0;
746
+ for my $item (keys %$rec)
747
+ {
748
+ if ( !exists($$hrec{$item}) ) { $differ=1; last; }
749
+ if ( $$hrec{$item} ne $$rec{$item} ) { $differ=1; last; }
750
+ }
751
+ if ( !$differ ) { return $hrec; }
752
+ }
753
+ return 0;
754
+ }
755
+
756
+ =head2 remove_header_line
757
+
758
+ Usage : $vcf->remove_header_line(key=>'INFO', ID=>'AC')
759
+ Args :
760
+ Returns :
761
+
762
+ =cut
763
+
764
+ sub remove_header_line
765
+ {
766
+ my ($self,%args) = @_;
767
+ my $key = $args{key};
768
+ for (my $i=0; $i<@{$$self{header_lines}}; $i++)
769
+ {
770
+ my $line = $$self{header_lines}[$i];
771
+ if ( $$line{key} ne $key ) { next; }
772
+ if ( exists($args{ID}) )
773
+ {
774
+ if ( $args{ID} ne $$line{ID} ) { next; }
775
+ delete($$self{header}{$key}{$args{ID}});
776
+ splice(@{$$self{header_lines}},$i,1);
777
+ }
778
+ else
779
+ {
780
+ my $to_be_removed = $self->_header_line_exists($key,\%args);
781
+ if ( !$to_be_removed ) { next; }
782
+ for (my $j=0; $j<@{$$self{header}{$key}}; $j++)
783
+ {
784
+ if ( $$self{header}{$key}[$j] eq $to_be_removed ) { splice(@{$$self{header}{$key}},$j,1); last; }
785
+ }
786
+ splice(@{$$self{header_lines}},$i,1);
787
+ }
788
+ }
789
+ }
790
+
791
+
792
+ =head2 parse_header_line
793
+
794
+ Usage : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36])
795
+ $vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"])
796
+ Args :
797
+ Returns :
798
+
799
+ =cut
800
+
801
+ sub parse_header_line
802
+ {
803
+ my ($self,$line) = @_;
804
+
805
+ chomp($line);
806
+ $line =~ s/^##//;
807
+
808
+ if ( !($line=~/^([^=]+)=/) ) { return { key=>$line, value=>'' }; }
809
+ my $key = $1;
810
+ my $value = $';
811
+
812
+ my $desc;
813
+ if ( $value=~/,\s*\"([^\"]+)\"\s*$/ ) { $desc=$1; $value=$`; }
814
+
815
+ if ( !$desc ) { return { key=>$key, value=>$value }; }
816
+
817
+ if ( $key eq 'INFO' or $key eq 'FORMAT' )
818
+ {
819
+ my ($id,$number,$type,@rest) = split(/,\s*/,$value);
820
+ if ( !$type or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
821
+ return { key=>$key, ID=>$id, Number=>$number, Type=>$type, Description=>$desc };
822
+ }
823
+ if ( $key eq 'FILTER' )
824
+ {
825
+ my ($id,@rest) = split(/,\s*/,$value);
826
+ if ( !$id or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
827
+ return { key=>$key, ID=>$id, Description=>$desc };
828
+ }
829
+ $self->throw("Could not parse the header line: $line\n");
830
+ }
831
+
832
+ =head2 _read_column_names
833
+
834
+ About : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index.
835
+ The indexes go from 1.
836
+ Usage : $vcf->_read_column_names();
837
+ Args : none
838
+
839
+ =cut
840
+
841
+ sub _read_column_names
842
+ {
843
+ my ($self) = @_;
844
+ my $line = $self->next_line();
845
+ if ( !defined $line or substr($line,0,1) ne '#' ) { $self->throw("Broken VCF header, no column names?"); }
846
+ $$self{column_line} = $line;
847
+
848
+ chomp($line);
849
+ my @cols = split(/\t/, substr($line,1));
850
+ my $ncols = scalar @cols;
851
+ if ( $ncols == 1 )
852
+ {
853
+ # If there is only one name, it can be space-seprated instead of tab separated
854
+ @cols = split(/\s+/, $cols[0]);
855
+ $ncols = scalar @cols;
856
+ chomp($line);
857
+ if ( $ncols <= 1 ) { $self->warn("Could not parse the column names. [$line]\n"); return; }
858
+ $self->warn("The column names not tab-separated? [$line]\n");
859
+ }
860
+
861
+ my $fields = $$self{mandatory};
862
+ my $nfields = scalar @$fields;
863
+
864
+ # Check the names of the mandatory columns
865
+ if ( $ncols < $nfields )
866
+ {
867
+ chomp($line);
868
+ $self->warn("Missing some of the mandatory column names.\n\tGot: $line\n\tExpected: #", join("\t",@{$$self{mandatory}}),"\n");
869
+ return;
870
+ }
871
+
872
+ for (my $i=0; $i<$ncols; $i++)
873
+ {
874
+ if ( $cols[$i]=~/^\s+/ or $cols[$i]=~/\s+$/ )
875
+ {
876
+ $self->warn("The column name contains leading/trailing spaces, removing: '$cols[$i]'\n");
877
+ $cols[$i] =~ s/^\s+//;
878
+ $cols[$i] =~ s/\s+$//;
879
+ }
880
+ if ( $i<$nfields && $cols[$i] ne $$fields[$i] )
881
+ {
882
+ $self->warn("Expected mandatory column [$$fields[$i]], got [$cols[$i]]\n");
883
+ $cols[$i] = $$fields[$i];
884
+ }
885
+ $$self{has_column}{$cols[$i]} = $i+1;
886
+ }
887
+ $$self{columns} = \@cols;
888
+ return;
889
+ }
890
+
891
+
892
+ =head2 _fake_column_names
893
+
894
+ About : When no header is present, fake column names as the default mandatory ones + numbers
895
+ Args : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present
896
+
897
+ =cut
898
+
899
+ sub _fake_column_names
900
+ {
901
+ my ($self,$ncols) = @_;
902
+
903
+ $$self{columns} = [ @{$$self{mandatory}} ];
904
+ if ( $ncols>=0 ) { push @{$$self{columns}}, 'FORMAT'; }
905
+ for (my $i=1; $i<=$ncols; $i++) { push @{$$self{columns}}, $i; }
906
+ }
907
+
908
+
909
+ =head2 format_header
910
+
911
+ About : Returns the header.
912
+ Usage : print $vcf->format_header();
913
+ Args : The columns to include on output [optional]
914
+
915
+ =cut
916
+
917
+ sub format_header
918
+ {
919
+ my ($self,$columns) = @_;
920
+
921
+ my $out = '';
922
+ for my $line (@{$$self{header_lines}}) { $out .= $self->format_header_line($line); }
923
+
924
+ # This is required when using the API for writing new VCF files and the caller does not add the line explicitly
925
+ if ( !exists($$self{header_lines}[0]{key}) or $$self{header_lines}[0]{key} ne 'fileformat' )
926
+ {
927
+ $out = "##fileformat=VCFv$$self{version}\n" .$out;
928
+ }
929
+ if ( !$$self{columns} ) { return $out; }
930
+
931
+ my @out_cols;
932
+ if ( $columns )
933
+ {
934
+ @out_cols = @{$$self{columns}}[0..8];
935
+ for my $col (@$columns)
936
+ {
937
+ if ( exists($$self{has_column}{$col}) ) { push @out_cols, $col; }
938
+ }
939
+ }
940
+ else
941
+ {
942
+ @out_cols = @{$$self{columns}};
943
+ }
944
+ $out .= "#". join("\t", @out_cols). "\n";
945
+
946
+ return $out;
947
+ }
948
+
949
+
950
+ =head2 format_line
951
+
952
+ About : Returns the header.
953
+ Usage : $x = $vcf->next_data_hash(); print $vcf->format_line($x);
954
+ $x = $vcf->next_data_array(); print $vcf->format_line($x);
955
+ Args 1 : The columns or hash in the format returned by next_data_hash or next_data_array.
956
+ 2 : The columns to include [optional]
957
+
958
+ =cut
959
+
960
+ sub format_line
961
+ {
962
+ my ($self,$record,$columns) = @_;
963
+
964
+ if ( ref($record) eq 'HASH' ) { return $self->_format_line_hash($record,$columns); }
965
+ $self->throw("FIXME: todo\n");
966
+ }
967
+
968
+
969
+ =head2 recalc_ac_an
970
+
971
+ About : Control if the AC and AN values should be updated.
972
+ Usage : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x);
973
+ Args 1 : 0 .. never recalculate
974
+ 1 .. recalculate if present
975
+ 2 .. recalculate if present and add if missing
976
+
977
+ =cut
978
+
979
+ sub recalc_ac_an
980
+ {
981
+ my ($self,$value) = @_;
982
+ if ( $value eq '0' || $value eq '1' || $value eq '2' ) { $$self{recalc_ac_an} = $value; }
983
+ return;
984
+ }
985
+
986
+
987
+
988
+ sub _format_line_hash
989
+ {
990
+ my ($self,$record,$columns) = @_;
991
+
992
+ if ( !$$self{columns} )
993
+ {
994
+ my $ngtypes = scalar keys %{$$record{gtypes}};
995
+ if ( !$ngtypes && !exists($$record{FORMAT}) ) { $ngtypes--; }
996
+ $self->_fake_column_names($ngtypes);
997
+ }
998
+ my $cols = $$self{columns};
999
+
1000
+ # CHROM POS ID REF
1001
+ my $out;
1002
+ $out .= $$record{CHROM} . "\t";
1003
+ $out .= $$record{POS} . "\t";
1004
+ $out .= (defined $$record{ID} ? $$record{ID} : '.') . "\t";
1005
+ $out .= $$record{REF} . "\t";
1006
+
1007
+ # ALT
1008
+ $out .= join(',',@{$$record{ALT}} ? @{$$record{ALT}} : '.');
1009
+
1010
+ # QUAL
1011
+ $out .= "\t". $$record{QUAL};
1012
+
1013
+ # FILTER
1014
+ $out .= "\t". join(';',$$record{FILTER} ? @{$$record{FILTER}} : '.');
1015
+
1016
+ # Collect the gtypes of interest
1017
+ my $gtypes;
1018
+ if ( $columns )
1019
+ {
1020
+ # Select only those gtypes keys with a corresponding key in columns.
1021
+ for my $col (@$columns) { $$gtypes{$col} = $$record{gtypes}{$col}; }
1022
+ }
1023
+ else
1024
+ {
1025
+ $gtypes = $$record{gtypes};
1026
+ }
1027
+
1028
+ # INFO
1029
+ # .. calculate NS, AN and AC, but only if recalc_ac_an is set
1030
+ my $needs_an_ac = $$self{recalc_ac_an}==2 ? 1 : 0;
1031
+ my @info;
1032
+ while (my ($key,$value) = each %{$$record{INFO}})
1033
+ {
1034
+ if ( $$self{recalc_ac_an}>0 )
1035
+ {
1036
+ if ( $key eq 'AN' ) { $needs_an_ac=1; next; }
1037
+ if ( $key eq 'AC' ) { $needs_an_ac=1; next; }
1038
+ }
1039
+ if ( defined $value )
1040
+ {
1041
+ push @info, "$key=$value";
1042
+ }
1043
+ elsif ( $key ne '.' )
1044
+ {
1045
+ push @info, $key;
1046
+ }
1047
+ }
1048
+ if ( $needs_an_ac )
1049
+ {
1050
+ my $nalt = scalar @{$$record{ALT}};
1051
+ if ( $nalt==1 && $$record{ALT}[0] eq '.' ) { $nalt=0; }
1052
+ my ($an,$ac) = $self->calc_an_ac($gtypes,$nalt);
1053
+ push @info, "AN=$an","AC=$ac";
1054
+ }
1055
+ if ( !@info ) { push @info, '.'; }
1056
+ $out .= "\t". join(';', sort @info);
1057
+
1058
+ # FORMAT, the column is not required, it may not be present when there are no genotypes
1059
+ if ( exists($$cols[8]) && defined $$record{FORMAT} )
1060
+ {
1061
+ $out .= "\t". join(':',@{$$record{FORMAT}});
1062
+ }
1063
+
1064
+ # Genotypes: output all columns or only a selection?
1065
+ my @col_names = $columns ? @$columns : @$cols[9..@$cols-1];
1066
+ my $nformat = defined $$record{FORMAT} ? @{$$record{FORMAT}} : 0;
1067
+ for my $col (@col_names)
1068
+ {
1069
+ my $gt = $$gtypes{$col};
1070
+ my $can_drop = $$self{drop_trailings};
1071
+ my @gtype;
1072
+ for (my $i=$nformat-1; $i>=0; $i--)
1073
+ {
1074
+ my $field = $$record{FORMAT}[$i];
1075
+ if ( $i==0 ) { $can_drop=0; }
1076
+
1077
+ if ( exists($$gt{$field}) )
1078
+ {
1079
+ $can_drop = 0;
1080
+ if ( ref($$gt{$field}) eq 'HASH' )
1081
+ {
1082
+ # Special treatment for Number=[AG] tags
1083
+ unshift @gtype, $self->format_AGtag($record,$$gt{$field},$field);
1084
+ }
1085
+ else
1086
+ {
1087
+ unshift @gtype,$$gt{$field};
1088
+ }
1089
+ }
1090
+ elsif ( $can_drop ) { next; }
1091
+ elsif ( exists($$self{header}{FORMAT}{$field}{default}) ) { unshift @gtype,$$self{header}{FORMAT}{$field}{default}; $can_drop=0; }
1092
+ else { $self->throw(qq[No value for the field "$field" and no default available, column "$col" at $$record{CHROM}:$$record{POS}.\n]); }
1093
+ }
1094
+ $out .= "\t" . join(':',@gtype);
1095
+ }
1096
+
1097
+ $out .= "\n";
1098
+ return $out;
1099
+ }
1100
+
1101
+ sub calc_an_ac
1102
+ {
1103
+ my ($self,$gtypes,$nalleles) = @_;
1104
+ my $sep_re = $$self{regex_gtsep};
1105
+ my ($an,%ac_counts);
1106
+ if ( defined $nalleles )
1107
+ {
1108
+ for (my $i=1; $i<=$nalleles; $i++) { $ac_counts{$i}=0; }
1109
+ }
1110
+ $an = 0;
1111
+ for my $gt (keys %$gtypes)
1112
+ {
1113
+ my $value = $$gtypes{$gt}{GT};
1114
+ if ( !defined $value ) { next; } # GT may not be present
1115
+ my ($al1,$al2) = split($sep_re,$value);
1116
+ if ( defined($al1) && $al1 ne '.' )
1117
+ {
1118
+ $an++;
1119
+ if ( $al1 ne '0' ) { $ac_counts{$al1}++; }
1120
+ }
1121
+ if ( defined($al2) && $al2 ne '.' )
1122
+ {
1123
+ $an++;
1124
+ if ( $al2 ne '0' ) { $ac_counts{$al2}++; }
1125
+ }
1126
+ }
1127
+ my @ac;
1128
+ for my $ac ( sort { $a <=> $b } keys %ac_counts) { push @ac, $ac_counts{$ac}; }
1129
+ if ( !@ac ) { @ac = ('0'); }
1130
+ return ($an,join(',',@ac),\@ac);
1131
+ }
1132
+
1133
+ sub _validate_alt_field
1134
+ {
1135
+ my ($self,$values,$ref) = @_;
1136
+
1137
+ for (my $i=0; $i<@$values; $i++)
1138
+ {
1139
+ for (my $j=0; $j<$i; $j++)
1140
+ {
1141
+ if ( $$values[$i] eq $$values[$j] ) { return "The alleles not unique: $$values[$i]"; }
1142
+ }
1143
+ if ( $$values[$i] eq $ref ) { return "REF allele listed in the ALT field??"; }
1144
+ }
1145
+ return undef;
1146
+ }
1147
+
1148
+ =head2 validate_alt_field
1149
+
1150
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
1151
+ Args : The ALT arrayref
1152
+ Returns : Error message in case of an error.
1153
+
1154
+ =cut
1155
+
1156
+ sub validate_alt_field
1157
+ {
1158
+ my ($self,$values,$ref) = @_;
1159
+
1160
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
1161
+
1162
+ my $ret = $self->_validate_alt_field($values,$ref);
1163
+ if ( $ret ) { return $ret; }
1164
+
1165
+ my @err;
1166
+ for my $item (@$values)
1167
+ {
1168
+ if ( $item=~/^[ACTGN]$/ ) { next; }
1169
+ elsif ( $item=~/^I[ACTGN]+$/ ) { next; }
1170
+ elsif ( $item=~/^D\d+$/ ) { next; }
1171
+
1172
+ push @err, $item;
1173
+ }
1174
+ if ( !@err ) { return undef; }
1175
+ return 'Could not parse the allele(s) [' .join(',',@err). ']';
1176
+ }
1177
+
1178
+ =head2 event_type
1179
+
1180
+ Usage : my $x = $vcf->next_data_hash();
1181
+ my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
1182
+ for my $allele (@$alleles)
1183
+ {
1184
+ my ($type,$len,$ht) = $vcf->event_type($x,$allele);
1185
+ }
1186
+ or
1187
+ my ($type,$len,$ht) = $vcf->event_type($ref,$al);
1188
+ Args : VCF data line parsed by next_data_hash or the reference allele
1189
+ : Allele
1190
+ Returns : 's' for SNP and number of SNPs in the record
1191
+ 'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
1192
+ 'r' identical to the reference, length 0
1193
+ 'o' for other (complex events) and the number of affected bases
1194
+ 'b' breakend
1195
+ 'u' unknown
1196
+
1197
+ =cut
1198
+
1199
+ sub event_type
1200
+ {
1201
+ my ($self,$rec,$allele) = @_;
1202
+
1203
+ my $ref = $rec;
1204
+ if ( ref($rec) eq 'HASH' )
1205
+ {
1206
+ if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
1207
+ $ref = $$rec{REF};
1208
+ }
1209
+
1210
+ my ($type,$len,$ht);
1211
+ if ( $allele eq $ref or $allele eq '.' ) { $len=0; $type='r'; $ht=$ref; }
1212
+ elsif ( $allele=~/^[ACGT]$/ ) { $len=1; $type='s'; $ht=$allele; }
1213
+ elsif ( $allele=~/^I/ ) { $len=length($allele)-1; $type='i'; $ht=$'; }
1214
+ elsif ( $allele=~/^D(\d+)/ ) { $len=-$1; $type='i'; $ht=''; }
1215
+ else
1216
+ {
1217
+ my $chr = ref($rec) eq 'HASH' ? $$rec{CHROM} : 'undef';
1218
+ my $pos = ref($rec) eq 'HASH' ? $$rec{POS} : 'undef';
1219
+ $self->throw("Eh?: $chr:$pos .. $ref $allele\n");
1220
+ }
1221
+
1222
+ if ( ref($rec) eq 'HASH' )
1223
+ {
1224
+ $$rec{_cached_events}{$allele} = [$type,$len,$ht];
1225
+ }
1226
+ return ($type,$len,$ht);
1227
+ }
1228
+
1229
+
1230
+ =head2 parse_AGtags
1231
+
1232
+ About : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
1233
+ Usage : my $x = $vcf->next_data_hash(); my $values = $vcf->parse_AGtags($x);
1234
+ Args : VCF data line parsed by next_data_hash
1235
+ : Mapping between ALT representations based on different REFs [optional]
1236
+ : New REF [optional]
1237
+ Returns : Hash {Allele=>Value}
1238
+
1239
+ =cut
1240
+
1241
+ sub parse_AGtags
1242
+ {
1243
+ my ($self,$rec,$ref_alt_map,$new_ref) = @_;
1244
+
1245
+ if ( !exists($$rec{gtypes}) ) { return; }
1246
+
1247
+ my (@atags,@gtags);
1248
+ for my $fmt (@{$$rec{FORMAT}})
1249
+ {
1250
+ # These have been listed explicitly for proper merging of v4.0 VCFs
1251
+ if ( $$self{fix_v40_AGtags} )
1252
+ {
1253
+ if ( $fmt eq 'GL' or $fmt eq 'PL' ) { push @gtags,$fmt; next; }
1254
+ if ( $fmt eq 'AC' or $fmt eq 'AF' ) { push @atags,$fmt; next; }
1255
+ }
1256
+ if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
1257
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { push @atags,$fmt; next; }
1258
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { push @gtags,$fmt; next; }
1259
+ }
1260
+ my $missing = $$self{defaults}{default};
1261
+ if ( @atags )
1262
+ {
1263
+ # Parse Number=A tags
1264
+ my $alts;
1265
+ if ( defined $ref_alt_map )
1266
+ {
1267
+ $alts = [];
1268
+ for my $alt (@{$$rec{ALT}})
1269
+ {
1270
+ if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: $new_ref $alt...?\n"); }
1271
+ push @$alts, $$ref_alt_map{$new_ref}{$alt};
1272
+ }
1273
+ }
1274
+ else
1275
+ {
1276
+ $alts = $$rec{ALT};
1277
+ }
1278
+ for my $tag (@atags)
1279
+ {
1280
+ for my $sample (values %{$$rec{gtypes}})
1281
+ {
1282
+ if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
1283
+ my @values = split(/,/,$$sample{$tag});
1284
+ $$sample{$tag} = {};
1285
+ for (my $i=0; $i<@values; $i++)
1286
+ {
1287
+ $$sample{$tag}{$$alts[$i]} = $values[$i];
1288
+ }
1289
+ }
1290
+ }
1291
+ }
1292
+ if ( @gtags )
1293
+ {
1294
+ # Parse Number=G tags
1295
+ my @alleles;
1296
+ if ( defined $ref_alt_map )
1297
+ {
1298
+ push @alleles, $new_ref;
1299
+ for my $alt (@{$$rec{ALT}})
1300
+ {
1301
+ if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: [$new_ref] [$alt]...?\n", Dumper($ref_alt_map,$rec)); }
1302
+ push @alleles, $$ref_alt_map{$new_ref}{$alt};
1303
+ }
1304
+ }
1305
+ else
1306
+ {
1307
+ @alleles = ($$rec{REF},@{$$rec{ALT}});
1308
+ }
1309
+ my @gtypes;
1310
+ for (my $i=0; $i<@alleles; $i++)
1311
+ {
1312
+ for (my $j=0; $j<=$i; $j++)
1313
+ {
1314
+ push @gtypes, $alleles[$i].'/'.$alleles[$j];
1315
+ }
1316
+ }
1317
+ for my $tag (@gtags)
1318
+ {
1319
+ for my $sample (values %{$$rec{gtypes}})
1320
+ {
1321
+ if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
1322
+ my @values = split(/,/,$$sample{$tag});
1323
+ $$sample{$tag} = {};
1324
+ for (my $i=0; $i<@values; $i++)
1325
+ {
1326
+ $$sample{$tag}{$gtypes[$i]} = $values[$i];
1327
+ }
1328
+ }
1329
+ }
1330
+ }
1331
+ }
1332
+
1333
+ =head2 format_AGtag
1334
+
1335
+ About : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL)
1336
+ Usage :
1337
+ Args :
1338
+ :
1339
+ :
1340
+ Returns :
1341
+
1342
+ =cut
1343
+
1344
+ sub format_AGtag
1345
+ {
1346
+ my ($self,$record,$tag_data,$tag) = @_;
1347
+
1348
+ # The FORMAT field is checked only once and the results are cached.
1349
+ if ( !exists($$record{_atags}) )
1350
+ {
1351
+ $$record{_atags} = {};
1352
+
1353
+ # Check if there are any A,G tags
1354
+ for my $fmt (@{$$record{FORMAT}})
1355
+ {
1356
+ # These have been listed explicitly for proper merging of v4.0 VCFs
1357
+ if ( $$self{fix_v40_AGtags} )
1358
+ {
1359
+ if ( $fmt eq 'GL' or $fmt eq 'PL' ) { $$record{_gtags}{$fmt}=1; next; }
1360
+ if ( $fmt eq 'AC' or $fmt eq 'AF' ) { $$record{_atags}{$fmt}=1; next; }
1361
+ }
1362
+ if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
1363
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { $$record{_atags}{$fmt}=1; next; }
1364
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { $$record{_gtags}{$fmt}=1; next; }
1365
+ }
1366
+ }
1367
+
1368
+ my @out;
1369
+ if ( exists($$record{_atags}{$tag}) )
1370
+ {
1371
+ for my $alt (@{$$record{ALT}})
1372
+ {
1373
+ push @out, exists($$tag_data{$alt}) ? $$tag_data{$alt} : $$self{defaults}{default};
1374
+ }
1375
+ }
1376
+
1377
+ if ( exists($$record{_gtags}{$tag}) )
1378
+ {
1379
+ my $gtypes = $$record{_gtypes};
1380
+ my $gtypes2 = $$record{_gtypes2};
1381
+ if ( !defined $gtypes )
1382
+ {
1383
+ $gtypes = [];
1384
+ $gtypes2 = [];
1385
+
1386
+ my @alleles = ( $$record{REF}, @{$$record{ALT}} );
1387
+ for (my $i=0; $i<@alleles; $i++)
1388
+ {
1389
+ for (my $j=0; $j<=$i; $j++)
1390
+ {
1391
+ push @$gtypes, $alleles[$i].'/'.$alleles[$j];
1392
+ push @$gtypes2, $alleles[$j].'/'.$alleles[$i];
1393
+ }
1394
+ }
1395
+
1396
+ $$record{_gtypes} = $gtypes;
1397
+ $$record{_gtypes2} = $gtypes2;
1398
+ }
1399
+
1400
+ for (my $i=0; $i<@$gtypes; $i++)
1401
+ {
1402
+ my $gt = $$gtypes[$i];
1403
+ if ( !exists($$tag_data{$gt}) ) { $gt = $$gtypes2[$i]; }
1404
+ push @out, exists($$tag_data{$gt}) ? $$tag_data{$gt} : $$self{defaults}{default};
1405
+ }
1406
+ }
1407
+
1408
+ return join(',',@out);
1409
+ }
1410
+
1411
+ =head2 parse_alleles
1412
+
1413
+ About : Deprecated, use parse_haplotype instead.
1414
+ Usage : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001');
1415
+ Args : VCF data line parsed by next_data_hash
1416
+ : The genotype column name
1417
+ Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string.
1418
+
1419
+ =cut
1420
+
1421
+ sub parse_alleles
1422
+ {
1423
+ my ($self,$rec,$column) = @_;
1424
+ if ( !exists($$rec{gtypes}) || !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
1425
+
1426
+ my $gtype = $$rec{gtypes}{$column}{GT};
1427
+ if ( !($gtype=~$$self{regex_gt}) ) { $self->throw("Could not parse gtype string [$gtype] [$$rec{CHROM}:$$rec{POS}]\n"); }
1428
+ my $al1 = $1;
1429
+ my $sep = $2;
1430
+ my $al2 = $3;
1431
+
1432
+ if ( !$al1 ) { $al1 = $$rec{REF}; }
1433
+ elsif ( $al1 ne '.' )
1434
+ {
1435
+ if ( !($al1=~/^\d+$/) ) { $self->throw("Uh, what is this? [$al1] $$rec{CHROM}:$$rec{POS}\n"); }
1436
+ $al1 = $$rec{ALT}[$al1-1];
1437
+ }
1438
+
1439
+ if ( !defined $al2 or $al2 eq '' )
1440
+ {
1441
+ $sep = '';
1442
+ $al2 = '';
1443
+ }
1444
+ else
1445
+ {
1446
+ if ( !$al2 ) { $al2 = $$rec{REF}; }
1447
+ elsif ( $al2 ne '.' ) { $al2 = $$rec{ALT}[$al2-1]; }
1448
+ }
1449
+ return ($al1,$sep,$al2);
1450
+ }
1451
+
1452
+ =head2 parse_haplotype
1453
+
1454
+ About : Similar to parse_alleles, supports also multiploid VCFs.
1455
+ Usage : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
1456
+ Args : VCF data line parsed by next_data_hash
1457
+ : The genotype column name
1458
+ Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags. The values
1459
+ can be cashed and must be therefore considered read only!
1460
+
1461
+ =cut
1462
+
1463
+ sub parse_haplotype
1464
+ {
1465
+ my ($self,$rec,$column) = @_;
1466
+ if ( !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
1467
+ if ( !exists($$rec{gtypes}{$column}{GT}) ) { return (['.'],[],0,1); }
1468
+
1469
+ my $gtype = $$rec{gtypes}{$column}{GT};
1470
+ if ( exists($$rec{_cached_haplotypes}{$gtype}) ) { return (@{$$rec{_cached_haplotypes}{$gtype}}); }
1471
+
1472
+ my @alleles = ();
1473
+ my @seps = ();
1474
+ my $is_phased = 0;
1475
+ my $is_empty = 1;
1476
+
1477
+ my $buf = $gtype;
1478
+ while ($buf ne '')
1479
+ {
1480
+ if ( !($buf=~m{^(\.|\d+)([|/]?)}) ) { $self->throw("Could not parse gtype string [$gtype] .. $$rec{CHROM}:$$rec{POS} $column\n"); }
1481
+ $buf = $';
1482
+
1483
+ if ( $1 eq '.' ) { push @alleles,'.'; }
1484
+ else
1485
+ {
1486
+ $is_empty = 0;
1487
+ if ( $1 eq '0' ) { push @alleles,$$rec{REF}; }
1488
+ elsif ( exists($$rec{ALT}[$1-1]) ) { push @alleles,$$rec{ALT}[$1-1]; }
1489
+ else { $self->throw(qq[The haplotype indexes in "$gtype" do not match the ALT column .. $$rec{CHROM}:$$rec{POS} $column\n]); }
1490
+ }
1491
+ if ( $2 )
1492
+ {
1493
+ if ( $2 eq '|' ) { $is_phased=1; }
1494
+ push @seps,$2;
1495
+ }
1496
+ }
1497
+ $$rec{_cached_haplotypes}{$gtype} = [\@alleles,\@seps,$is_phased,$is_empty];
1498
+ return (@{$$rec{_cached_haplotypes}{$gtype}});
1499
+ }
1500
+
1501
+ =head2 format_haplotype
1502
+
1503
+ Usage : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps);
1504
+
1505
+ =cut
1506
+
1507
+ sub format_haplotype
1508
+ {
1509
+ my ($self,$alleles,$seps) = @_;
1510
+ if ( @$alleles != @$seps+1 ) { $self->throw(sprintf("Uh: %d vs %d\n",scalar @$alleles,scalar @$seps),Dumper($alleles,$seps)); }
1511
+ my $out = $$alleles[0];
1512
+ for (my $i=1; $i<@$alleles; $i++)
1513
+ {
1514
+ $out .= $$seps[$i-1];
1515
+ $out .= $$alleles[$i];
1516
+ }
1517
+ return $out;
1518
+ }
1519
+
1520
+
1521
+ =head2 format_genotype_strings
1522
+
1523
+ Usage : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 };
1524
+ $vcf->format_genotype_strings($x);
1525
+ print $vcf->format_line($x);
1526
+ Args 1 : VCF data line in the format as if parsed by next_data_hash with alleles written as letters.
1527
+ 2 : Optionally, a subset of columns can be supplied. See also format_line.
1528
+ Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref alleles
1529
+ numbers starting from 1.
1530
+
1531
+ =cut
1532
+
1533
+ sub format_genotype_strings
1534
+ {
1535
+ my ($self,$rec,$columns) = @_;
1536
+
1537
+ if ( !exists($$rec{gtypes}) ) { return; }
1538
+
1539
+ my $ref = $$rec{REF};
1540
+ my $nalts = 0;
1541
+ my %alts = ();
1542
+ my $gt_re = $$self{regex_gt2};
1543
+
1544
+ if ( !$columns ) { $columns = [keys %{$$rec{gtypes}}]; }
1545
+
1546
+ for my $key (@$columns)
1547
+ {
1548
+ my $gtype = $$rec{gtypes}{$key}{GT};
1549
+ my $buf = $gtype;
1550
+ my $out = '';
1551
+ while ($buf ne '')
1552
+ {
1553
+ if ( !($buf=~$gt_re) ) { $self->throw("Could not parse gtype string [$gtype]\n"); }
1554
+ $buf = $';
1555
+
1556
+ my $al = $1;
1557
+ my $sep = $2;
1558
+ if ( $al eq $ref or $al eq '0' or $al eq '*' ) { $al=0; }
1559
+ else
1560
+ {
1561
+ if ( $al=~/^\d+$/ )
1562
+ {
1563
+ if ( !exists($$rec{ALT}[$al-1]) ) { $self->throw("Broken ALT, index $al out of bounds\n"); }
1564
+ $al = $$rec{ALT}[$al-1];
1565
+ }
1566
+
1567
+ if ( exists($alts{$al}) ) { $al = $alts{$al} }
1568
+ elsif ( $al=~$$self{regex_snp} or $al=~$$self{regex_ins} or $al=~$$self{regex_del} )
1569
+ {
1570
+ $alts{$al} = ++$nalts;
1571
+ $al = $nalts;
1572
+ }
1573
+ elsif ( $al ne '.' )
1574
+ {
1575
+ $self->throw("Could not parse the genotype string [$gtype]\n");
1576
+ }
1577
+
1578
+ }
1579
+ $out .= $al;
1580
+ if ( $sep ) { $out .= $sep; }
1581
+ }
1582
+ $$rec{gtypes}{$key}{GT} = $out;
1583
+ }
1584
+ $$rec{ALT} = [ sort { $alts{$a}<=>$alts{$b} } keys %alts ];
1585
+ }
1586
+
1587
+ sub fill_ref_alt_mapping
1588
+ {
1589
+ my ($self,$map) = @_;
1590
+
1591
+ my $new_ref;
1592
+ for my $ref (keys %$map)
1593
+ {
1594
+ $new_ref = $ref;
1595
+ if ( $ref ne $new_ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
1596
+ for my $alt (keys %{$$map{$ref}})
1597
+ {
1598
+ $$map{$ref}{$alt} = $alt;
1599
+ }
1600
+ }
1601
+ $$map{$new_ref}{$new_ref} = $new_ref;
1602
+ return $new_ref;
1603
+ }
1604
+
1605
+
1606
+ =head2 format_header_line
1607
+
1608
+ Usage : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
1609
+ Args :
1610
+ Returns :
1611
+
1612
+ =cut
1613
+
1614
+ sub format_header_line
1615
+ {
1616
+ my ($self,$rec) = @_;
1617
+ my $line = "##$$rec{key}";
1618
+ $line .= "=$$rec{value}" unless !exists($$rec{value});
1619
+ $line .= "=$$rec{ID}" unless !exists($$rec{ID});
1620
+ $line .= ",$$rec{Number}" unless !exists($$rec{Number});
1621
+ $line .= ",$$rec{Type}" unless !exists($$rec{Type});
1622
+ $line .= qq[,"$$rec{Description}"] unless !exists($$rec{Description});
1623
+ $line .= "\n";
1624
+ return $line;
1625
+ }
1626
+
1627
+ =head2 add_columns
1628
+
1629
+ Usage : $vcf->add_columns('NA001','NA0002');
1630
+ Args :
1631
+ Returns :
1632
+
1633
+ =cut
1634
+
1635
+ sub add_columns
1636
+ {
1637
+ my ($self,@columns) = @_;
1638
+ if ( !$$self{columns} )
1639
+ {
1640
+ # The columns should be initialized de novo. Figure out if the @columns contain also the mandatory
1641
+ # columns and if FORMAT should be present (it can be absent when there is no genotype column present).
1642
+ my $has_other = 0;
1643
+ for my $col (@columns)
1644
+ {
1645
+ if ( !exists($$self{reserved}{cols}{$col}) ) { $has_other=1; last; }
1646
+ }
1647
+
1648
+ $$self{columns} = [ @{$$self{mandatory}} ];
1649
+ if ( $has_other ) { push @{$$self{columns}},'FORMAT'; }
1650
+
1651
+ for my $col (@{$$self{columns}}) { $$self{has_column}{$col}=1; }
1652
+ }
1653
+ my $ncols = @{$$self{columns}};
1654
+ for my $col (@columns)
1655
+ {
1656
+ if ( $$self{has_column}{$col} ) { next; }
1657
+ $ncols++;
1658
+ push @{$$self{columns}}, $col;
1659
+ }
1660
+ }
1661
+
1662
+ =head2 add_format_field
1663
+
1664
+ Usage : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x);
1665
+ Args : The record obtained by next_data_hash
1666
+ : The field name
1667
+ Returns :
1668
+
1669
+ =cut
1670
+
1671
+ sub add_format_field
1672
+ {
1673
+ my ($self,$rec,$field) = @_;
1674
+
1675
+ if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
1676
+
1677
+ for my $key (@{$$rec{FORMAT}})
1678
+ {
1679
+ if ( $key eq $field ) { return; } # already there
1680
+ }
1681
+ push @{$$rec{FORMAT}}, $field;
1682
+ }
1683
+
1684
+
1685
+ =head2 remove_format_field
1686
+
1687
+ Usage : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x);
1688
+ Args : The record obtained by next_data_hash
1689
+ : The field name
1690
+ Returns :
1691
+
1692
+ =cut
1693
+
1694
+ sub remove_format_field
1695
+ {
1696
+ my ($self,$rec,$field) = @_;
1697
+
1698
+ if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
1699
+
1700
+ my $i = 0;
1701
+ for my $key (@{$$rec{FORMAT}})
1702
+ {
1703
+ if ( $key eq $field ) { splice @{$$rec{FORMAT}},$i,1; }
1704
+ $i++;
1705
+ }
1706
+ }
1707
+
1708
+
1709
+ =head2 add_info_field
1710
+
1711
+ Usage : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n";
1712
+ Args : The record obtained by next_data_array
1713
+ : The INFO field name and value pairs. If value is undef and the key is present in $$x[7],
1714
+ it will be removed. To add fields without a value, use empty string ''.
1715
+ Returns : The formatted INFO.
1716
+
1717
+ =cut
1718
+
1719
+ sub add_info_field
1720
+ {
1721
+ my ($self,$info,%fields) = @_;
1722
+
1723
+ my @out = ();
1724
+
1725
+ # First handle the existing values, keep everything unless in %fields
1726
+ for my $field (split(/;/,$info))
1727
+ {
1728
+ my ($key,$value) = split(/=/,$field);
1729
+ if ( $key eq '.' ) { next; }
1730
+ if ( !exists($fields{$key}) ) { push @out,$field; next; }
1731
+ }
1732
+
1733
+ # Now add the new values and remove the unwanted ones
1734
+ while (my ($key,$value)=each %fields)
1735
+ {
1736
+ if ( !defined($value) ) { next; } # this one should be removed
1737
+ if ( $value eq '' ) { push @out,$key; } # this one is of the form HM2 in contrast to DP=3
1738
+ else { push @out,"$key=$value"; } # this is the standard key=value pair
1739
+ }
1740
+ if ( !@out ) { push @out,'.'; }
1741
+ return join(';',@out);
1742
+ }
1743
+
1744
+
1745
+ =head2 add_filter
1746
+
1747
+ Usage : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n";
1748
+ Args : The record obtained by next_data_array or next_data_hash
1749
+ : The key-value pairs for filter to be added. If value is 1, the filter will be added. If 0, the filter will be removed.
1750
+ Returns : The formatted filter field.
1751
+
1752
+ =cut
1753
+
1754
+ sub add_filter
1755
+ {
1756
+ my ($self,$filter,%filters) = @_;
1757
+
1758
+ my @out = ();
1759
+ my @filters = ref($filter) eq 'ARRAY' ? @$filter : split(/;/,$filter);
1760
+
1761
+ # First handle the existing filters, keep everything unless in %filters
1762
+ for my $key (@filters)
1763
+ {
1764
+ if ( $key eq '.' or $key eq 'PASS' ) { next; }
1765
+ if ( !exists($filters{$key}) ) { push @out,$key; next; }
1766
+ }
1767
+
1768
+ # Now add the new filters and remove the unwanted ones
1769
+ while (my ($key,$value)=each %filters)
1770
+ {
1771
+ if ( !$value ) { next; } # this one should be removed
1772
+ push @out,$key; # this one should be added
1773
+ }
1774
+ if ( !@out ) { push @out,'PASS'; }
1775
+ return ref($filter) eq 'ARRAY' ? return \@out : join(';',@out);
1776
+ }
1777
+
1778
+
1779
+ =head2 validate_filter_field
1780
+
1781
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER});
1782
+ Args : The FILTER arrayref
1783
+ Returns : Error message in case of an error.
1784
+
1785
+ =cut
1786
+
1787
+ sub validate_filter_field
1788
+ {
1789
+ my ($self,$values) = @_;
1790
+
1791
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
1792
+
1793
+ my @errs;
1794
+ my @missing;
1795
+ for my $item (@$values)
1796
+ {
1797
+ if ( $item eq $$self{filter_passed} ) { next; }
1798
+ if ( $item=~/,/ ) { push @errs,"Expected semicolon as a separator."; }
1799
+ if ( exists($$self{reserved}{FILTER}{$item}) ) { return qq[The filter name "$item" cannot be used, it is a reserved word.]; }
1800
+ if ( exists($$self{header}{FILTER}{$item}) ) { next; }
1801
+ push @missing, $item;
1802
+ $self->add_header_line({key=>'FILTER',ID=>$item,Description=>'No description'});
1803
+ }
1804
+ if ( !@errs && !@missing ) { return undef; }
1805
+ if ( $$self{version}<3.3 ) { return undef; }
1806
+ return join(',',@errs) .' '. 'The filter(s) [' . join(',',@missing) . '] not listed in the header.';
1807
+ }
1808
+
1809
+
1810
+ sub _add_unknown_field
1811
+ {
1812
+ my ($self,$field,$key,$nargs) = @_;
1813
+ $self->add_header_line({key=>$field,ID=>$key,Number=>$nargs,Type=>'String',Description=>'No description'});
1814
+ }
1815
+
1816
+ =head2 validate_header
1817
+
1818
+ About : Version specific header validation code.
1819
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $vcf->validate_header();
1820
+ Args :
1821
+
1822
+ =cut
1823
+
1824
+ sub validate_header
1825
+ {
1826
+ my ($self) = @_;
1827
+ }
1828
+
1829
+ =head2 validate_line
1830
+
1831
+ About : Version specific line validation code.
1832
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x);
1833
+ Args :
1834
+
1835
+ =cut
1836
+
1837
+ sub validate_line
1838
+ {
1839
+ my ($self,$x) = @_;
1840
+
1841
+ # Is the ID composed of alphanumeric chars
1842
+ if ( !($$x{ID}=~/^[\w;\.]+$/) ) { $self->warn("Expected alphanumeric ID at $$x{CHROM}:$$x{POS}, but got [$$x{ID}]\n"); }
1843
+ }
1844
+
1845
+ =head2 validate_info_field
1846
+
1847
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT});
1848
+ Args : The INFO hashref
1849
+ Returns : Error message in case of an error.
1850
+
1851
+ =cut
1852
+
1853
+ sub validate_info_field
1854
+ {
1855
+ my ($self,$values,$alts) = @_;
1856
+
1857
+ if ( !defined $values ) { return 'Empty INFO field.'; }
1858
+
1859
+ # First handle the empty INFO field (.)
1860
+ if ( scalar keys %$values == 1 && exists($$values{'.'}) ) { return undef; }
1861
+
1862
+ # Expected numbers
1863
+ my $ng = -1;
1864
+ my $na = -1;
1865
+ if ( $$self{version}>4.0 )
1866
+ {
1867
+ if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
1868
+ else
1869
+ {
1870
+ $na = @$alts;
1871
+ $ng = (1+$na+1)*($na+1)/2;
1872
+ }
1873
+ }
1874
+
1875
+ my @errs;
1876
+ while (my ($key,$value) = each %$values)
1877
+ {
1878
+ if ( !exists($$self{header}{INFO}{$key}) )
1879
+ {
1880
+ push @errs, "INFO tag [$key] not listed in the header" unless $$self{version}<3.3;
1881
+ my $nargs = defined $value ? -1 : 0;
1882
+ $self->_add_unknown_field('INFO',$key,$nargs);
1883
+ next;
1884
+ }
1885
+ my $type = $$self{header}{INFO}{$key};
1886
+
1887
+ my @vals = defined $value ? split(/,/, $value) : ();
1888
+ if ( $$type{Number} eq 'G' )
1889
+ {
1890
+ if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $ng, found ".scalar @vals.")"; }
1891
+ }
1892
+ elsif ( $$type{Number} eq 'A' )
1893
+ {
1894
+ if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $na, found ".scalar @vals.")"; }
1895
+ }
1896
+ elsif ( $$type{Number}==0 )
1897
+ {
1898
+ if ( defined($value) ) { push @errs, "INFO tag [$key] did not expect any parameters, got [$value]"; }
1899
+ next;
1900
+ }
1901
+ elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
1902
+ {
1903
+ push @errs, "INFO tag [$key=$value] expected different number of values ($$type{Number})";
1904
+ }
1905
+ if ( !$$type{handler} ) { next; }
1906
+ for my $val (@vals)
1907
+ {
1908
+ my $err = &{$$type{handler}}($self,$val,$$type{default});
1909
+ if ( $err ) { push @errs, $err; }
1910
+ }
1911
+ }
1912
+ if ( !@errs ) { return undef; }
1913
+ return join(',',@errs);
1914
+ }
1915
+
1916
+ =head2 validate_gtype_field
1917
+
1918
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT});
1919
+ Args : The genotype data hashref
1920
+ The ALT arrayref
1921
+ Returns : Error message in case of an error.
1922
+
1923
+ =cut
1924
+
1925
+ sub validate_gtype_field
1926
+ {
1927
+ my ($self,$data,$alts,$format) = @_;
1928
+
1929
+ # Expected numbers
1930
+ my $ng = -1;
1931
+ my $na = -1;
1932
+ if ( $$self{version}>4.0 )
1933
+ {
1934
+ if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
1935
+ else
1936
+ {
1937
+ $na = @$alts;
1938
+ $ng = (1+$na+1)*($na+1)/2;
1939
+ }
1940
+ }
1941
+
1942
+ my @errs;
1943
+ while (my ($key,$value) = each %$data)
1944
+ {
1945
+ if ( !exists($$self{header}{FORMAT}{$key}) )
1946
+ {
1947
+ push @errs, "FORMAT tag [$key] not listed in the header" unless $$self{version}<3.3;
1948
+ $self->_add_unknown_field('FORMAT',$key,-1);
1949
+ next;
1950
+ }
1951
+ my $type = $$self{header}{FORMAT}{$key};
1952
+
1953
+ my @vals = split(/,/, $value);
1954
+ if ( $$type{Number} eq 'G' )
1955
+ {
1956
+ if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $ng, found ".scalar @vals.")"; }
1957
+ }
1958
+ elsif ( $$type{Number} eq 'A' )
1959
+ {
1960
+ if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $na, found ".scalar @vals.")"; }
1961
+ }
1962
+ elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
1963
+ {
1964
+ push @errs, "FORMAT tag [$key] expected different number of values ($$type{Number})";
1965
+ }
1966
+ if ( !$$type{handler} ) { next; }
1967
+ for my $val (@vals)
1968
+ {
1969
+ my $err = &{$$type{handler}}($self,$val,$$type{default});
1970
+ if ( $err ) { push @errs, $err; }
1971
+ }
1972
+ }
1973
+ if ( !exists($$data{GT}) ) { push @errs, "The mandatory tag GT not present." unless $$self{ignore_missing_GT}; }
1974
+ else
1975
+ {
1976
+ my $buf = $$data{GT};
1977
+ while ($buf ne '')
1978
+ {
1979
+ my $al = $buf;
1980
+ if ( $buf=~$$self{regex_gtsep} )
1981
+ {
1982
+ $al = $`;
1983
+ $buf = $';
1984
+ if ( $buf eq '' ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
1985
+ }
1986
+ else
1987
+ {
1988
+ $buf = '';
1989
+ }
1990
+
1991
+ if ( !defined $al ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
1992
+ if ( $al eq '.' ) { next; }
1993
+ if ( $al eq '0' ) { next; }
1994
+ if ( !($al=~/^[0-9]+$/) ) { push @errs, "Unable to parse the GT field [$$data{GT}], expected integer."; last; }
1995
+ if ( !exists($$alts[$al-1]) ) { push @errs, "Bad ALT value in the GT field, the index [$al] out of bounds [$$data{GT}]."; last; }
1996
+ }
1997
+ }
1998
+ if ( !@errs ) { return undef; }
1999
+ return join(',',@errs);
2000
+ }
2001
+
2002
+
2003
+ sub validate_ref_field
2004
+ {
2005
+ my ($self,$ref) = @_;
2006
+ if ( !($ref=~/^[ACGTN]$/) ) { return "Expected one of A,C,G,T,N, got [$ref]\n"; }
2007
+ return undef;
2008
+ }
2009
+
2010
+ sub validate_int
2011
+ {
2012
+ my ($self,$value,$default) = @_;
2013
+
2014
+ if ( defined($default) && $value eq $default ) { return undef; }
2015
+ if ( $value =~ /^-?\d+$/ ) { return undef; }
2016
+ return "Could not validate the int [$value]";
2017
+ }
2018
+
2019
+ sub validate_float
2020
+ {
2021
+ my ($self,$value,$default) = @_;
2022
+ if ( defined($default) && $value eq $default ) { return undef; }
2023
+ if ( $value =~ /^-?\d+(?:\.\d*)$/ ) { return undef; }
2024
+ if ( $value =~ /^-?\d*(?:\.\d+)$/ ) { return undef; }
2025
+ if ( $value =~ /^-?\d+$/ ) { return undef; }
2026
+ if ( $value =~ /^-?\d*(?:\.?\d+)(?:[Ee][-+]?\d+)?$/ ) { return undef; }
2027
+ return "Could not validate the float [$value]";
2028
+ }
2029
+
2030
+ sub validate_char
2031
+ {
2032
+ my ($self,$value,$default) = @_;
2033
+
2034
+ if ( defined($default) && $value eq $default ) { return undef; }
2035
+ if ( length($value)==1) { return undef; }
2036
+ return "Could not validate the char value [$value]";
2037
+ }
2038
+
2039
+
2040
+ =head2 run_validation
2041
+
2042
+ About : Validates the VCF file.
2043
+ Usage : my $vcf = Vcf->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz');
2044
+ Args : File name or file handle.
2045
+
2046
+ =cut
2047
+
2048
+ sub run_validation
2049
+ {
2050
+ my ($self) = @_;
2051
+
2052
+ $self->parse_header();
2053
+ $self->validate_header();
2054
+
2055
+ if ( !exists($$self{header}) ) { $self->warn(qq[The header not present.\n]); }
2056
+ elsif ( !exists($$self{header}{fileformat}) )
2057
+ {
2058
+ $self->warn(qq[The "fileformat" field not present in the header, assuming VCFv$$self{version}\n]);
2059
+ }
2060
+ elsif ( $$self{header_lines}[0]{key} ne 'fileformat' )
2061
+ {
2062
+ $self->warn(qq[The "fileformat" not the first line in the header\n]);
2063
+ }
2064
+ if ( !exists($$self{columns}) ) { $self->warn("No column descriptions found.\n"); }
2065
+
2066
+ my $default_qual = $$self{defaults}{QUAL};
2067
+ my $warn_sorted=1;
2068
+ my $warn_duplicates = exists($$self{warn_duplicates}) ? $$self{warn_duplicates} : 1;
2069
+ my ($prev_chrm,$prev_pos);
2070
+ while (my $x=$self->next_data_hash())
2071
+ {
2072
+ $self->validate_line($x);
2073
+
2074
+ # Is the position numeric?
2075
+ if ( !($$x{POS}=~/^\d+$/) ) { $self->warn("Expected integer for the position at $$x{CHROM}:$$x{POS}\n"); }
2076
+
2077
+ if ( $warn_duplicates )
2078
+ {
2079
+ if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos eq $$x{POS} )
2080
+ {
2081
+ $self->warn("Warning: Duplicate entries, for example $$x{CHROM}:$$x{POS}\n");
2082
+ $warn_duplicates = 0;
2083
+ }
2084
+ }
2085
+
2086
+ # Is the file sorted?
2087
+ if ( $warn_sorted )
2088
+ {
2089
+ if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos > $$x{POS} )
2090
+ {
2091
+ $self->warn("Warning: The file is not sorted, for example $$x{CHROM}:$$x{POS} comes after $prev_chrm:$prev_pos\n");
2092
+ $warn_sorted = 0;
2093
+ }
2094
+ $prev_chrm = $$x{CHROM};
2095
+ $prev_pos = $$x{POS};
2096
+ }
2097
+
2098
+ # The reference base: one of A,C,G,T,N, non-empty.
2099
+ my $err = $self->validate_ref_field($$x{REF});
2100
+ if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
2101
+
2102
+ # The ALT field (alternate non-reference base)
2103
+ $err = $self->validate_alt_field($$x{ALT},$$x{REF});
2104
+ if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
2105
+
2106
+ # The QUAL field
2107
+ my $ret = $self->validate_float($$x{QUAL},$default_qual);
2108
+ if ( $ret ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} .. $ret\n"); }
2109
+ elsif ( $$x{QUAL}=~/^-?\d+$/ && $$x{QUAL}<-1 ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} is negative .. $$x{QUAL}\n"); }
2110
+
2111
+ # The FILTER field
2112
+ $err = $self->validate_filter_field($$x{FILTER});
2113
+ if ( $err ) { $self->warn("FILTER field at $$x{CHROM}:$$x{POS} .. $err\n"); }
2114
+
2115
+ # The INFO field
2116
+ $err = $self->validate_info_field($$x{INFO},$$x{ALT});
2117
+ if ( $err ) { $self->warn("INFO field at $$x{CHROM}:$$x{POS} .. $err\n"); }
2118
+
2119
+ while (my ($gt,$data) = each %{$$x{gtypes}})
2120
+ {
2121
+ $err = $self->validate_gtype_field($data,$$x{ALT},$$x{FORMAT});
2122
+ if ( $err ) { $self->warn("column $gt at $$x{CHROM}:$$x{POS} .. $err\n"); }
2123
+ }
2124
+
2125
+ if ( scalar keys %{$$x{gtypes}} && (exists($$x{INFO}{AN}) || exists($$x{INFO}{AC})) )
2126
+ {
2127
+ my $nalt = scalar @{$$x{ALT}};
2128
+ if ( $nalt==1 && $$x{ALT}[0] eq '.' ) { $nalt=0; }
2129
+ my ($an,$ac) = $self->calc_an_ac($$x{gtypes},$nalt); # Allow alleles in ALT which are absent in samples
2130
+ if ( exists($$x{INFO}{AN}) && $an ne $$x{INFO}{AN} )
2131
+ {
2132
+ $self->warn("$$x{CHROM}:$$x{POS} .. AN is $$x{INFO}{AN}, should be $an\n");
2133
+ }
2134
+ if ( exists($$x{INFO}{AC}) && $ac ne $$x{INFO}{AC} )
2135
+ {
2136
+ $self->warn("$$x{CHROM}:$$x{POS} .. AC is $$x{INFO}{AC}, should be $ac\n");
2137
+ }
2138
+ }
2139
+ }
2140
+ }
2141
+
2142
+
2143
+ =head2 get_chromosomes
2144
+
2145
+ About : Get list of chromosomes from the VCF file. Must be bgzipped and tabix indexed.
2146
+ Usage : my $vcf = Vcf->new(); $vcf->get_chromosomes();
2147
+ Args : none
2148
+
2149
+ =cut
2150
+
2151
+ sub get_chromosomes
2152
+ {
2153
+ my ($self) = @_;
2154
+ if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); }
2155
+ my (@out) = `tabix -l $$self{file}`;
2156
+ if ( $? )
2157
+ {
2158
+ $self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]);
2159
+ }
2160
+ for (my $i=0; $i<@out; $i++) { chomp($out[$i]); }
2161
+ return \@out;
2162
+ }
2163
+
2164
+
2165
+ =head2 get_samples
2166
+
2167
+ About : Get list of samples.
2168
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples();
2169
+ Args : none
2170
+
2171
+ =cut
2172
+
2173
+ sub get_samples
2174
+ {
2175
+ my ($self) = @_;
2176
+ my $n = @{$$self{columns}} - 1;
2177
+ return (@{$$self{columns}}[9..$n]);
2178
+ }
2179
+
2180
+
2181
+ #------------------------------------------------
2182
+ # Version 3.2 specific functions
2183
+
2184
+ package Vcf3_2;
2185
+ use base qw(VcfReader);
2186
+
2187
+ sub new
2188
+ {
2189
+ my ($class,@args) = @_;
2190
+ my $self = $class->SUPER::new(@args);
2191
+ bless $self, ref($class) || $class;
2192
+
2193
+ $$self{_defaults} =
2194
+ {
2195
+ version => '3.2',
2196
+ drop_trailings => 1,
2197
+ filter_passed => 0,
2198
+
2199
+ defaults =>
2200
+ {
2201
+ QUAL => '-1',
2202
+ default => '.',
2203
+ Flag => undef,
2204
+ GT => '.',
2205
+ },
2206
+
2207
+ handlers =>
2208
+ {
2209
+ Integer => \&VcfReader::validate_int,
2210
+ Float => \&VcfReader::validate_float,
2211
+ Character => \&VcfReader::validate_char,
2212
+ String => undef,
2213
+ Flag => undef,
2214
+ },
2215
+
2216
+ regex_snp => qr/^[ACGTN]$/i,
2217
+ regex_ins => qr/^I[ACGTN]+$/,
2218
+ regex_del => qr/^D\d+$/,
2219
+ regex_gtsep => qr{[\\|/]},
2220
+ regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
2221
+ regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)},
2222
+ };
2223
+
2224
+ for my $key (keys %{$$self{_defaults}})
2225
+ {
2226
+ $$self{$key}=$$self{_defaults}{$key};
2227
+ }
2228
+
2229
+
2230
+ return $self;
2231
+ }
2232
+
2233
+
2234
+ #------------------------------------------------
2235
+ # Version 3.3 specific functions
2236
+
2237
+ package Vcf3_3;
2238
+ use base qw(VcfReader);
2239
+
2240
+ sub new
2241
+ {
2242
+ my ($class,@args) = @_;
2243
+ my $self = $class->SUPER::new(@args);
2244
+ bless $self, ref($class) || $class;
2245
+
2246
+ $$self{_defaults} =
2247
+ {
2248
+ version => '3.3',
2249
+ drop_trailings => 0,
2250
+ filter_passed => 0,
2251
+
2252
+ defaults =>
2253
+ {
2254
+ QUAL => '-1',
2255
+ Integer => '-1',
2256
+ Float => '-1',
2257
+ Character => '.',
2258
+ String => '.',
2259
+ Flag => undef,
2260
+ GT => './.',
2261
+ default => '.',
2262
+ },
2263
+
2264
+ handlers =>
2265
+ {
2266
+ Integer => \&VcfReader::validate_int,
2267
+ Float => \&VcfReader::validate_float,
2268
+ Character => \&VcfReader::validate_char,
2269
+ String => undef,
2270
+ Flag => undef,
2271
+ },
2272
+
2273
+ regex_snp => qr/^[ACGTN]$/i,
2274
+ regex_ins => qr/^I[ACGTN]+$/,
2275
+ regex_del => qr/^D\d+$/,
2276
+ regex_gtsep => qr{[\\|/]},
2277
+ regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
2278
+ regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, # . 0/1 0|1 A/A A|A D4/IACGT
2279
+ };
2280
+
2281
+ for my $key (keys %{$$self{_defaults}})
2282
+ {
2283
+ $$self{$key}=$$self{_defaults}{$key};
2284
+ }
2285
+
2286
+ return $self;
2287
+ }
2288
+
2289
+
2290
+ #------------------------------------------------
2291
+ # Version 4.0 specific functions
2292
+
2293
+ =head1 VCFv4.0
2294
+
2295
+ VCFv4.0 specific functions
2296
+
2297
+ =cut
2298
+
2299
+ package Vcf4_0;
2300
+ use base qw(VcfReader);
2301
+
2302
+ sub new
2303
+ {
2304
+ my ($class,@args) = @_;
2305
+ my $self = $class->SUPER::new(@args);
2306
+ bless $self, ref($class) || $class;
2307
+
2308
+ $$self{_defaults} =
2309
+ {
2310
+ version => '4.0',
2311
+ drop_trailings => 1,
2312
+ filter_passed => 'PASS',
2313
+
2314
+ defaults =>
2315
+ {
2316
+ QUAL => '.',
2317
+ Flag => undef,
2318
+ GT => '.',
2319
+ default => '.',
2320
+ },
2321
+ reserved =>
2322
+ {
2323
+ FILTER => { 0=>1 },
2324
+ },
2325
+
2326
+ handlers =>
2327
+ {
2328
+ Integer => \&VcfReader::validate_int,
2329
+ Float => \&VcfReader::validate_float,
2330
+ Character => \&VcfReader::validate_char,
2331
+ String => undef,
2332
+ Flag => undef,
2333
+ },
2334
+
2335
+ regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
2336
+ regex_ins => qr/^[ACGTN]+$/,
2337
+ regex_del => qr/^[ACGTN]+$/,
2338
+ regex_gtsep => qr{[|/]}, # | /
2339
+ regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
2340
+ regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
2341
+ };
2342
+
2343
+ for my $key (keys %{$$self{_defaults}})
2344
+ {
2345
+ $$self{$key}=$$self{_defaults}{$key};
2346
+ }
2347
+
2348
+ return $self;
2349
+ }
2350
+
2351
+ sub Vcf4_0::format_header_line
2352
+ {
2353
+ my ($self,$rec) = @_;
2354
+
2355
+ my %tmp_rec = ( %$rec );
2356
+ if ( exists($tmp_rec{Number}) && $tmp_rec{Number} eq '-1' ) { $tmp_rec{Number} = '.' }
2357
+ my $value;
2358
+ if ( exists($tmp_rec{ID}) or $tmp_rec{key} eq 'PEDIGREE' )
2359
+ {
2360
+ my %has = ( key=>1, handler=>1, default=>1 ); # Internal keys not to be output
2361
+ my @items;
2362
+ for my $key (qw(ID Number Type Description), sort keys %tmp_rec)
2363
+ {
2364
+ if ( !exists($tmp_rec{$key}) or $has{$key} ) { next; }
2365
+ my $quote = ($key eq 'Description' or $tmp_rec{$key}=~/\s/) ? '"' : '';
2366
+ push @items, "$key=$quote$tmp_rec{$key}$quote";
2367
+ $has{$key}=1;
2368
+ }
2369
+ $value = '<' .join(',',@items). '>';
2370
+ }
2371
+ else { $value = $tmp_rec{value}; }
2372
+
2373
+ my $line = "##$tmp_rec{key}=".$value."\n";
2374
+ return $line;
2375
+ }
2376
+
2377
+ =head2 parse_header_line
2378
+
2379
+ Usage : $vcf->parse_header_line(q[##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">])
2380
+ $vcf->parse_header_line(q[reference=1000GenomesPilot-NCBI36])
2381
+ Args :
2382
+ Returns :
2383
+
2384
+ =cut
2385
+
2386
+ sub Vcf4_0::parse_header_line
2387
+ {
2388
+ my ($self,$line) = @_;
2389
+
2390
+ chomp($line);
2391
+ $line =~ s/^##//;
2392
+
2393
+ if ( !($line=~/^([^=]+)=/) ) { $self->throw("Expected key=value pair in the header: $line\n"); }
2394
+ my $key = $1;
2395
+ my $value = $';
2396
+
2397
+ if ( !($value=~/^<(.+)>\s*$/) )
2398
+ {
2399
+ # Simple sanity check for subtle typos
2400
+ if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' or $key eq 'ALT' )
2401
+ {
2402
+ $self->throw("Hmm, is this a typo? [$key] [$value]");
2403
+ }
2404
+ return { key=>$key, value=>$value };
2405
+ }
2406
+
2407
+ my $rec = { key=>$key };
2408
+ my $tmp = $1;
2409
+ my ($attr_key,$attr_value,$quoted);
2410
+ while ($tmp ne '')
2411
+ {
2412
+ if ( !defined $attr_key )
2413
+ {
2414
+ if ( $tmp=~/^([^=]+)="/ ) { $attr_key=$1; $quoted=1; $tmp=$'; next; }
2415
+ elsif ( $tmp=~/^([^=]+)=/ ) { $attr_key=$1; $quoted=0; $tmp=$'; next; }
2416
+ else { $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); }
2417
+ }
2418
+
2419
+ if ( $tmp=~/^[^,\\"]+/ ) { $attr_value .= $&; $tmp = $'; }
2420
+ if ( $tmp=~/^\\\\/ ) { $attr_value .= '\\\\'; $tmp = $'; next; }
2421
+ if ( $tmp=~/^\\"/ ) { $attr_value .= '\\"'; $tmp = $'; next; }
2422
+ if ( $tmp eq '' or ($tmp=~/^,/ && !$quoted) or $tmp=~/^"/ )
2423
+ {
2424
+ if ( $attr_key=~/^\s+/ or $attr_key=~/\s+$/ or $attr_value=~/^\s+/ or $attr_value=~/\s+$/ )
2425
+ {
2426
+ $self->warn("Leading or trailing space in attr_key-attr_value pairs is discouraged:\n\t[$attr_key] [$attr_value]\n\t$line\n");
2427
+ $attr_key =~ s/^\s+//;
2428
+ $attr_key =~ s/\s+$//;
2429
+ $attr_value =~ s/^\s+//;
2430
+ $attr_value =~ s/\s+$//;
2431
+ }
2432
+ $$rec{$attr_key} = $attr_value;
2433
+ $tmp = $';
2434
+ if ( $quoted && $tmp=~/^,/ ) { $tmp = $'; }
2435
+ $attr_key = $attr_value = $quoted = undef;
2436
+ next;
2437
+ }
2438
+ if ( $tmp=~/^,/ ) { $attr_value .= $&; $tmp = $'; next; }
2439
+ $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]);
2440
+ }
2441
+
2442
+ if ( $key ne 'PEDIGREE' && !exists($$rec{ID}) ) { $self->throw("Missing the ID tag in $line\n"); }
2443
+ if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' )
2444
+ {
2445
+ if ( !exists($$rec{Description}) ) { $self->throw("Missing the Description tag in $line\n"); }
2446
+ }
2447
+ if ( exists($$rec{Number}) && $$rec{Number} eq '-1' ) { $self->warn("The use of -1 for unknown number of values is deprecated, please use '.' instead.\n\t$line\n"); }
2448
+ if ( exists($$rec{Number}) && $$rec{Number} eq '.' ) { $$rec{Number}=-1; }
2449
+
2450
+ return $rec;
2451
+ }
2452
+
2453
+ sub Vcf4_0::validate_ref_field
2454
+ {
2455
+ my ($self,$ref) = @_;
2456
+ if ( !($ref=~/^[ACGTN]+$/) )
2457
+ {
2458
+ my $offending = $ref;
2459
+ $offending =~ s/[ACGTN]+//g;
2460
+ return "Expected combination of A,C,G,T,N for REF, got [$ref], the offending chars were [$offending]\n";
2461
+ }
2462
+ return undef;
2463
+ }
2464
+
2465
+ sub Vcf4_0::validate_alt_field
2466
+ {
2467
+ my ($self,$values,$ref) = @_;
2468
+
2469
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
2470
+
2471
+ my $ret = $self->_validate_alt_field($values,$ref);
2472
+ if ( $ret ) { return $ret; }
2473
+
2474
+ my $ref_len = length($ref);
2475
+ my $ref1 = substr($ref,0,1);
2476
+
2477
+ my @err;
2478
+ my $msg = '';
2479
+ for my $item (@$values)
2480
+ {
2481
+ if ( !($item=~/^[ACTGN]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
2482
+ if ( $item=~/^<[^<>\s]+>$/ ) { next; }
2483
+ if ( $ref_len==length($item) ) { next; }
2484
+ if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference.'; push @err,$item; next; }
2485
+ }
2486
+ if ( !@err ) { return undef; }
2487
+ return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
2488
+ }
2489
+
2490
+
2491
+ =head2 fill_ref_alt_mapping
2492
+
2493
+ About : A tool for merging VCFv4.0 records. The subroutine unifies the REFs and creates a mapping
2494
+ from the original haplotypes to the haplotypes based on the new REF. Consider the following
2495
+ example:
2496
+ REF ALT
2497
+ G GA
2498
+ GT G
2499
+ GT GA
2500
+ GT GAA
2501
+ GTC G
2502
+ G <DEL>
2503
+ my $map={G=>{GA=>1},GT=>{G=>1,GA=>1,GAA=>1},GTC=>{G=>1},G=>{'<DEL>'=>1}};
2504
+ my $new_ref=$vcf->fill_ref_alt_mapping($map);
2505
+
2506
+ The call returns GTC and $map is now
2507
+ G GA -> GTC GATC
2508
+ GT G -> GTC GC
2509
+ GT GA -> GTC GAC
2510
+ GT GAA -> GTC GAAC
2511
+ GTC G -> GTC G
2512
+ G <DEL> -> GTC <DEL>
2513
+ Args :
2514
+ Returns : New REF string and fills the hash with appropriate ALT.
2515
+
2516
+ =cut
2517
+
2518
+ sub Vcf4_0::fill_ref_alt_mapping
2519
+ {
2520
+ my ($self,$map) = @_;
2521
+
2522
+ my $max_len = 0;
2523
+ my $new_ref;
2524
+ for my $ref (keys %$map)
2525
+ {
2526
+ my $len = length($ref);
2527
+ if ( $max_len<$len )
2528
+ {
2529
+ $max_len = $len;
2530
+ $new_ref = $ref;
2531
+ }
2532
+ $$map{$ref}{$ref} = 1;
2533
+ }
2534
+ for my $ref (keys %$map)
2535
+ {
2536
+ my $rlen = length($ref);
2537
+ if ( substr($new_ref,0,$rlen) ne $ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
2538
+ for my $alt (keys %{$$map{$ref}})
2539
+ {
2540
+ # The second part of the regex is for VCF>4.0, but does no harm for v<=4.0
2541
+ if ( $alt=~/^<.+>$/ or $alt=~/\[|\]/ ) { $$map{$ref}{$alt} = $alt; next; }
2542
+ my $new = $alt;
2543
+ if ( $rlen<$max_len ) { $new .= substr($new_ref,$rlen); }
2544
+ $$map{$ref}{$alt} = $new;
2545
+ }
2546
+ }
2547
+ return $new_ref;
2548
+ }
2549
+
2550
+
2551
+
2552
+ sub Vcf4_0::event_type
2553
+ {
2554
+ my ($self,$rec,$allele) = @_;
2555
+
2556
+ my $ref = $rec;
2557
+ if ( ref($rec) eq 'HASH' )
2558
+ {
2559
+ if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
2560
+ $ref = $$rec{REF};
2561
+ }
2562
+
2563
+ if ( $allele=~/^<[^>]+>$/ )
2564
+ {
2565
+ if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['u',0,$allele]; }
2566
+ return ('u',0,$allele);
2567
+ }
2568
+ if ( $allele eq '.' )
2569
+ {
2570
+ if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['r',0,$ref]; }
2571
+ return ('r',0,$ref);
2572
+ }
2573
+
2574
+ my $reflen = length($ref);
2575
+ my $len = length($allele);
2576
+
2577
+ my $ht;
2578
+ my $type;
2579
+ if ( $len==$reflen )
2580
+ {
2581
+ # This can be a reference, a SNP, or multiple SNPs
2582
+ my $mism = 0;
2583
+ for (my $i=0; $i<$len; $i++)
2584
+ {
2585
+ if ( substr($ref,$i,1) ne substr($allele,$i,1) ) { $mism++; }
2586
+ }
2587
+ if ( $mism==0 ) { $type='r'; $len=0; }
2588
+ else { $type='s'; $len=$mism; }
2589
+ }
2590
+ else
2591
+ {
2592
+ ($len,$ht)=is_indel($ref,$allele);
2593
+ if ( $len )
2594
+ {
2595
+ # Indel
2596
+ $type = 'i';
2597
+ $allele = $ht;
2598
+ }
2599
+ else
2600
+ {
2601
+ $type = 'o'; $len = $len>$reflen ? $len-1 : $reflen-1;
2602
+ }
2603
+ }
2604
+
2605
+ if ( ref($rec) eq 'HASH' )
2606
+ {
2607
+ $$rec{_cached_events}{$allele} = [$type,$len,$allele];
2608
+ }
2609
+ return ($type,$len,$allele);
2610
+ }
2611
+
2612
+ # The sequences start at the same position, which simplifies things greatly.
2613
+ sub is_indel
2614
+ {
2615
+ my ($seq1,$seq2) = @_;
2616
+
2617
+ my $len1 = length($seq1);
2618
+ my $len2 = length($seq2);
2619
+ if ( $len1 eq $len2 ) { return (0,''); }
2620
+
2621
+ my ($del,$len,$LEN);
2622
+ if ( $len1<$len2 )
2623
+ {
2624
+ $len = $len1;
2625
+ $LEN = $len2;
2626
+ $del = 1;
2627
+ }
2628
+ else
2629
+ {
2630
+ $len = $len2;
2631
+ $LEN = $len1;
2632
+ $del = -1;
2633
+ my $tmp=$seq1; $seq1=$seq2; $seq2=$tmp;
2634
+ }
2635
+
2636
+ my $ileft;
2637
+ for ($ileft=0; $ileft<$len; $ileft++)
2638
+ {
2639
+ if ( substr($seq1,$ileft,1) ne substr($seq2,$ileft,1) ) { last; }
2640
+ }
2641
+ if ( $ileft==$len )
2642
+ {
2643
+ return ($del*($LEN-$len), substr($seq2,$ileft));
2644
+ }
2645
+
2646
+ my $iright;
2647
+ for ($iright=0; $iright<$len; $iright++)
2648
+ {
2649
+ if ( substr($seq1,$len-$iright,1) ne substr($seq2,$LEN-$iright,1) ) { last; }
2650
+ }
2651
+ if ( $iright+$ileft<=$len ) { return (0,''); }
2652
+
2653
+ return ($del*($LEN-$len),substr($seq2,$ileft,$LEN-$len));
2654
+ }
2655
+
2656
+
2657
+ #------------------------------------------------
2658
+ # Version 4.1 specific functions
2659
+
2660
+ =head1 VCFv4.1
2661
+
2662
+ VCFv4.1 specific functions
2663
+
2664
+ =cut
2665
+
2666
+ package Vcf4_1;
2667
+ use base qw(Vcf4_0);
2668
+
2669
+ sub new
2670
+ {
2671
+ my ($class,@args) = @_;
2672
+ my $self = $class->SUPER::new(@args);
2673
+ bless $self, ref($class) || $class;
2674
+
2675
+ $$self{_defaults} =
2676
+ {
2677
+ version => '4.1',
2678
+ drop_trailings => 1,
2679
+ filter_passed => 'PASS',
2680
+
2681
+ defaults =>
2682
+ {
2683
+ QUAL => '.',
2684
+ Flag => undef,
2685
+ GT => '.',
2686
+ default => '.',
2687
+ },
2688
+ reserved =>
2689
+ {
2690
+ FILTER => { 0=>1 },
2691
+ },
2692
+
2693
+ handlers =>
2694
+ {
2695
+ Integer => \&VcfReader::validate_int,
2696
+ Float => \&VcfReader::validate_float,
2697
+ Character => \&VcfReader::validate_char,
2698
+ String => undef,
2699
+ Flag => undef,
2700
+ },
2701
+
2702
+ regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
2703
+ regex_ins => qr/^[ACGTN]+$/i,
2704
+ regex_del => qr/^[ACGTN]+$/i,
2705
+ regex_gtsep => qr{[|/]}, # | /
2706
+ regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
2707
+ regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
2708
+ };
2709
+
2710
+ $$self{ignore_missing_GT} = 1;
2711
+
2712
+ for my $key (keys %{$$self{_defaults}})
2713
+ {
2714
+ $$self{$key}=$$self{_defaults}{$key};
2715
+ }
2716
+
2717
+ return $self;
2718
+ }
2719
+
2720
+ sub Vcf4_1::validate_header
2721
+ {
2722
+ my ($self) = @_;
2723
+ my $lines = $self->get_header_line(key=>'reference');
2724
+ if ( !@$lines ) { $self->warn("The header tag 'reference' not present. (Not required but highly recommended.)\n"); }
2725
+ }
2726
+
2727
+ sub Vcf4_1::validate_line
2728
+ {
2729
+ my ($self,$line) = @_;
2730
+
2731
+ if ( !$$self{_contig_validated}{$$line{CHROM}} )
2732
+ {
2733
+ my $lines = $self->get_header_line(key=>'contig',ID=>$$line{CHROM});
2734
+ if ( !@$lines ) { $self->warn("The header tag 'contig' not present for CHROM=$$line{CHROM}. (Not required but highly recommended.)\n"); }
2735
+ $$self{_contig_validated}{$$line{CHROM}} = 1;
2736
+ }
2737
+
2738
+ # Is the ID composed of alphanumeric chars
2739
+ if ( !($$line{ID}=~/^\S+$/) ) { $self->warn("Expected non-whitespace ID at $$line{CHROM}:$$line{POS}, but got [$$line{ID}]\n"); }
2740
+ }
2741
+
2742
+ sub Vcf4_1::validate_alt_field
2743
+ {
2744
+ my ($self,$values,$ref) = @_;
2745
+
2746
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
2747
+
2748
+ my $ret = $self->_validate_alt_field($values,$ref);
2749
+ if ( $ret ) { return $ret; }
2750
+
2751
+ my $ref_len = length($ref);
2752
+ my $ref1 = substr($ref,0,1);
2753
+
2754
+ my @err;
2755
+ my $msg = '';
2756
+ for my $item (@$values)
2757
+ {
2758
+ if ( $item=~/^(.*)\[(.+)\[(.*)$/ or $item=~/^(.*)\](.+)\](.*)$/ )
2759
+ {
2760
+ if ( $1 ne '' && $3 ne '' ) { $msg=', two replacement strings given (expected one)'; push @err,$item; next; }
2761
+ my $rpl;
2762
+ if ( $1 ne '' )
2763
+ {
2764
+ $rpl = $1;
2765
+ if ( $rpl ne '.' )
2766
+ {
2767
+ my $rref = substr($rpl,0,1);
2768
+ if ( $rref ne $ref1 ) { $msg=', the first base of the replacement string does not match the reference'; push @err,$item; next; }
2769
+ }
2770
+ }
2771
+ else
2772
+ {
2773
+ $rpl = $3;
2774
+ if ( $rpl ne '.' )
2775
+ {
2776
+ my $rref = substr($rpl,-1,1);
2777
+ if ( $rref ne $ref1 ) { $msg=', the last base of the replacement string does not match the reference'; push @err,$item; next; }
2778
+ }
2779
+ }
2780
+ my $pos = $2;
2781
+ if ( !($rpl=~/^[ACTGNacgtn]+$/) && $rpl ne '.' ) { $msg=', replacement string not valid (expected [ACTGNacgtn]+)'; push @err,$item; next; }
2782
+ if ( !($pos=~/^\S+:\d+$/) ) { $msg=', cannot parse sequence:position'; push @err,$item; next; }
2783
+ next;
2784
+ }
2785
+ if ( $item=~/^\.([ACTGNactgn])[ACTGNactgn]*$/ )
2786
+ {
2787
+ if ( $ref1 ne $1 ) { $msg=', first base does not match the reference'; push @err,$item; }
2788
+ next;
2789
+ }
2790
+ elsif ( $item=~/^[ACTGNactgn]*([ACTGNactgn])\.$/ )
2791
+ {
2792
+ if ( substr($ref,-1,1) ne $1 ) { $msg=', last base does not match the reference'; push @err,$item; }
2793
+ next;
2794
+ }
2795
+ if ( !($item=~/^[ACTGNactgn]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
2796
+ if ( $item=~/^<[^<>\s]+>$/ ) { next; }
2797
+ if ( $ref_len==length($item) ) { next; }
2798
+ if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference'; push @err,$item; next; }
2799
+ }
2800
+ if ( !@err ) { return undef; }
2801
+ return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
2802
+ }
2803
+
2804
+ sub Vcf4_1::next_data_hash
2805
+ {
2806
+ my ($self,@args) = @_;
2807
+
2808
+ my $out = $self->SUPER::next_data_hash(@args);
2809
+ if ( !defined $out ) { return $out; }
2810
+
2811
+ # Case-insensitive ALT and REF bases
2812
+ $$out{REF} = uc($$out{REF});
2813
+ my $nalt = @{$$out{ALT}};
2814
+ for (my $i=0; $i<$nalt; $i++)
2815
+ {
2816
+ if ( $$out{ALT}[$i]=~/^</ ) { next; }
2817
+ $$out{ALT}[$i] = uc($$out{ALT}[$i]);
2818
+ }
2819
+
2820
+ return $out;
2821
+ }
2822
+
2823
+ sub Vcf4_1::next_data_array
2824
+ {
2825
+ my ($self,@args) = @_;
2826
+
2827
+ my $out = $self->SUPER::next_data_array(@args);
2828
+ if ( !defined $out or $$self{assume_uppercase} ) { return $out; }
2829
+
2830
+ # Case-insensitive ALT and REF bases
2831
+ $$out[3] = uc($$out[3]);
2832
+ my $alt = $$out[4];
2833
+ $$out[4] = '';
2834
+ while ($alt=~/[^<>]+/)
2835
+ {
2836
+ $$out[4] .= $`;
2837
+ $$out[4] .= (length($`) && substr($`,-1,1) eq '<' ) ? $& : uc($&);
2838
+ $alt = $';
2839
+ if ( $alt=~/^<[^<>]+>/ ) { $$out[4] .= $&; $alt=$'; }
2840
+ }
2841
+
2842
+ return $out;
2843
+ }
2844
+
2845
+ sub Vcf4_1::event_type
2846
+ {
2847
+ my ($self,$rec,$allele) = @_;
2848
+ if ( $allele=~/\[|\]|^\..+|.+\.$/ ) { return 'b'; }
2849
+ return $self->SUPER::event_type($rec,$allele);
2850
+ }
2851
+
2852
+ 1;
2853
+