ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,2853 @@
1
+ package Vcf;
2
+
3
+ # http://vcftools.sourceforge.net/specs.html
4
+ # http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
5
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:variant_call_format
6
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf4.0
7
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf_4.0_sv
8
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
9
+ # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2
10
+ #
11
+ # Authors: petr.danecek@sanger
12
+ # for VCF v3.2, v3.3, v4.0, v4.1
13
+ #
14
+
15
+ =head1 NAME
16
+
17
+ Vcf.pm. Module for validation, parsing and creating VCF files.
18
+ Supported versions: 3.2, 3.3, 4.0, 4.1
19
+
20
+ =head1 SYNOPSIS
21
+
22
+ From the command line:
23
+ perl -MVcf -e validate example.vcf
24
+ perl -I/path/to/the/module/ -MVcf -e validate_v32 example.vcf
25
+
26
+ From a script:
27
+ use Vcf;
28
+
29
+ my $vcf = Vcf->new(file=>'example.vcf.gz',region=>'1:1000-2000');
30
+ $vcf->parse_header();
31
+
32
+ # Do some simple parsing. Most thorough but slowest way how to get the data.
33
+ while (my $x=$vcf->next_data_hash())
34
+ {
35
+ for my $gt (keys %{$$x{gtypes}})
36
+ {
37
+ my ($al1,$sep,$al2) = $vcf->parse_alleles($x,$gt);
38
+ print "\t$gt: $al1$sep$al2\n";
39
+ }
40
+ print "\n";
41
+ }
42
+
43
+ # This will split the fields and print a list of CHR:POS
44
+ while (my $x=$vcf->next_data_array())
45
+ {
46
+ print "$$x[0]:$$x[1]\n";
47
+ }
48
+
49
+ # This will return the lines as they were read, including the newline at the end
50
+ while (my $x=$vcf->next_line())
51
+ {
52
+ print $x;
53
+ }
54
+
55
+ # Only the columns NA00001, NA00002 and NA00003 will be printed.
56
+ my @columns = qw(NA00001 NA00002 NA00003);
57
+ print $vcf->format_header(\@columns);
58
+ while (my $x=$vcf->next_data_array())
59
+ {
60
+ # this will recalculate AC and AN counts, unless $vcf->recalc_ac_an was set to 0
61
+ print $vcf->format_line($x,\@columns);
62
+ }
63
+
64
+ $vcf->close();
65
+
66
+ =cut
67
+
68
+
69
+ use strict;
70
+ use warnings;
71
+ use Carp;
72
+ use Exporter;
73
+ use Data::Dumper;
74
+ use POSIX ":sys_wait_h";
75
+
76
+ use vars qw/@ISA @EXPORT/;
77
+ @ISA = qw/Exporter/;
78
+ @EXPORT = qw/validate validate_v32/;
79
+
80
+ =head2 validate
81
+
82
+ About : Validates the VCF file.
83
+ Usage : perl -MVcf -e validate example.vcf.gz # (from the command line)
84
+ validate('example.vcf.gz'); # (from a script)
85
+ validate(\*STDIN);
86
+ Args : File name or file handle. When no argument given, the first command line
87
+ argument is interpreted as the file name.
88
+
89
+ =cut
90
+
91
+ sub validate
92
+ {
93
+ my ($fh) = @_;
94
+
95
+ if ( !$fh && @ARGV ) { $fh = $ARGV[0]; }
96
+
97
+ my $vcf;
98
+ if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(fh=>$fh) : Vcf->new(file=>$fh); }
99
+ else { $vcf = Vcf->new(fh=>\*STDIN); }
100
+
101
+ $vcf->run_validation();
102
+ }
103
+
104
+
105
+ =head2 validate_v32
106
+
107
+ About : Same as validate, but assumes v3.2 VCF version.
108
+ Usage : perl -MVcf -e validate_v32 example.vcf.gz # (from the command line)
109
+ Args : File name or file handle. When no argument given, the first command line
110
+ argument is interpreted as the file name.
111
+
112
+ =cut
113
+
114
+ sub validate_v32
115
+ {
116
+ my ($fh) = @_;
117
+
118
+ if ( !$fh && @ARGV && -e $ARGV[0] ) { $fh = $ARGV[0]; }
119
+
120
+ my %params = ( version=>'3.2' );
121
+
122
+ my $vcf;
123
+ if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(%params, fh=>$fh) : Vcf->new(%params, file=>$fh); }
124
+ else { $vcf = Vcf->new(%params, fh=>\*STDIN); }
125
+
126
+ $vcf->run_validation();
127
+ }
128
+
129
+
130
+ =head2 new
131
+
132
+ About : Creates new VCF reader/writer.
133
+ Usage : my $vcf = Vcf->new(file=>'my.vcf', version=>'3.2');
134
+ Args :
135
+ fh .. Open file handle. If neither file nor fh is given, open in write mode.
136
+ file .. The file name. If neither file nor fh is given, open in write mode.
137
+ region .. Optional region to parse (requires tabix indexed VCF file)
138
+ silent .. Unless set to 0, warning messages may be printed.
139
+ strict .. Unless set to 0, the reader will die when the file violates the specification.
140
+ version .. If not given, '4.0' is assumed. The header information overrides this setting.
141
+
142
+ =cut
143
+
144
+ sub new
145
+ {
146
+ my ($class,@args) = @_;
147
+ my $self = {@args};
148
+ bless $self, ref($class) || $class;
149
+
150
+ $$self{silent} = 0 unless exists($$self{silent});
151
+ $$self{strict} = 0 unless exists($$self{strict});
152
+ $$self{buffer} = []; # buffer stores the lines in the reverse order
153
+ $$self{columns} = undef; # column names
154
+ $$self{mandatory} = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] unless exists($$self{mandatory});
155
+ $$self{reserved}{cols} = {CHROM=>1,POS=>1,ID=>1,REF=>1,ALT=>1,QUAL=>1,FILTER=>1,INFO=>1,FORMAT=>1} unless exists($$self{reserved_cols});
156
+ $$self{recalc_ac_an} = 1;
157
+ $$self{has_header} = 0;
158
+ $$self{default_version} = '4.1';
159
+ $$self{versions} = [ qw(Vcf3_2 Vcf3_3 Vcf4_0 Vcf4_1) ];
160
+ if ( !exists($$self{max_line_len}) && exists($ENV{MAX_VCF_LINE_LEN}) ) { $$self{max_line_len} = $ENV{MAX_VCF_LINE_LEN} }
161
+ $$self{fix_v40_AGtags} = $ENV{DONT_FIX_VCF40_AG_TAGS} ? 0 : 1;
162
+ my %open_args = ();
163
+ if ( exists($$self{region}) ) { $open_args{region}=$$self{region}; }
164
+ if ( exists($$self{print_header}) ) { $open_args{print_header}=$$self{print_header}; }
165
+ return $self->_open(%open_args);
166
+ }
167
+
168
+ sub throw
169
+ {
170
+ my ($self,@msg) = @_;
171
+ confess @msg,"\n";
172
+ }
173
+
174
+ sub warn
175
+ {
176
+ my ($self,@msg) = @_;
177
+ if ( $$self{silent} ) { return; }
178
+ if ( $$self{strict} ) { $self->throw(@msg); }
179
+ warn @msg;
180
+ }
181
+
182
+ sub _open
183
+ {
184
+ my ($self,%args) = @_;
185
+
186
+ if ( !exists($$self{fh}) && !exists($$self{file}) )
187
+ {
188
+ # Write mode, the version must be supplied by the user
189
+ return $self->_set_version(exists($$self{version}) ? $$self{version} : $$self{default_version});
190
+ }
191
+
192
+ # Open the file unless filehandle is provided
193
+ if ( !exists($$self{fh}) )
194
+ {
195
+ my $cmd = "<$$self{file}";
196
+
197
+ my $tabix_args = '';
198
+ if ( exists($args{print_header}) && $args{print_header} ) { $tabix_args .= ' -h '; }
199
+ $tabix_args .= $$self{file};
200
+ if ( exists($args{region}) && defined($args{region}) ) { $tabix_args .= ' '.$args{region}; }
201
+
202
+ if ( -e $$self{file} && $$self{file}=~/\.gz/i )
203
+ {
204
+ if ( exists($args{region}) && defined($args{region}) )
205
+ {
206
+ $cmd = "tabix $tabix_args |";
207
+ }
208
+ else { $cmd = "gunzip -c $$self{file} |"; }
209
+ $$self{check_exit_status} = 1;
210
+ }
211
+ elsif ( $$self{file}=~m{^(?:http|ftp)://} )
212
+ {
213
+ if ( !exists($args{region}) ) { $tabix_args .= ' .'; }
214
+ $cmd = "tabix $tabix_args |";
215
+ $$self{check_exit_status} = 1;
216
+ }
217
+ open($$self{fh},$cmd) or $self->throw("$cmd: $!");
218
+ }
219
+
220
+ # Set the correct VCF version, but only when called for the first time
221
+ my $vcf = $self;
222
+ if ( !$$self{_version_set} )
223
+ {
224
+ my $first_line = $self->next_line();
225
+ $vcf = $self->_set_version($first_line);
226
+ $self->_unread_line($first_line);
227
+ }
228
+ return $vcf;
229
+ }
230
+
231
+
232
+
233
+ =head2 open
234
+
235
+ About : (Re)Open file. No need to call this explicitly unless reading from a different
236
+ region is requested.
237
+ Usage : $vcf->open(); # Read from the start
238
+ $vcf->open(region=>'1:12345-92345');
239
+ Args : region .. Supported only for tabix indexed files
240
+
241
+ =cut
242
+
243
+ sub open
244
+ {
245
+ my ($self,%args) = @_;
246
+ $self->close();
247
+ $self->_open(%args);
248
+ }
249
+
250
+
251
+ =head2 close
252
+
253
+ About : Close the filehandle
254
+ Usage : $vcf->close();
255
+ Args : none
256
+
257
+ =cut
258
+
259
+ sub close
260
+ {
261
+ my ($self) = @_;
262
+ if ( !$$self{fh} ) { return; }
263
+ close($$self{fh});
264
+ delete($$self{fh});
265
+ }
266
+
267
+
268
+ =head2 next_line
269
+
270
+ About : Reads next VCF line.
271
+ Usage : my $vcf = Vcf->new();
272
+ my $x = $vcf->next_line();
273
+ Args : none
274
+
275
+ =cut
276
+
277
+ sub next_line
278
+ {
279
+ my ($self) = @_;
280
+ if ( @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
281
+
282
+ my $line;
283
+ if ( !exists($$self{max_line_len}) )
284
+ {
285
+ $line = readline($$self{fh});
286
+ }
287
+ else
288
+ {
289
+ while (1)
290
+ {
291
+ $line = readline($$self{fh});
292
+ if ( !defined $line ) { last; }
293
+
294
+ my $len = length($line);
295
+ if ( $len>$$self{max_line_len} && !($line=~/^#/) )
296
+ {
297
+ if ( !($line=~/^([^\t]+)\t([^\t]+)/) ) { $self->throw("Could not parse the line: $line"); }
298
+ $self->warn("The VCF line too long, ignoring: $1 $2 .. len=$len\n");
299
+ next;
300
+ }
301
+ last;
302
+ }
303
+ }
304
+ if ( !defined $line && $$self{check_exit_status} )
305
+ {
306
+ my $pid = waitpid(-1, WNOHANG);
307
+ if ( $pid!=0 && $pid!=-1 && $? !=0 )
308
+ {
309
+ $self->throw("Error reading VCF file.\n");
310
+ }
311
+ }
312
+ return $line;
313
+ }
314
+
315
+ sub _unread_line
316
+ {
317
+ my ($self,$line) = @_;
318
+ unshift @{$$self{buffer}}, $line;
319
+ return;
320
+ }
321
+
322
+
323
+ =head2 next_data_array
324
+
325
+ About : Reads next VCF line and splits it into an array. The last element is chomped.
326
+ Usage : my $vcf = Vcf->new();
327
+ $vcf->parse_header();
328
+ my $x = $vcf->next_data_array();
329
+ Args : Optional line to parse
330
+
331
+ =cut
332
+
333
+ sub next_data_array
334
+ {
335
+ my ($self,$line) = @_;
336
+ if ( !$line ) { $line = $self->next_line(); }
337
+ if ( !$line ) { return undef; }
338
+ my @items = split(/\t/,$line);
339
+ chomp($items[-1]);
340
+ return \@items;
341
+ }
342
+
343
+
344
+ =head2 set_samples
345
+
346
+ About : Parsing big VCF files with many sample columns is slow, not parsing unwanted samples may speed things a bit.
347
+ Usage : my $vcf = Vcf->new();
348
+ $vcf->set_samples(include=>['NA0001']); # Exclude all but this sample. When the array is empty, all samples will be excluded.
349
+ $vcf->set_samples(exclude=>['NA0003']); # Include only this sample. When the array is empty, all samples will be included.
350
+ my $x = $vcf->next_data_hash();
351
+ Args : Optional line to parse
352
+
353
+ =cut
354
+
355
+ sub set_samples
356
+ {
357
+ my ($self,%args) = @_;
358
+
359
+ if ( exists($args{include}) )
360
+ {
361
+ for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 0; }
362
+ for my $sample (@{$args{include}})
363
+ {
364
+ if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
365
+ my $idx = $$self{has_column}{$sample} - 1;
366
+ $$self{samples_to_parse}[$idx] = 1;
367
+ }
368
+ }
369
+
370
+ if ( exists($args{exclude}) )
371
+ {
372
+ for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 1; }
373
+ for my $sample (@{$args{exclude}})
374
+ {
375
+ if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
376
+ my $idx = $$self{has_column}{$sample} - 1;
377
+ $$self{samples_to_parse}[$idx] = 0;
378
+ }
379
+ }
380
+ }
381
+
382
+
383
+ sub _set_version
384
+ {
385
+ my ($self,$version_line) = @_;
386
+
387
+ if ( $$self{_version_set} ) { return $self; }
388
+ $$self{_version_set} = 1;
389
+
390
+ $$self{version} = $$self{default_version};
391
+ if ( $version_line )
392
+ {
393
+ if ( $version_line=~/^(\d+(?:\.\d+)?)$/ )
394
+ {
395
+ $$self{version} = $1;
396
+ undef $version_line;
397
+ }
398
+ elsif ( !($version_line=~/^##fileformat=/i) or !($version_line=~/(\d+(?:\.\d+)?)\s*$/i) )
399
+ {
400
+ $self->warn("Could not parse the fileformat version string [$version_line], assuming VCFv$$self{default_version}\n");
401
+ undef $version_line;
402
+ }
403
+ else
404
+ {
405
+ $$self{version} = $1;
406
+ }
407
+ }
408
+
409
+ my $reader;
410
+ if ( $$self{version} eq '3.2' ) { $reader=Vcf3_2->new(%$self); }
411
+ elsif ( $$self{version} eq '3.3' ) { $reader=Vcf3_3->new(%$self); }
412
+ elsif ( $$self{version} eq '4.0' ) { $reader=Vcf4_0->new(%$self); }
413
+ elsif ( $$self{version} eq '4.1' ) { $reader=Vcf4_1->new(%$self); }
414
+ else
415
+ {
416
+ $self->warn(qq[The version "$$self{version}" not supported, assuming VCFv$$self{default_version}\n]);
417
+ $$self{version} = '4.1';
418
+ $reader = Vcf4_1->new(%$self);
419
+ }
420
+
421
+ $self = $reader;
422
+ # When changing version, change also the fileformat header line
423
+ if ( exists($$self{header_lines}) && exists($$self{header_lines}[0]{key}) && $$self{header_lines}[0]{key} eq 'fileformat' )
424
+ {
425
+ shift(@{$$self{header_lines}});
426
+ }
427
+
428
+ return $self;
429
+ }
430
+
431
+
432
+ #---------------------------------------
433
+
434
+ package VcfReader;
435
+ use base qw(Vcf);
436
+ use strict;
437
+ use warnings;
438
+ use Carp;
439
+ use Data::Dumper;
440
+
441
+ sub new
442
+ {
443
+ my ($class,@args) = @_;
444
+ my $self = {@args};
445
+ bless $self, ref($class) || $class;
446
+ return $self;
447
+ }
448
+
449
+
450
+ =head2 next_data_hash
451
+
452
+ About : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data.
453
+ Usage : my $vcf = Vcf->new();
454
+ $vcf->parse_header();
455
+ my $x = $vcf->next_data_hash();
456
+
457
+ # Or having a VCF data line $line
458
+ my $x = $vcf->next_data_hash($line);
459
+
460
+ Args : Optional line to parse.
461
+
462
+ =cut
463
+
464
+ sub next_data_hash
465
+ {
466
+ my ($self,$line) = @_;
467
+ if ( !$line ) { $line = $self->next_line(); }
468
+ if ( !$line ) { return undef; }
469
+ my @items;
470
+ if ( ref($line) eq 'ARRAY' ) { @items = @$line; }
471
+ else { @items = split(/\t/,$line); }
472
+ chomp($items[-1]);
473
+
474
+ my $cols = $$self{columns};
475
+ if ( !$cols )
476
+ {
477
+ $self->_fake_column_names(scalar @items - 9);
478
+ $cols = $$self{columns};
479
+ }
480
+
481
+ # Check the number of columns
482
+ if ( scalar @items != scalar @$cols )
483
+ {
484
+ if ( $line=~/^\s*$/ ) { $self->throw("Sorry, empty lines not allowed.\n"); }
485
+ if ( $line=~/^#/ ) { $self->throw("FIXME: parse_header must be called before next_data_hash.\n"); }
486
+
487
+ $self->warn("Different number of columns at $items[0]:$items[1] (expected ".scalar @$cols.", got ".scalar @items.")\n");
488
+ while ( $items[-1] eq '' ) { pop(@items); }
489
+ if ( scalar @items != scalar @$cols )
490
+ {
491
+ my @test = split(/\s+/,$line);
492
+ if ( scalar @test == scalar @$cols ) { $self->warn("(Have spaces been used instead of tabs?)\n\n"); }
493
+ else { $self->throw("Error not recoverable, exiting.\n"); }
494
+
495
+ @items = @test;
496
+ }
497
+ else { $self->warn("(Trailing tabs?)\n\n"); }
498
+ }
499
+ my %out;
500
+
501
+ # Mandatory fields
502
+ $out{CHROM} = $items[0];
503
+ $out{POS} = $items[1];
504
+ $out{ID} = $items[2];
505
+ $out{REF} = $items[3];
506
+ $out{ALT} = [ split(/,/,$items[4]) ];
507
+ $out{QUAL} = $items[5];
508
+ $out{FILTER} = [ split(/;/,$items[6]) ];
509
+
510
+ # INFO, e.g. NS=58;DP=258;AF=0.786;DB;H2
511
+ if ( defined $items[7] )
512
+ {
513
+ my %hash;
514
+ for my $info (split(/;/,$items[7]))
515
+ {
516
+ my ($key,$val) = split(/=/,$info);
517
+ if ( !defined $key )
518
+ {
519
+ $self->warn("Broken VCF file, empty INFO field at $items[0]:$items[1]\n");
520
+ next;
521
+ }
522
+ if ( defined $val )
523
+ {
524
+ $hash{$key} = $val;
525
+ }
526
+ elsif ( exists($$self{header}{INFO}{$key}) )
527
+ {
528
+ $hash{$key} = $$self{header}{INFO}{$key}{default};
529
+ }
530
+ else
531
+ {
532
+ $hash{$key} = undef;
533
+ }
534
+ }
535
+ $out{INFO} = \%hash;
536
+ }
537
+
538
+ # The FORMAT field may not be present. GT:GQ:DP:HQ
539
+ my $format;
540
+ if ( $$cols[8] || $items[8] )
541
+ {
542
+ $format = $out{FORMAT} = [ split(/:/,$items[8]) ];
543
+ if ( (!$$format[0] || $$format[0] ne 'GT') && !$$self{ignore_missing_GT} ) { $self->warn("Expected GT as the first genotype field at $items[0]:$items[1]\n"); }
544
+ }
545
+
546
+ # Genotype fields
547
+ my %gtypes;
548
+ my $check_nformat = $$self{drop_trailings} ? 0 : 1;
549
+ for (my $icol=9; $icol<@items; $icol++)
550
+ {
551
+ if ( $items[$icol] eq '' ) { $self->warn("Empty column $$cols[$icol] at $items[0]:$items[1]\n"); next; }
552
+ if ( exists($$self{samples_to_parse}) && !$$self{samples_to_parse}[$icol] ) { next; }
553
+
554
+ my @fields = split(/:/, $items[$icol]);
555
+ if ( $check_nformat && @fields != @$format )
556
+ {
557
+ $self->warn("Different number of fields in the format and the column $$cols[$icol] at $items[0]:$items[1] ("
558
+ .scalar @fields." vs ".scalar @$format.": [",join(',',@fields),"] vs [",join(',',@$format),"])\n");
559
+ }
560
+ my %hash;
561
+ for (my $ifield=0; $ifield<@fields; $ifield++)
562
+ {
563
+ $hash{$$format[$ifield]} = $fields[$ifield];
564
+ }
565
+ $gtypes{$$cols[$icol]} = \%hash;
566
+ }
567
+ $out{gtypes} = \%gtypes;
568
+
569
+ return \%out;
570
+ }
571
+
572
+
573
+ =head2 parse_header
574
+
575
+ About : Reads (and stores) the VCF header.
576
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header();
577
+ Args : silent .. do not warn about duplicate header lines
578
+
579
+ =cut
580
+
581
+ sub parse_header
582
+ {
583
+ my ($self,%args) = @_;
584
+
585
+ # First come the header lines prefixed by ##
586
+ while ($self->_next_header_line(%args)) { ; }
587
+
588
+ # Now comes the column names line prefixed by #
589
+ $self->_read_column_names();
590
+ }
591
+
592
+
593
+ =head2 _next_header_line
594
+
595
+ About : Stores the header lines and meta information, such as fields types, etc.
596
+ Args : silent .. do not warn about duplicate column names
597
+
598
+ =cut
599
+
600
+ sub _next_header_line
601
+ {
602
+ my ($self,%args) = @_;
603
+ my $line = $self->next_line();
604
+ if ( !defined $line ) { return undef; }
605
+ if ( substr($line,0,2) ne '##' )
606
+ {
607
+ $self->_unread_line($line);
608
+ return undef;
609
+ }
610
+
611
+ my $rec = $self->parse_header_line($line);
612
+ if ( $rec ) { $self->add_header_line($rec,%args); }
613
+
614
+ return $rec;
615
+ }
616
+
617
+ =head2 get_header_line
618
+
619
+ Usage : $vcf->get_header_line(key=>'INFO', ID=>'AC')
620
+ $vcf->get_header_line(key=>'FILTER', ID=>'q10')
621
+ $vcf->get_header_line(key=>'reference')
622
+ $vcf->get_header_line(key=>'contig',ID=>'20')
623
+ Args : Header line filter as in the example above
624
+ Returns : List ref of header line hashes matching the filter
625
+
626
+ =cut
627
+
628
+ sub get_header_line
629
+ {
630
+ my ($self,%filter) = @_;
631
+
632
+ my $key = $filter{key};
633
+ delete($filter{key});
634
+
635
+ my $id = $filter{ID};
636
+
637
+ my @out;
638
+ while (my ($hline_key,$hline_hash) = each %{$$self{header}})
639
+ {
640
+ if ( $key ne $hline_key ) { next; }
641
+
642
+ if ( defined $id )
643
+ {
644
+ if ( !exists($$hline_hash{$id}) ) { next; }
645
+ $hline_hash = $$hline_hash{$id};
646
+ }
647
+
648
+ my $match = 1;
649
+ while (my ($fkey,$fval) = each %filter)
650
+ {
651
+ if ( !exists($$hline_hash{$fkey}) or $$hline_hash{$fkey} ne $fval )
652
+ {
653
+ $match=0;
654
+ last;
655
+ }
656
+ }
657
+ if ( $match ) { push @out,$hline_hash }
658
+ }
659
+ return \@out;
660
+ }
661
+
662
+
663
+ =head2 add_header_line
664
+
665
+ Usage : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
666
+ $vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'})
667
+ Args : Header line hash as in the example above
668
+ Hash with additional parameters [optional]
669
+ silent .. do not warn about existing header keys
670
+ append .. append timestamp to the name of the new one
671
+ Returns :
672
+
673
+ =cut
674
+
675
+ sub add_header_line
676
+ {
677
+ my ($self,$rec,%args) = @_;
678
+
679
+ if ( !%args ) { $args{silent}=0; }
680
+
681
+ my $key = $$rec{key};
682
+ if ( !$key ) { $self->throw("Missing key: ",Dumper($rec)); }
683
+
684
+ if ( exists($$rec{Type}) )
685
+ {
686
+ if ( !exists($$rec{default}) )
687
+ {
688
+ my $type = $$rec{Type};
689
+ if ( exists($$self{defaults}{$type}) ) { $$rec{default}=$$self{defaults}{$type}; }
690
+ else { $$rec{default}=$$self{defaults}{default}; }
691
+ }
692
+ if ( !exists($$rec{handler}) )
693
+ {
694
+ my $type = $$rec{Type};
695
+ if ( !exists($$self{handlers}{$type}) )
696
+ {
697
+ $self->warn("Unknown type [$type]\n");
698
+ $type = 'String';
699
+ $$rec{Type} = $type;
700
+ }
701
+ if ( exists($$self{handlers}{$type}) ) { $$rec{handler}=$$self{handlers}{$type}; }
702
+ else { $self->throw("Unknown type [$type].\n"); }
703
+ }
704
+ }
705
+
706
+ if ( exists($$rec{ID}) )
707
+ {
708
+ my $id = $$rec{ID};
709
+ if ( exists($$self{header}{$key}{$id}) ) { $self->remove_header_line(%$rec); }
710
+ $$self{header}{$key}{$id} = $rec;
711
+ push @{$$self{header_lines}}, $rec;
712
+ return;
713
+ }
714
+
715
+ if ( $args{append} )
716
+ {
717
+ my @tm = gmtime(time);
718
+ $key = sprintf "%s_%d%.2d%.2d", $key,$tm[5]+1900,$tm[4],$tm[3];
719
+ my $i = 1;
720
+ while ( exists($$self{header}{$key.'.'.$i}) ) { $i++; }
721
+ $key = $key.'.'.$i;
722
+ $$rec{key} = $key;
723
+ }
724
+
725
+ if ( $self->_header_line_exists($key,$rec) ) { $self->remove_header_line(%$rec); }
726
+
727
+ push @{$$self{header}{$key}}, $rec;
728
+ if ( $$rec{key} eq 'fileformat' )
729
+ {
730
+ unshift @{$$self{header_lines}}, $rec;
731
+ }
732
+ else
733
+ {
734
+ push @{$$self{header_lines}}, $rec;
735
+ }
736
+ }
737
+
738
+ sub _header_line_exists
739
+ {
740
+ my ($self,$key,$rec) = @_;
741
+ if ( !exists($$self{header}{$key}) ) { return 0; }
742
+ if ( $key eq 'fileformat' ) { return 1; }
743
+ for my $hrec (@{$$self{header}{$key}})
744
+ {
745
+ my $differ = 0;
746
+ for my $item (keys %$rec)
747
+ {
748
+ if ( !exists($$hrec{$item}) ) { $differ=1; last; }
749
+ if ( $$hrec{$item} ne $$rec{$item} ) { $differ=1; last; }
750
+ }
751
+ if ( !$differ ) { return $hrec; }
752
+ }
753
+ return 0;
754
+ }
755
+
756
+ =head2 remove_header_line
757
+
758
+ Usage : $vcf->remove_header_line(key=>'INFO', ID=>'AC')
759
+ Args :
760
+ Returns :
761
+
762
+ =cut
763
+
764
+ sub remove_header_line
765
+ {
766
+ my ($self,%args) = @_;
767
+ my $key = $args{key};
768
+ for (my $i=0; $i<@{$$self{header_lines}}; $i++)
769
+ {
770
+ my $line = $$self{header_lines}[$i];
771
+ if ( $$line{key} ne $key ) { next; }
772
+ if ( exists($args{ID}) )
773
+ {
774
+ if ( $args{ID} ne $$line{ID} ) { next; }
775
+ delete($$self{header}{$key}{$args{ID}});
776
+ splice(@{$$self{header_lines}},$i,1);
777
+ }
778
+ else
779
+ {
780
+ my $to_be_removed = $self->_header_line_exists($key,\%args);
781
+ if ( !$to_be_removed ) { next; }
782
+ for (my $j=0; $j<@{$$self{header}{$key}}; $j++)
783
+ {
784
+ if ( $$self{header}{$key}[$j] eq $to_be_removed ) { splice(@{$$self{header}{$key}},$j,1); last; }
785
+ }
786
+ splice(@{$$self{header_lines}},$i,1);
787
+ }
788
+ }
789
+ }
790
+
791
+
792
+ =head2 parse_header_line
793
+
794
+ Usage : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36])
795
+ $vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"])
796
+ Args :
797
+ Returns :
798
+
799
+ =cut
800
+
801
+ sub parse_header_line
802
+ {
803
+ my ($self,$line) = @_;
804
+
805
+ chomp($line);
806
+ $line =~ s/^##//;
807
+
808
+ if ( !($line=~/^([^=]+)=/) ) { return { key=>$line, value=>'' }; }
809
+ my $key = $1;
810
+ my $value = $';
811
+
812
+ my $desc;
813
+ if ( $value=~/,\s*\"([^\"]+)\"\s*$/ ) { $desc=$1; $value=$`; }
814
+
815
+ if ( !$desc ) { return { key=>$key, value=>$value }; }
816
+
817
+ if ( $key eq 'INFO' or $key eq 'FORMAT' )
818
+ {
819
+ my ($id,$number,$type,@rest) = split(/,\s*/,$value);
820
+ if ( !$type or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
821
+ return { key=>$key, ID=>$id, Number=>$number, Type=>$type, Description=>$desc };
822
+ }
823
+ if ( $key eq 'FILTER' )
824
+ {
825
+ my ($id,@rest) = split(/,\s*/,$value);
826
+ if ( !$id or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
827
+ return { key=>$key, ID=>$id, Description=>$desc };
828
+ }
829
+ $self->throw("Could not parse the header line: $line\n");
830
+ }
831
+
832
+ =head2 _read_column_names
833
+
834
+ About : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index.
835
+ The indexes go from 1.
836
+ Usage : $vcf->_read_column_names();
837
+ Args : none
838
+
839
+ =cut
840
+
841
+ sub _read_column_names
842
+ {
843
+ my ($self) = @_;
844
+ my $line = $self->next_line();
845
+ if ( !defined $line or substr($line,0,1) ne '#' ) { $self->throw("Broken VCF header, no column names?"); }
846
+ $$self{column_line} = $line;
847
+
848
+ chomp($line);
849
+ my @cols = split(/\t/, substr($line,1));
850
+ my $ncols = scalar @cols;
851
+ if ( $ncols == 1 )
852
+ {
853
+ # If there is only one name, it can be space-seprated instead of tab separated
854
+ @cols = split(/\s+/, $cols[0]);
855
+ $ncols = scalar @cols;
856
+ chomp($line);
857
+ if ( $ncols <= 1 ) { $self->warn("Could not parse the column names. [$line]\n"); return; }
858
+ $self->warn("The column names not tab-separated? [$line]\n");
859
+ }
860
+
861
+ my $fields = $$self{mandatory};
862
+ my $nfields = scalar @$fields;
863
+
864
+ # Check the names of the mandatory columns
865
+ if ( $ncols < $nfields )
866
+ {
867
+ chomp($line);
868
+ $self->warn("Missing some of the mandatory column names.\n\tGot: $line\n\tExpected: #", join("\t",@{$$self{mandatory}}),"\n");
869
+ return;
870
+ }
871
+
872
+ for (my $i=0; $i<$ncols; $i++)
873
+ {
874
+ if ( $cols[$i]=~/^\s+/ or $cols[$i]=~/\s+$/ )
875
+ {
876
+ $self->warn("The column name contains leading/trailing spaces, removing: '$cols[$i]'\n");
877
+ $cols[$i] =~ s/^\s+//;
878
+ $cols[$i] =~ s/\s+$//;
879
+ }
880
+ if ( $i<$nfields && $cols[$i] ne $$fields[$i] )
881
+ {
882
+ $self->warn("Expected mandatory column [$$fields[$i]], got [$cols[$i]]\n");
883
+ $cols[$i] = $$fields[$i];
884
+ }
885
+ $$self{has_column}{$cols[$i]} = $i+1;
886
+ }
887
+ $$self{columns} = \@cols;
888
+ return;
889
+ }
890
+
891
+
892
+ =head2 _fake_column_names
893
+
894
+ About : When no header is present, fake column names as the default mandatory ones + numbers
895
+ Args : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present
896
+
897
+ =cut
898
+
899
+ sub _fake_column_names
900
+ {
901
+ my ($self,$ncols) = @_;
902
+
903
+ $$self{columns} = [ @{$$self{mandatory}} ];
904
+ if ( $ncols>=0 ) { push @{$$self{columns}}, 'FORMAT'; }
905
+ for (my $i=1; $i<=$ncols; $i++) { push @{$$self{columns}}, $i; }
906
+ }
907
+
908
+
909
+ =head2 format_header
910
+
911
+ About : Returns the header.
912
+ Usage : print $vcf->format_header();
913
+ Args : The columns to include on output [optional]
914
+
915
+ =cut
916
+
917
+ sub format_header
918
+ {
919
+ my ($self,$columns) = @_;
920
+
921
+ my $out = '';
922
+ for my $line (@{$$self{header_lines}}) { $out .= $self->format_header_line($line); }
923
+
924
+ # This is required when using the API for writing new VCF files and the caller does not add the line explicitly
925
+ if ( !exists($$self{header_lines}[0]{key}) or $$self{header_lines}[0]{key} ne 'fileformat' )
926
+ {
927
+ $out = "##fileformat=VCFv$$self{version}\n" .$out;
928
+ }
929
+ if ( !$$self{columns} ) { return $out; }
930
+
931
+ my @out_cols;
932
+ if ( $columns )
933
+ {
934
+ @out_cols = @{$$self{columns}}[0..8];
935
+ for my $col (@$columns)
936
+ {
937
+ if ( exists($$self{has_column}{$col}) ) { push @out_cols, $col; }
938
+ }
939
+ }
940
+ else
941
+ {
942
+ @out_cols = @{$$self{columns}};
943
+ }
944
+ $out .= "#". join("\t", @out_cols). "\n";
945
+
946
+ return $out;
947
+ }
948
+
949
+
950
+ =head2 format_line
951
+
952
+ About : Returns the header.
953
+ Usage : $x = $vcf->next_data_hash(); print $vcf->format_line($x);
954
+ $x = $vcf->next_data_array(); print $vcf->format_line($x);
955
+ Args 1 : The columns or hash in the format returned by next_data_hash or next_data_array.
956
+ 2 : The columns to include [optional]
957
+
958
+ =cut
959
+
960
+ sub format_line
961
+ {
962
+ my ($self,$record,$columns) = @_;
963
+
964
+ if ( ref($record) eq 'HASH' ) { return $self->_format_line_hash($record,$columns); }
965
+ $self->throw("FIXME: todo\n");
966
+ }
967
+
968
+
969
+ =head2 recalc_ac_an
970
+
971
+ About : Control if the AC and AN values should be updated.
972
+ Usage : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x);
973
+ Args 1 : 0 .. never recalculate
974
+ 1 .. recalculate if present
975
+ 2 .. recalculate if present and add if missing
976
+
977
+ =cut
978
+
979
+ sub recalc_ac_an
980
+ {
981
+ my ($self,$value) = @_;
982
+ if ( $value eq '0' || $value eq '1' || $value eq '2' ) { $$self{recalc_ac_an} = $value; }
983
+ return;
984
+ }
985
+
986
+
987
+
988
+ sub _format_line_hash
989
+ {
990
+ my ($self,$record,$columns) = @_;
991
+
992
+ if ( !$$self{columns} )
993
+ {
994
+ my $ngtypes = scalar keys %{$$record{gtypes}};
995
+ if ( !$ngtypes && !exists($$record{FORMAT}) ) { $ngtypes--; }
996
+ $self->_fake_column_names($ngtypes);
997
+ }
998
+ my $cols = $$self{columns};
999
+
1000
+ # CHROM POS ID REF
1001
+ my $out;
1002
+ $out .= $$record{CHROM} . "\t";
1003
+ $out .= $$record{POS} . "\t";
1004
+ $out .= (defined $$record{ID} ? $$record{ID} : '.') . "\t";
1005
+ $out .= $$record{REF} . "\t";
1006
+
1007
+ # ALT
1008
+ $out .= join(',',@{$$record{ALT}} ? @{$$record{ALT}} : '.');
1009
+
1010
+ # QUAL
1011
+ $out .= "\t". $$record{QUAL};
1012
+
1013
+ # FILTER
1014
+ $out .= "\t". join(';',$$record{FILTER} ? @{$$record{FILTER}} : '.');
1015
+
1016
+ # Collect the gtypes of interest
1017
+ my $gtypes;
1018
+ if ( $columns )
1019
+ {
1020
+ # Select only those gtypes keys with a corresponding key in columns.
1021
+ for my $col (@$columns) { $$gtypes{$col} = $$record{gtypes}{$col}; }
1022
+ }
1023
+ else
1024
+ {
1025
+ $gtypes = $$record{gtypes};
1026
+ }
1027
+
1028
+ # INFO
1029
+ # .. calculate NS, AN and AC, but only if recalc_ac_an is set
1030
+ my $needs_an_ac = $$self{recalc_ac_an}==2 ? 1 : 0;
1031
+ my @info;
1032
+ while (my ($key,$value) = each %{$$record{INFO}})
1033
+ {
1034
+ if ( $$self{recalc_ac_an}>0 )
1035
+ {
1036
+ if ( $key eq 'AN' ) { $needs_an_ac=1; next; }
1037
+ if ( $key eq 'AC' ) { $needs_an_ac=1; next; }
1038
+ }
1039
+ if ( defined $value )
1040
+ {
1041
+ push @info, "$key=$value";
1042
+ }
1043
+ elsif ( $key ne '.' )
1044
+ {
1045
+ push @info, $key;
1046
+ }
1047
+ }
1048
+ if ( $needs_an_ac )
1049
+ {
1050
+ my $nalt = scalar @{$$record{ALT}};
1051
+ if ( $nalt==1 && $$record{ALT}[0] eq '.' ) { $nalt=0; }
1052
+ my ($an,$ac) = $self->calc_an_ac($gtypes,$nalt);
1053
+ push @info, "AN=$an","AC=$ac";
1054
+ }
1055
+ if ( !@info ) { push @info, '.'; }
1056
+ $out .= "\t". join(';', sort @info);
1057
+
1058
+ # FORMAT, the column is not required, it may not be present when there are no genotypes
1059
+ if ( exists($$cols[8]) && defined $$record{FORMAT} )
1060
+ {
1061
+ $out .= "\t". join(':',@{$$record{FORMAT}});
1062
+ }
1063
+
1064
+ # Genotypes: output all columns or only a selection?
1065
+ my @col_names = $columns ? @$columns : @$cols[9..@$cols-1];
1066
+ my $nformat = defined $$record{FORMAT} ? @{$$record{FORMAT}} : 0;
1067
+ for my $col (@col_names)
1068
+ {
1069
+ my $gt = $$gtypes{$col};
1070
+ my $can_drop = $$self{drop_trailings};
1071
+ my @gtype;
1072
+ for (my $i=$nformat-1; $i>=0; $i--)
1073
+ {
1074
+ my $field = $$record{FORMAT}[$i];
1075
+ if ( $i==0 ) { $can_drop=0; }
1076
+
1077
+ if ( exists($$gt{$field}) )
1078
+ {
1079
+ $can_drop = 0;
1080
+ if ( ref($$gt{$field}) eq 'HASH' )
1081
+ {
1082
+ # Special treatment for Number=[AG] tags
1083
+ unshift @gtype, $self->format_AGtag($record,$$gt{$field},$field);
1084
+ }
1085
+ else
1086
+ {
1087
+ unshift @gtype,$$gt{$field};
1088
+ }
1089
+ }
1090
+ elsif ( $can_drop ) { next; }
1091
+ elsif ( exists($$self{header}{FORMAT}{$field}{default}) ) { unshift @gtype,$$self{header}{FORMAT}{$field}{default}; $can_drop=0; }
1092
+ else { $self->throw(qq[No value for the field "$field" and no default available, column "$col" at $$record{CHROM}:$$record{POS}.\n]); }
1093
+ }
1094
+ $out .= "\t" . join(':',@gtype);
1095
+ }
1096
+
1097
+ $out .= "\n";
1098
+ return $out;
1099
+ }
1100
+
1101
+ sub calc_an_ac
1102
+ {
1103
+ my ($self,$gtypes,$nalleles) = @_;
1104
+ my $sep_re = $$self{regex_gtsep};
1105
+ my ($an,%ac_counts);
1106
+ if ( defined $nalleles )
1107
+ {
1108
+ for (my $i=1; $i<=$nalleles; $i++) { $ac_counts{$i}=0; }
1109
+ }
1110
+ $an = 0;
1111
+ for my $gt (keys %$gtypes)
1112
+ {
1113
+ my $value = $$gtypes{$gt}{GT};
1114
+ if ( !defined $value ) { next; } # GT may not be present
1115
+ my ($al1,$al2) = split($sep_re,$value);
1116
+ if ( defined($al1) && $al1 ne '.' )
1117
+ {
1118
+ $an++;
1119
+ if ( $al1 ne '0' ) { $ac_counts{$al1}++; }
1120
+ }
1121
+ if ( defined($al2) && $al2 ne '.' )
1122
+ {
1123
+ $an++;
1124
+ if ( $al2 ne '0' ) { $ac_counts{$al2}++; }
1125
+ }
1126
+ }
1127
+ my @ac;
1128
+ for my $ac ( sort { $a <=> $b } keys %ac_counts) { push @ac, $ac_counts{$ac}; }
1129
+ if ( !@ac ) { @ac = ('0'); }
1130
+ return ($an,join(',',@ac),\@ac);
1131
+ }
1132
+
1133
+ sub _validate_alt_field
1134
+ {
1135
+ my ($self,$values,$ref) = @_;
1136
+
1137
+ for (my $i=0; $i<@$values; $i++)
1138
+ {
1139
+ for (my $j=0; $j<$i; $j++)
1140
+ {
1141
+ if ( $$values[$i] eq $$values[$j] ) { return "The alleles not unique: $$values[$i]"; }
1142
+ }
1143
+ if ( $$values[$i] eq $ref ) { return "REF allele listed in the ALT field??"; }
1144
+ }
1145
+ return undef;
1146
+ }
1147
+
1148
+ =head2 validate_alt_field
1149
+
1150
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
1151
+ Args : The ALT arrayref
1152
+ Returns : Error message in case of an error.
1153
+
1154
+ =cut
1155
+
1156
+ sub validate_alt_field
1157
+ {
1158
+ my ($self,$values,$ref) = @_;
1159
+
1160
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
1161
+
1162
+ my $ret = $self->_validate_alt_field($values,$ref);
1163
+ if ( $ret ) { return $ret; }
1164
+
1165
+ my @err;
1166
+ for my $item (@$values)
1167
+ {
1168
+ if ( $item=~/^[ACTGN]$/ ) { next; }
1169
+ elsif ( $item=~/^I[ACTGN]+$/ ) { next; }
1170
+ elsif ( $item=~/^D\d+$/ ) { next; }
1171
+
1172
+ push @err, $item;
1173
+ }
1174
+ if ( !@err ) { return undef; }
1175
+ return 'Could not parse the allele(s) [' .join(',',@err). ']';
1176
+ }
1177
+
1178
+ =head2 event_type
1179
+
1180
+ Usage : my $x = $vcf->next_data_hash();
1181
+ my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
1182
+ for my $allele (@$alleles)
1183
+ {
1184
+ my ($type,$len,$ht) = $vcf->event_type($x,$allele);
1185
+ }
1186
+ or
1187
+ my ($type,$len,$ht) = $vcf->event_type($ref,$al);
1188
+ Args : VCF data line parsed by next_data_hash or the reference allele
1189
+ : Allele
1190
+ Returns : 's' for SNP and number of SNPs in the record
1191
+ 'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
1192
+ 'r' identical to the reference, length 0
1193
+ 'o' for other (complex events) and the number of affected bases
1194
+ 'b' breakend
1195
+ 'u' unknown
1196
+
1197
+ =cut
1198
+
1199
+ sub event_type
1200
+ {
1201
+ my ($self,$rec,$allele) = @_;
1202
+
1203
+ my $ref = $rec;
1204
+ if ( ref($rec) eq 'HASH' )
1205
+ {
1206
+ if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
1207
+ $ref = $$rec{REF};
1208
+ }
1209
+
1210
+ my ($type,$len,$ht);
1211
+ if ( $allele eq $ref or $allele eq '.' ) { $len=0; $type='r'; $ht=$ref; }
1212
+ elsif ( $allele=~/^[ACGT]$/ ) { $len=1; $type='s'; $ht=$allele; }
1213
+ elsif ( $allele=~/^I/ ) { $len=length($allele)-1; $type='i'; $ht=$'; }
1214
+ elsif ( $allele=~/^D(\d+)/ ) { $len=-$1; $type='i'; $ht=''; }
1215
+ else
1216
+ {
1217
+ my $chr = ref($rec) eq 'HASH' ? $$rec{CHROM} : 'undef';
1218
+ my $pos = ref($rec) eq 'HASH' ? $$rec{POS} : 'undef';
1219
+ $self->throw("Eh?: $chr:$pos .. $ref $allele\n");
1220
+ }
1221
+
1222
+ if ( ref($rec) eq 'HASH' )
1223
+ {
1224
+ $$rec{_cached_events}{$allele} = [$type,$len,$ht];
1225
+ }
1226
+ return ($type,$len,$ht);
1227
+ }
1228
+
1229
+
1230
+ =head2 parse_AGtags
1231
+
1232
+ About : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
1233
+ Usage : my $x = $vcf->next_data_hash(); my $values = $vcf->parse_AGtags($x);
1234
+ Args : VCF data line parsed by next_data_hash
1235
+ : Mapping between ALT representations based on different REFs [optional]
1236
+ : New REF [optional]
1237
+ Returns : Hash {Allele=>Value}
1238
+
1239
+ =cut
1240
+
1241
+ sub parse_AGtags
1242
+ {
1243
+ my ($self,$rec,$ref_alt_map,$new_ref) = @_;
1244
+
1245
+ if ( !exists($$rec{gtypes}) ) { return; }
1246
+
1247
+ my (@atags,@gtags);
1248
+ for my $fmt (@{$$rec{FORMAT}})
1249
+ {
1250
+ # These have been listed explicitly for proper merging of v4.0 VCFs
1251
+ if ( $$self{fix_v40_AGtags} )
1252
+ {
1253
+ if ( $fmt eq 'GL' or $fmt eq 'PL' ) { push @gtags,$fmt; next; }
1254
+ if ( $fmt eq 'AC' or $fmt eq 'AF' ) { push @atags,$fmt; next; }
1255
+ }
1256
+ if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
1257
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { push @atags,$fmt; next; }
1258
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { push @gtags,$fmt; next; }
1259
+ }
1260
+ my $missing = $$self{defaults}{default};
1261
+ if ( @atags )
1262
+ {
1263
+ # Parse Number=A tags
1264
+ my $alts;
1265
+ if ( defined $ref_alt_map )
1266
+ {
1267
+ $alts = [];
1268
+ for my $alt (@{$$rec{ALT}})
1269
+ {
1270
+ if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: $new_ref $alt...?\n"); }
1271
+ push @$alts, $$ref_alt_map{$new_ref}{$alt};
1272
+ }
1273
+ }
1274
+ else
1275
+ {
1276
+ $alts = $$rec{ALT};
1277
+ }
1278
+ for my $tag (@atags)
1279
+ {
1280
+ for my $sample (values %{$$rec{gtypes}})
1281
+ {
1282
+ if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
1283
+ my @values = split(/,/,$$sample{$tag});
1284
+ $$sample{$tag} = {};
1285
+ for (my $i=0; $i<@values; $i++)
1286
+ {
1287
+ $$sample{$tag}{$$alts[$i]} = $values[$i];
1288
+ }
1289
+ }
1290
+ }
1291
+ }
1292
+ if ( @gtags )
1293
+ {
1294
+ # Parse Number=G tags
1295
+ my @alleles;
1296
+ if ( defined $ref_alt_map )
1297
+ {
1298
+ push @alleles, $new_ref;
1299
+ for my $alt (@{$$rec{ALT}})
1300
+ {
1301
+ if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: [$new_ref] [$alt]...?\n", Dumper($ref_alt_map,$rec)); }
1302
+ push @alleles, $$ref_alt_map{$new_ref}{$alt};
1303
+ }
1304
+ }
1305
+ else
1306
+ {
1307
+ @alleles = ($$rec{REF},@{$$rec{ALT}});
1308
+ }
1309
+ my @gtypes;
1310
+ for (my $i=0; $i<@alleles; $i++)
1311
+ {
1312
+ for (my $j=0; $j<=$i; $j++)
1313
+ {
1314
+ push @gtypes, $alleles[$i].'/'.$alleles[$j];
1315
+ }
1316
+ }
1317
+ for my $tag (@gtags)
1318
+ {
1319
+ for my $sample (values %{$$rec{gtypes}})
1320
+ {
1321
+ if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
1322
+ my @values = split(/,/,$$sample{$tag});
1323
+ $$sample{$tag} = {};
1324
+ for (my $i=0; $i<@values; $i++)
1325
+ {
1326
+ $$sample{$tag}{$gtypes[$i]} = $values[$i];
1327
+ }
1328
+ }
1329
+ }
1330
+ }
1331
+ }
1332
+
1333
+ =head2 format_AGtag
1334
+
1335
+ About : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL)
1336
+ Usage :
1337
+ Args :
1338
+ :
1339
+ :
1340
+ Returns :
1341
+
1342
+ =cut
1343
+
1344
+ sub format_AGtag
1345
+ {
1346
+ my ($self,$record,$tag_data,$tag) = @_;
1347
+
1348
+ # The FORMAT field is checked only once and the results are cached.
1349
+ if ( !exists($$record{_atags}) )
1350
+ {
1351
+ $$record{_atags} = {};
1352
+
1353
+ # Check if there are any A,G tags
1354
+ for my $fmt (@{$$record{FORMAT}})
1355
+ {
1356
+ # These have been listed explicitly for proper merging of v4.0 VCFs
1357
+ if ( $$self{fix_v40_AGtags} )
1358
+ {
1359
+ if ( $fmt eq 'GL' or $fmt eq 'PL' ) { $$record{_gtags}{$fmt}=1; next; }
1360
+ if ( $fmt eq 'AC' or $fmt eq 'AF' ) { $$record{_atags}{$fmt}=1; next; }
1361
+ }
1362
+ if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
1363
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { $$record{_atags}{$fmt}=1; next; }
1364
+ if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { $$record{_gtags}{$fmt}=1; next; }
1365
+ }
1366
+ }
1367
+
1368
+ my @out;
1369
+ if ( exists($$record{_atags}{$tag}) )
1370
+ {
1371
+ for my $alt (@{$$record{ALT}})
1372
+ {
1373
+ push @out, exists($$tag_data{$alt}) ? $$tag_data{$alt} : $$self{defaults}{default};
1374
+ }
1375
+ }
1376
+
1377
+ if ( exists($$record{_gtags}{$tag}) )
1378
+ {
1379
+ my $gtypes = $$record{_gtypes};
1380
+ my $gtypes2 = $$record{_gtypes2};
1381
+ if ( !defined $gtypes )
1382
+ {
1383
+ $gtypes = [];
1384
+ $gtypes2 = [];
1385
+
1386
+ my @alleles = ( $$record{REF}, @{$$record{ALT}} );
1387
+ for (my $i=0; $i<@alleles; $i++)
1388
+ {
1389
+ for (my $j=0; $j<=$i; $j++)
1390
+ {
1391
+ push @$gtypes, $alleles[$i].'/'.$alleles[$j];
1392
+ push @$gtypes2, $alleles[$j].'/'.$alleles[$i];
1393
+ }
1394
+ }
1395
+
1396
+ $$record{_gtypes} = $gtypes;
1397
+ $$record{_gtypes2} = $gtypes2;
1398
+ }
1399
+
1400
+ for (my $i=0; $i<@$gtypes; $i++)
1401
+ {
1402
+ my $gt = $$gtypes[$i];
1403
+ if ( !exists($$tag_data{$gt}) ) { $gt = $$gtypes2[$i]; }
1404
+ push @out, exists($$tag_data{$gt}) ? $$tag_data{$gt} : $$self{defaults}{default};
1405
+ }
1406
+ }
1407
+
1408
+ return join(',',@out);
1409
+ }
1410
+
1411
+ =head2 parse_alleles
1412
+
1413
+ About : Deprecated, use parse_haplotype instead.
1414
+ Usage : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001');
1415
+ Args : VCF data line parsed by next_data_hash
1416
+ : The genotype column name
1417
+ Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string.
1418
+
1419
+ =cut
1420
+
1421
+ sub parse_alleles
1422
+ {
1423
+ my ($self,$rec,$column) = @_;
1424
+ if ( !exists($$rec{gtypes}) || !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
1425
+
1426
+ my $gtype = $$rec{gtypes}{$column}{GT};
1427
+ if ( !($gtype=~$$self{regex_gt}) ) { $self->throw("Could not parse gtype string [$gtype] [$$rec{CHROM}:$$rec{POS}]\n"); }
1428
+ my $al1 = $1;
1429
+ my $sep = $2;
1430
+ my $al2 = $3;
1431
+
1432
+ if ( !$al1 ) { $al1 = $$rec{REF}; }
1433
+ elsif ( $al1 ne '.' )
1434
+ {
1435
+ if ( !($al1=~/^\d+$/) ) { $self->throw("Uh, what is this? [$al1] $$rec{CHROM}:$$rec{POS}\n"); }
1436
+ $al1 = $$rec{ALT}[$al1-1];
1437
+ }
1438
+
1439
+ if ( !defined $al2 or $al2 eq '' )
1440
+ {
1441
+ $sep = '';
1442
+ $al2 = '';
1443
+ }
1444
+ else
1445
+ {
1446
+ if ( !$al2 ) { $al2 = $$rec{REF}; }
1447
+ elsif ( $al2 ne '.' ) { $al2 = $$rec{ALT}[$al2-1]; }
1448
+ }
1449
+ return ($al1,$sep,$al2);
1450
+ }
1451
+
1452
+ =head2 parse_haplotype
1453
+
1454
+ About : Similar to parse_alleles, supports also multiploid VCFs.
1455
+ Usage : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
1456
+ Args : VCF data line parsed by next_data_hash
1457
+ : The genotype column name
1458
+ Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags. The values
1459
+ can be cashed and must be therefore considered read only!
1460
+
1461
+ =cut
1462
+
1463
+ sub parse_haplotype
1464
+ {
1465
+ my ($self,$rec,$column) = @_;
1466
+ if ( !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
1467
+ if ( !exists($$rec{gtypes}{$column}{GT}) ) { return (['.'],[],0,1); }
1468
+
1469
+ my $gtype = $$rec{gtypes}{$column}{GT};
1470
+ if ( exists($$rec{_cached_haplotypes}{$gtype}) ) { return (@{$$rec{_cached_haplotypes}{$gtype}}); }
1471
+
1472
+ my @alleles = ();
1473
+ my @seps = ();
1474
+ my $is_phased = 0;
1475
+ my $is_empty = 1;
1476
+
1477
+ my $buf = $gtype;
1478
+ while ($buf ne '')
1479
+ {
1480
+ if ( !($buf=~m{^(\.|\d+)([|/]?)}) ) { $self->throw("Could not parse gtype string [$gtype] .. $$rec{CHROM}:$$rec{POS} $column\n"); }
1481
+ $buf = $';
1482
+
1483
+ if ( $1 eq '.' ) { push @alleles,'.'; }
1484
+ else
1485
+ {
1486
+ $is_empty = 0;
1487
+ if ( $1 eq '0' ) { push @alleles,$$rec{REF}; }
1488
+ elsif ( exists($$rec{ALT}[$1-1]) ) { push @alleles,$$rec{ALT}[$1-1]; }
1489
+ else { $self->throw(qq[The haplotype indexes in "$gtype" do not match the ALT column .. $$rec{CHROM}:$$rec{POS} $column\n]); }
1490
+ }
1491
+ if ( $2 )
1492
+ {
1493
+ if ( $2 eq '|' ) { $is_phased=1; }
1494
+ push @seps,$2;
1495
+ }
1496
+ }
1497
+ $$rec{_cached_haplotypes}{$gtype} = [\@alleles,\@seps,$is_phased,$is_empty];
1498
+ return (@{$$rec{_cached_haplotypes}{$gtype}});
1499
+ }
1500
+
1501
+ =head2 format_haplotype
1502
+
1503
+ Usage : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps);
1504
+
1505
+ =cut
1506
+
1507
+ sub format_haplotype
1508
+ {
1509
+ my ($self,$alleles,$seps) = @_;
1510
+ if ( @$alleles != @$seps+1 ) { $self->throw(sprintf("Uh: %d vs %d\n",scalar @$alleles,scalar @$seps),Dumper($alleles,$seps)); }
1511
+ my $out = $$alleles[0];
1512
+ for (my $i=1; $i<@$alleles; $i++)
1513
+ {
1514
+ $out .= $$seps[$i-1];
1515
+ $out .= $$alleles[$i];
1516
+ }
1517
+ return $out;
1518
+ }
1519
+
1520
+
1521
+ =head2 format_genotype_strings
1522
+
1523
+ Usage : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 };
1524
+ $vcf->format_genotype_strings($x);
1525
+ print $vcf->format_line($x);
1526
+ Args 1 : VCF data line in the format as if parsed by next_data_hash with alleles written as letters.
1527
+ 2 : Optionally, a subset of columns can be supplied. See also format_line.
1528
+ Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref alleles
1529
+ numbers starting from 1.
1530
+
1531
+ =cut
1532
+
1533
+ sub format_genotype_strings
1534
+ {
1535
+ my ($self,$rec,$columns) = @_;
1536
+
1537
+ if ( !exists($$rec{gtypes}) ) { return; }
1538
+
1539
+ my $ref = $$rec{REF};
1540
+ my $nalts = 0;
1541
+ my %alts = ();
1542
+ my $gt_re = $$self{regex_gt2};
1543
+
1544
+ if ( !$columns ) { $columns = [keys %{$$rec{gtypes}}]; }
1545
+
1546
+ for my $key (@$columns)
1547
+ {
1548
+ my $gtype = $$rec{gtypes}{$key}{GT};
1549
+ my $buf = $gtype;
1550
+ my $out = '';
1551
+ while ($buf ne '')
1552
+ {
1553
+ if ( !($buf=~$gt_re) ) { $self->throw("Could not parse gtype string [$gtype]\n"); }
1554
+ $buf = $';
1555
+
1556
+ my $al = $1;
1557
+ my $sep = $2;
1558
+ if ( $al eq $ref or $al eq '0' or $al eq '*' ) { $al=0; }
1559
+ else
1560
+ {
1561
+ if ( $al=~/^\d+$/ )
1562
+ {
1563
+ if ( !exists($$rec{ALT}[$al-1]) ) { $self->throw("Broken ALT, index $al out of bounds\n"); }
1564
+ $al = $$rec{ALT}[$al-1];
1565
+ }
1566
+
1567
+ if ( exists($alts{$al}) ) { $al = $alts{$al} }
1568
+ elsif ( $al=~$$self{regex_snp} or $al=~$$self{regex_ins} or $al=~$$self{regex_del} )
1569
+ {
1570
+ $alts{$al} = ++$nalts;
1571
+ $al = $nalts;
1572
+ }
1573
+ elsif ( $al ne '.' )
1574
+ {
1575
+ $self->throw("Could not parse the genotype string [$gtype]\n");
1576
+ }
1577
+
1578
+ }
1579
+ $out .= $al;
1580
+ if ( $sep ) { $out .= $sep; }
1581
+ }
1582
+ $$rec{gtypes}{$key}{GT} = $out;
1583
+ }
1584
+ $$rec{ALT} = [ sort { $alts{$a}<=>$alts{$b} } keys %alts ];
1585
+ }
1586
+
1587
+ sub fill_ref_alt_mapping
1588
+ {
1589
+ my ($self,$map) = @_;
1590
+
1591
+ my $new_ref;
1592
+ for my $ref (keys %$map)
1593
+ {
1594
+ $new_ref = $ref;
1595
+ if ( $ref ne $new_ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
1596
+ for my $alt (keys %{$$map{$ref}})
1597
+ {
1598
+ $$map{$ref}{$alt} = $alt;
1599
+ }
1600
+ }
1601
+ $$map{$new_ref}{$new_ref} = $new_ref;
1602
+ return $new_ref;
1603
+ }
1604
+
1605
+
1606
+ =head2 format_header_line
1607
+
1608
+ Usage : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
1609
+ Args :
1610
+ Returns :
1611
+
1612
+ =cut
1613
+
1614
+ sub format_header_line
1615
+ {
1616
+ my ($self,$rec) = @_;
1617
+ my $line = "##$$rec{key}";
1618
+ $line .= "=$$rec{value}" unless !exists($$rec{value});
1619
+ $line .= "=$$rec{ID}" unless !exists($$rec{ID});
1620
+ $line .= ",$$rec{Number}" unless !exists($$rec{Number});
1621
+ $line .= ",$$rec{Type}" unless !exists($$rec{Type});
1622
+ $line .= qq[,"$$rec{Description}"] unless !exists($$rec{Description});
1623
+ $line .= "\n";
1624
+ return $line;
1625
+ }
1626
+
1627
+ =head2 add_columns
1628
+
1629
+ Usage : $vcf->add_columns('NA001','NA0002');
1630
+ Args :
1631
+ Returns :
1632
+
1633
+ =cut
1634
+
1635
+ sub add_columns
1636
+ {
1637
+ my ($self,@columns) = @_;
1638
+ if ( !$$self{columns} )
1639
+ {
1640
+ # The columns should be initialized de novo. Figure out if the @columns contain also the mandatory
1641
+ # columns and if FORMAT should be present (it can be absent when there is no genotype column present).
1642
+ my $has_other = 0;
1643
+ for my $col (@columns)
1644
+ {
1645
+ if ( !exists($$self{reserved}{cols}{$col}) ) { $has_other=1; last; }
1646
+ }
1647
+
1648
+ $$self{columns} = [ @{$$self{mandatory}} ];
1649
+ if ( $has_other ) { push @{$$self{columns}},'FORMAT'; }
1650
+
1651
+ for my $col (@{$$self{columns}}) { $$self{has_column}{$col}=1; }
1652
+ }
1653
+ my $ncols = @{$$self{columns}};
1654
+ for my $col (@columns)
1655
+ {
1656
+ if ( $$self{has_column}{$col} ) { next; }
1657
+ $ncols++;
1658
+ push @{$$self{columns}}, $col;
1659
+ }
1660
+ }
1661
+
1662
+ =head2 add_format_field
1663
+
1664
+ Usage : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x);
1665
+ Args : The record obtained by next_data_hash
1666
+ : The field name
1667
+ Returns :
1668
+
1669
+ =cut
1670
+
1671
+ sub add_format_field
1672
+ {
1673
+ my ($self,$rec,$field) = @_;
1674
+
1675
+ if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
1676
+
1677
+ for my $key (@{$$rec{FORMAT}})
1678
+ {
1679
+ if ( $key eq $field ) { return; } # already there
1680
+ }
1681
+ push @{$$rec{FORMAT}}, $field;
1682
+ }
1683
+
1684
+
1685
+ =head2 remove_format_field
1686
+
1687
+ Usage : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x);
1688
+ Args : The record obtained by next_data_hash
1689
+ : The field name
1690
+ Returns :
1691
+
1692
+ =cut
1693
+
1694
+ sub remove_format_field
1695
+ {
1696
+ my ($self,$rec,$field) = @_;
1697
+
1698
+ if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
1699
+
1700
+ my $i = 0;
1701
+ for my $key (@{$$rec{FORMAT}})
1702
+ {
1703
+ if ( $key eq $field ) { splice @{$$rec{FORMAT}},$i,1; }
1704
+ $i++;
1705
+ }
1706
+ }
1707
+
1708
+
1709
+ =head2 add_info_field
1710
+
1711
+ Usage : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n";
1712
+ Args : The record obtained by next_data_array
1713
+ : The INFO field name and value pairs. If value is undef and the key is present in $$x[7],
1714
+ it will be removed. To add fields without a value, use empty string ''.
1715
+ Returns : The formatted INFO.
1716
+
1717
+ =cut
1718
+
1719
+ sub add_info_field
1720
+ {
1721
+ my ($self,$info,%fields) = @_;
1722
+
1723
+ my @out = ();
1724
+
1725
+ # First handle the existing values, keep everything unless in %fields
1726
+ for my $field (split(/;/,$info))
1727
+ {
1728
+ my ($key,$value) = split(/=/,$field);
1729
+ if ( $key eq '.' ) { next; }
1730
+ if ( !exists($fields{$key}) ) { push @out,$field; next; }
1731
+ }
1732
+
1733
+ # Now add the new values and remove the unwanted ones
1734
+ while (my ($key,$value)=each %fields)
1735
+ {
1736
+ if ( !defined($value) ) { next; } # this one should be removed
1737
+ if ( $value eq '' ) { push @out,$key; } # this one is of the form HM2 in contrast to DP=3
1738
+ else { push @out,"$key=$value"; } # this is the standard key=value pair
1739
+ }
1740
+ if ( !@out ) { push @out,'.'; }
1741
+ return join(';',@out);
1742
+ }
1743
+
1744
+
1745
+ =head2 add_filter
1746
+
1747
+ Usage : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n";
1748
+ Args : The record obtained by next_data_array or next_data_hash
1749
+ : The key-value pairs for filter to be added. If value is 1, the filter will be added. If 0, the filter will be removed.
1750
+ Returns : The formatted filter field.
1751
+
1752
+ =cut
1753
+
1754
+ sub add_filter
1755
+ {
1756
+ my ($self,$filter,%filters) = @_;
1757
+
1758
+ my @out = ();
1759
+ my @filters = ref($filter) eq 'ARRAY' ? @$filter : split(/;/,$filter);
1760
+
1761
+ # First handle the existing filters, keep everything unless in %filters
1762
+ for my $key (@filters)
1763
+ {
1764
+ if ( $key eq '.' or $key eq 'PASS' ) { next; }
1765
+ if ( !exists($filters{$key}) ) { push @out,$key; next; }
1766
+ }
1767
+
1768
+ # Now add the new filters and remove the unwanted ones
1769
+ while (my ($key,$value)=each %filters)
1770
+ {
1771
+ if ( !$value ) { next; } # this one should be removed
1772
+ push @out,$key; # this one should be added
1773
+ }
1774
+ if ( !@out ) { push @out,'PASS'; }
1775
+ return ref($filter) eq 'ARRAY' ? return \@out : join(';',@out);
1776
+ }
1777
+
1778
+
1779
+ =head2 validate_filter_field
1780
+
1781
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER});
1782
+ Args : The FILTER arrayref
1783
+ Returns : Error message in case of an error.
1784
+
1785
+ =cut
1786
+
1787
+ sub validate_filter_field
1788
+ {
1789
+ my ($self,$values) = @_;
1790
+
1791
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
1792
+
1793
+ my @errs;
1794
+ my @missing;
1795
+ for my $item (@$values)
1796
+ {
1797
+ if ( $item eq $$self{filter_passed} ) { next; }
1798
+ if ( $item=~/,/ ) { push @errs,"Expected semicolon as a separator."; }
1799
+ if ( exists($$self{reserved}{FILTER}{$item}) ) { return qq[The filter name "$item" cannot be used, it is a reserved word.]; }
1800
+ if ( exists($$self{header}{FILTER}{$item}) ) { next; }
1801
+ push @missing, $item;
1802
+ $self->add_header_line({key=>'FILTER',ID=>$item,Description=>'No description'});
1803
+ }
1804
+ if ( !@errs && !@missing ) { return undef; }
1805
+ if ( $$self{version}<3.3 ) { return undef; }
1806
+ return join(',',@errs) .' '. 'The filter(s) [' . join(',',@missing) . '] not listed in the header.';
1807
+ }
1808
+
1809
+
1810
+ sub _add_unknown_field
1811
+ {
1812
+ my ($self,$field,$key,$nargs) = @_;
1813
+ $self->add_header_line({key=>$field,ID=>$key,Number=>$nargs,Type=>'String',Description=>'No description'});
1814
+ }
1815
+
1816
+ =head2 validate_header
1817
+
1818
+ About : Version specific header validation code.
1819
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $vcf->validate_header();
1820
+ Args :
1821
+
1822
+ =cut
1823
+
1824
+ sub validate_header
1825
+ {
1826
+ my ($self) = @_;
1827
+ }
1828
+
1829
+ =head2 validate_line
1830
+
1831
+ About : Version specific line validation code.
1832
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x);
1833
+ Args :
1834
+
1835
+ =cut
1836
+
1837
+ sub validate_line
1838
+ {
1839
+ my ($self,$x) = @_;
1840
+
1841
+ # Is the ID composed of alphanumeric chars
1842
+ if ( !($$x{ID}=~/^[\w;\.]+$/) ) { $self->warn("Expected alphanumeric ID at $$x{CHROM}:$$x{POS}, but got [$$x{ID}]\n"); }
1843
+ }
1844
+
1845
+ =head2 validate_info_field
1846
+
1847
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT});
1848
+ Args : The INFO hashref
1849
+ Returns : Error message in case of an error.
1850
+
1851
+ =cut
1852
+
1853
+ sub validate_info_field
1854
+ {
1855
+ my ($self,$values,$alts) = @_;
1856
+
1857
+ if ( !defined $values ) { return 'Empty INFO field.'; }
1858
+
1859
+ # First handle the empty INFO field (.)
1860
+ if ( scalar keys %$values == 1 && exists($$values{'.'}) ) { return undef; }
1861
+
1862
+ # Expected numbers
1863
+ my $ng = -1;
1864
+ my $na = -1;
1865
+ if ( $$self{version}>4.0 )
1866
+ {
1867
+ if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
1868
+ else
1869
+ {
1870
+ $na = @$alts;
1871
+ $ng = (1+$na+1)*($na+1)/2;
1872
+ }
1873
+ }
1874
+
1875
+ my @errs;
1876
+ while (my ($key,$value) = each %$values)
1877
+ {
1878
+ if ( !exists($$self{header}{INFO}{$key}) )
1879
+ {
1880
+ push @errs, "INFO tag [$key] not listed in the header" unless $$self{version}<3.3;
1881
+ my $nargs = defined $value ? -1 : 0;
1882
+ $self->_add_unknown_field('INFO',$key,$nargs);
1883
+ next;
1884
+ }
1885
+ my $type = $$self{header}{INFO}{$key};
1886
+
1887
+ my @vals = defined $value ? split(/,/, $value) : ();
1888
+ if ( $$type{Number} eq 'G' )
1889
+ {
1890
+ if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $ng, found ".scalar @vals.")"; }
1891
+ }
1892
+ elsif ( $$type{Number} eq 'A' )
1893
+ {
1894
+ if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $na, found ".scalar @vals.")"; }
1895
+ }
1896
+ elsif ( $$type{Number}==0 )
1897
+ {
1898
+ if ( defined($value) ) { push @errs, "INFO tag [$key] did not expect any parameters, got [$value]"; }
1899
+ next;
1900
+ }
1901
+ elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
1902
+ {
1903
+ push @errs, "INFO tag [$key=$value] expected different number of values ($$type{Number})";
1904
+ }
1905
+ if ( !$$type{handler} ) { next; }
1906
+ for my $val (@vals)
1907
+ {
1908
+ my $err = &{$$type{handler}}($self,$val,$$type{default});
1909
+ if ( $err ) { push @errs, $err; }
1910
+ }
1911
+ }
1912
+ if ( !@errs ) { return undef; }
1913
+ return join(',',@errs);
1914
+ }
1915
+
1916
+ =head2 validate_gtype_field
1917
+
1918
+ Usage : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT});
1919
+ Args : The genotype data hashref
1920
+ The ALT arrayref
1921
+ Returns : Error message in case of an error.
1922
+
1923
+ =cut
1924
+
1925
+ sub validate_gtype_field
1926
+ {
1927
+ my ($self,$data,$alts,$format) = @_;
1928
+
1929
+ # Expected numbers
1930
+ my $ng = -1;
1931
+ my $na = -1;
1932
+ if ( $$self{version}>4.0 )
1933
+ {
1934
+ if ( $$alts[0] eq '.' ) { $ng=1; $na=1; }
1935
+ else
1936
+ {
1937
+ $na = @$alts;
1938
+ $ng = (1+$na+1)*($na+1)/2;
1939
+ }
1940
+ }
1941
+
1942
+ my @errs;
1943
+ while (my ($key,$value) = each %$data)
1944
+ {
1945
+ if ( !exists($$self{header}{FORMAT}{$key}) )
1946
+ {
1947
+ push @errs, "FORMAT tag [$key] not listed in the header" unless $$self{version}<3.3;
1948
+ $self->_add_unknown_field('FORMAT',$key,-1);
1949
+ next;
1950
+ }
1951
+ my $type = $$self{header}{FORMAT}{$key};
1952
+
1953
+ my @vals = split(/,/, $value);
1954
+ if ( $$type{Number} eq 'G' )
1955
+ {
1956
+ if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $ng, found ".scalar @vals.")"; }
1957
+ }
1958
+ elsif ( $$type{Number} eq 'A' )
1959
+ {
1960
+ if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $na, found ".scalar @vals.")"; }
1961
+ }
1962
+ elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} )
1963
+ {
1964
+ push @errs, "FORMAT tag [$key] expected different number of values ($$type{Number})";
1965
+ }
1966
+ if ( !$$type{handler} ) { next; }
1967
+ for my $val (@vals)
1968
+ {
1969
+ my $err = &{$$type{handler}}($self,$val,$$type{default});
1970
+ if ( $err ) { push @errs, $err; }
1971
+ }
1972
+ }
1973
+ if ( !exists($$data{GT}) ) { push @errs, "The mandatory tag GT not present." unless $$self{ignore_missing_GT}; }
1974
+ else
1975
+ {
1976
+ my $buf = $$data{GT};
1977
+ while ($buf ne '')
1978
+ {
1979
+ my $al = $buf;
1980
+ if ( $buf=~$$self{regex_gtsep} )
1981
+ {
1982
+ $al = $`;
1983
+ $buf = $';
1984
+ if ( $buf eq '' ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
1985
+ }
1986
+ else
1987
+ {
1988
+ $buf = '';
1989
+ }
1990
+
1991
+ if ( !defined $al ) { push @errs, "Unable to parse the GT field [$$data{GT}]."; last; }
1992
+ if ( $al eq '.' ) { next; }
1993
+ if ( $al eq '0' ) { next; }
1994
+ if ( !($al=~/^[0-9]+$/) ) { push @errs, "Unable to parse the GT field [$$data{GT}], expected integer."; last; }
1995
+ if ( !exists($$alts[$al-1]) ) { push @errs, "Bad ALT value in the GT field, the index [$al] out of bounds [$$data{GT}]."; last; }
1996
+ }
1997
+ }
1998
+ if ( !@errs ) { return undef; }
1999
+ return join(',',@errs);
2000
+ }
2001
+
2002
+
2003
+ sub validate_ref_field
2004
+ {
2005
+ my ($self,$ref) = @_;
2006
+ if ( !($ref=~/^[ACGTN]$/) ) { return "Expected one of A,C,G,T,N, got [$ref]\n"; }
2007
+ return undef;
2008
+ }
2009
+
2010
+ sub validate_int
2011
+ {
2012
+ my ($self,$value,$default) = @_;
2013
+
2014
+ if ( defined($default) && $value eq $default ) { return undef; }
2015
+ if ( $value =~ /^-?\d+$/ ) { return undef; }
2016
+ return "Could not validate the int [$value]";
2017
+ }
2018
+
2019
+ sub validate_float
2020
+ {
2021
+ my ($self,$value,$default) = @_;
2022
+ if ( defined($default) && $value eq $default ) { return undef; }
2023
+ if ( $value =~ /^-?\d+(?:\.\d*)$/ ) { return undef; }
2024
+ if ( $value =~ /^-?\d*(?:\.\d+)$/ ) { return undef; }
2025
+ if ( $value =~ /^-?\d+$/ ) { return undef; }
2026
+ if ( $value =~ /^-?\d*(?:\.?\d+)(?:[Ee][-+]?\d+)?$/ ) { return undef; }
2027
+ return "Could not validate the float [$value]";
2028
+ }
2029
+
2030
+ sub validate_char
2031
+ {
2032
+ my ($self,$value,$default) = @_;
2033
+
2034
+ if ( defined($default) && $value eq $default ) { return undef; }
2035
+ if ( length($value)==1) { return undef; }
2036
+ return "Could not validate the char value [$value]";
2037
+ }
2038
+
2039
+
2040
+ =head2 run_validation
2041
+
2042
+ About : Validates the VCF file.
2043
+ Usage : my $vcf = Vcf->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz');
2044
+ Args : File name or file handle.
2045
+
2046
+ =cut
2047
+
2048
+ sub run_validation
2049
+ {
2050
+ my ($self) = @_;
2051
+
2052
+ $self->parse_header();
2053
+ $self->validate_header();
2054
+
2055
+ if ( !exists($$self{header}) ) { $self->warn(qq[The header not present.\n]); }
2056
+ elsif ( !exists($$self{header}{fileformat}) )
2057
+ {
2058
+ $self->warn(qq[The "fileformat" field not present in the header, assuming VCFv$$self{version}\n]);
2059
+ }
2060
+ elsif ( $$self{header_lines}[0]{key} ne 'fileformat' )
2061
+ {
2062
+ $self->warn(qq[The "fileformat" not the first line in the header\n]);
2063
+ }
2064
+ if ( !exists($$self{columns}) ) { $self->warn("No column descriptions found.\n"); }
2065
+
2066
+ my $default_qual = $$self{defaults}{QUAL};
2067
+ my $warn_sorted=1;
2068
+ my $warn_duplicates = exists($$self{warn_duplicates}) ? $$self{warn_duplicates} : 1;
2069
+ my ($prev_chrm,$prev_pos);
2070
+ while (my $x=$self->next_data_hash())
2071
+ {
2072
+ $self->validate_line($x);
2073
+
2074
+ # Is the position numeric?
2075
+ if ( !($$x{POS}=~/^\d+$/) ) { $self->warn("Expected integer for the position at $$x{CHROM}:$$x{POS}\n"); }
2076
+
2077
+ if ( $warn_duplicates )
2078
+ {
2079
+ if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos eq $$x{POS} )
2080
+ {
2081
+ $self->warn("Warning: Duplicate entries, for example $$x{CHROM}:$$x{POS}\n");
2082
+ $warn_duplicates = 0;
2083
+ }
2084
+ }
2085
+
2086
+ # Is the file sorted?
2087
+ if ( $warn_sorted )
2088
+ {
2089
+ if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos > $$x{POS} )
2090
+ {
2091
+ $self->warn("Warning: The file is not sorted, for example $$x{CHROM}:$$x{POS} comes after $prev_chrm:$prev_pos\n");
2092
+ $warn_sorted = 0;
2093
+ }
2094
+ $prev_chrm = $$x{CHROM};
2095
+ $prev_pos = $$x{POS};
2096
+ }
2097
+
2098
+ # The reference base: one of A,C,G,T,N, non-empty.
2099
+ my $err = $self->validate_ref_field($$x{REF});
2100
+ if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
2101
+
2102
+ # The ALT field (alternate non-reference base)
2103
+ $err = $self->validate_alt_field($$x{ALT},$$x{REF});
2104
+ if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
2105
+
2106
+ # The QUAL field
2107
+ my $ret = $self->validate_float($$x{QUAL},$default_qual);
2108
+ if ( $ret ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} .. $ret\n"); }
2109
+ elsif ( $$x{QUAL}=~/^-?\d+$/ && $$x{QUAL}<-1 ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} is negative .. $$x{QUAL}\n"); }
2110
+
2111
+ # The FILTER field
2112
+ $err = $self->validate_filter_field($$x{FILTER});
2113
+ if ( $err ) { $self->warn("FILTER field at $$x{CHROM}:$$x{POS} .. $err\n"); }
2114
+
2115
+ # The INFO field
2116
+ $err = $self->validate_info_field($$x{INFO},$$x{ALT});
2117
+ if ( $err ) { $self->warn("INFO field at $$x{CHROM}:$$x{POS} .. $err\n"); }
2118
+
2119
+ while (my ($gt,$data) = each %{$$x{gtypes}})
2120
+ {
2121
+ $err = $self->validate_gtype_field($data,$$x{ALT},$$x{FORMAT});
2122
+ if ( $err ) { $self->warn("column $gt at $$x{CHROM}:$$x{POS} .. $err\n"); }
2123
+ }
2124
+
2125
+ if ( scalar keys %{$$x{gtypes}} && (exists($$x{INFO}{AN}) || exists($$x{INFO}{AC})) )
2126
+ {
2127
+ my $nalt = scalar @{$$x{ALT}};
2128
+ if ( $nalt==1 && $$x{ALT}[0] eq '.' ) { $nalt=0; }
2129
+ my ($an,$ac) = $self->calc_an_ac($$x{gtypes},$nalt); # Allow alleles in ALT which are absent in samples
2130
+ if ( exists($$x{INFO}{AN}) && $an ne $$x{INFO}{AN} )
2131
+ {
2132
+ $self->warn("$$x{CHROM}:$$x{POS} .. AN is $$x{INFO}{AN}, should be $an\n");
2133
+ }
2134
+ if ( exists($$x{INFO}{AC}) && $ac ne $$x{INFO}{AC} )
2135
+ {
2136
+ $self->warn("$$x{CHROM}:$$x{POS} .. AC is $$x{INFO}{AC}, should be $ac\n");
2137
+ }
2138
+ }
2139
+ }
2140
+ }
2141
+
2142
+
2143
+ =head2 get_chromosomes
2144
+
2145
+ About : Get list of chromosomes from the VCF file. Must be bgzipped and tabix indexed.
2146
+ Usage : my $vcf = Vcf->new(); $vcf->get_chromosomes();
2147
+ Args : none
2148
+
2149
+ =cut
2150
+
2151
+ sub get_chromosomes
2152
+ {
2153
+ my ($self) = @_;
2154
+ if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); }
2155
+ my (@out) = `tabix -l $$self{file}`;
2156
+ if ( $? )
2157
+ {
2158
+ $self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]);
2159
+ }
2160
+ for (my $i=0; $i<@out; $i++) { chomp($out[$i]); }
2161
+ return \@out;
2162
+ }
2163
+
2164
+
2165
+ =head2 get_samples
2166
+
2167
+ About : Get list of samples.
2168
+ Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples();
2169
+ Args : none
2170
+
2171
+ =cut
2172
+
2173
+ sub get_samples
2174
+ {
2175
+ my ($self) = @_;
2176
+ my $n = @{$$self{columns}} - 1;
2177
+ return (@{$$self{columns}}[9..$n]);
2178
+ }
2179
+
2180
+
2181
+ #------------------------------------------------
2182
+ # Version 3.2 specific functions
2183
+
2184
+ package Vcf3_2;
2185
+ use base qw(VcfReader);
2186
+
2187
+ sub new
2188
+ {
2189
+ my ($class,@args) = @_;
2190
+ my $self = $class->SUPER::new(@args);
2191
+ bless $self, ref($class) || $class;
2192
+
2193
+ $$self{_defaults} =
2194
+ {
2195
+ version => '3.2',
2196
+ drop_trailings => 1,
2197
+ filter_passed => 0,
2198
+
2199
+ defaults =>
2200
+ {
2201
+ QUAL => '-1',
2202
+ default => '.',
2203
+ Flag => undef,
2204
+ GT => '.',
2205
+ },
2206
+
2207
+ handlers =>
2208
+ {
2209
+ Integer => \&VcfReader::validate_int,
2210
+ Float => \&VcfReader::validate_float,
2211
+ Character => \&VcfReader::validate_char,
2212
+ String => undef,
2213
+ Flag => undef,
2214
+ },
2215
+
2216
+ regex_snp => qr/^[ACGTN]$/i,
2217
+ regex_ins => qr/^I[ACGTN]+$/,
2218
+ regex_del => qr/^D\d+$/,
2219
+ regex_gtsep => qr{[\\|/]},
2220
+ regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
2221
+ regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)},
2222
+ };
2223
+
2224
+ for my $key (keys %{$$self{_defaults}})
2225
+ {
2226
+ $$self{$key}=$$self{_defaults}{$key};
2227
+ }
2228
+
2229
+
2230
+ return $self;
2231
+ }
2232
+
2233
+
2234
+ #------------------------------------------------
2235
+ # Version 3.3 specific functions
2236
+
2237
+ package Vcf3_3;
2238
+ use base qw(VcfReader);
2239
+
2240
+ sub new
2241
+ {
2242
+ my ($class,@args) = @_;
2243
+ my $self = $class->SUPER::new(@args);
2244
+ bless $self, ref($class) || $class;
2245
+
2246
+ $$self{_defaults} =
2247
+ {
2248
+ version => '3.3',
2249
+ drop_trailings => 0,
2250
+ filter_passed => 0,
2251
+
2252
+ defaults =>
2253
+ {
2254
+ QUAL => '-1',
2255
+ Integer => '-1',
2256
+ Float => '-1',
2257
+ Character => '.',
2258
+ String => '.',
2259
+ Flag => undef,
2260
+ GT => './.',
2261
+ default => '.',
2262
+ },
2263
+
2264
+ handlers =>
2265
+ {
2266
+ Integer => \&VcfReader::validate_int,
2267
+ Float => \&VcfReader::validate_float,
2268
+ Character => \&VcfReader::validate_char,
2269
+ String => undef,
2270
+ Flag => undef,
2271
+ },
2272
+
2273
+ regex_snp => qr/^[ACGTN]$/i,
2274
+ regex_ins => qr/^I[ACGTN]+$/,
2275
+ regex_del => qr/^D\d+$/,
2276
+ regex_gtsep => qr{[\\|/]},
2277
+ regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$},
2278
+ regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, # . 0/1 0|1 A/A A|A D4/IACGT
2279
+ };
2280
+
2281
+ for my $key (keys %{$$self{_defaults}})
2282
+ {
2283
+ $$self{$key}=$$self{_defaults}{$key};
2284
+ }
2285
+
2286
+ return $self;
2287
+ }
2288
+
2289
+
2290
+ #------------------------------------------------
2291
+ # Version 4.0 specific functions
2292
+
2293
+ =head1 VCFv4.0
2294
+
2295
+ VCFv4.0 specific functions
2296
+
2297
+ =cut
2298
+
2299
+ package Vcf4_0;
2300
+ use base qw(VcfReader);
2301
+
2302
+ sub new
2303
+ {
2304
+ my ($class,@args) = @_;
2305
+ my $self = $class->SUPER::new(@args);
2306
+ bless $self, ref($class) || $class;
2307
+
2308
+ $$self{_defaults} =
2309
+ {
2310
+ version => '4.0',
2311
+ drop_trailings => 1,
2312
+ filter_passed => 'PASS',
2313
+
2314
+ defaults =>
2315
+ {
2316
+ QUAL => '.',
2317
+ Flag => undef,
2318
+ GT => '.',
2319
+ default => '.',
2320
+ },
2321
+ reserved =>
2322
+ {
2323
+ FILTER => { 0=>1 },
2324
+ },
2325
+
2326
+ handlers =>
2327
+ {
2328
+ Integer => \&VcfReader::validate_int,
2329
+ Float => \&VcfReader::validate_float,
2330
+ Character => \&VcfReader::validate_char,
2331
+ String => undef,
2332
+ Flag => undef,
2333
+ },
2334
+
2335
+ regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
2336
+ regex_ins => qr/^[ACGTN]+$/,
2337
+ regex_del => qr/^[ACGTN]+$/,
2338
+ regex_gtsep => qr{[|/]}, # | /
2339
+ regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
2340
+ regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
2341
+ };
2342
+
2343
+ for my $key (keys %{$$self{_defaults}})
2344
+ {
2345
+ $$self{$key}=$$self{_defaults}{$key};
2346
+ }
2347
+
2348
+ return $self;
2349
+ }
2350
+
2351
+ sub Vcf4_0::format_header_line
2352
+ {
2353
+ my ($self,$rec) = @_;
2354
+
2355
+ my %tmp_rec = ( %$rec );
2356
+ if ( exists($tmp_rec{Number}) && $tmp_rec{Number} eq '-1' ) { $tmp_rec{Number} = '.' }
2357
+ my $value;
2358
+ if ( exists($tmp_rec{ID}) or $tmp_rec{key} eq 'PEDIGREE' )
2359
+ {
2360
+ my %has = ( key=>1, handler=>1, default=>1 ); # Internal keys not to be output
2361
+ my @items;
2362
+ for my $key (qw(ID Number Type Description), sort keys %tmp_rec)
2363
+ {
2364
+ if ( !exists($tmp_rec{$key}) or $has{$key} ) { next; }
2365
+ my $quote = ($key eq 'Description' or $tmp_rec{$key}=~/\s/) ? '"' : '';
2366
+ push @items, "$key=$quote$tmp_rec{$key}$quote";
2367
+ $has{$key}=1;
2368
+ }
2369
+ $value = '<' .join(',',@items). '>';
2370
+ }
2371
+ else { $value = $tmp_rec{value}; }
2372
+
2373
+ my $line = "##$tmp_rec{key}=".$value."\n";
2374
+ return $line;
2375
+ }
2376
+
2377
+ =head2 parse_header_line
2378
+
2379
+ Usage : $vcf->parse_header_line(q[##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">])
2380
+ $vcf->parse_header_line(q[reference=1000GenomesPilot-NCBI36])
2381
+ Args :
2382
+ Returns :
2383
+
2384
+ =cut
2385
+
2386
+ sub Vcf4_0::parse_header_line
2387
+ {
2388
+ my ($self,$line) = @_;
2389
+
2390
+ chomp($line);
2391
+ $line =~ s/^##//;
2392
+
2393
+ if ( !($line=~/^([^=]+)=/) ) { $self->throw("Expected key=value pair in the header: $line\n"); }
2394
+ my $key = $1;
2395
+ my $value = $';
2396
+
2397
+ if ( !($value=~/^<(.+)>\s*$/) )
2398
+ {
2399
+ # Simple sanity check for subtle typos
2400
+ if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' or $key eq 'ALT' )
2401
+ {
2402
+ $self->throw("Hmm, is this a typo? [$key] [$value]");
2403
+ }
2404
+ return { key=>$key, value=>$value };
2405
+ }
2406
+
2407
+ my $rec = { key=>$key };
2408
+ my $tmp = $1;
2409
+ my ($attr_key,$attr_value,$quoted);
2410
+ while ($tmp ne '')
2411
+ {
2412
+ if ( !defined $attr_key )
2413
+ {
2414
+ if ( $tmp=~/^([^=]+)="/ ) { $attr_key=$1; $quoted=1; $tmp=$'; next; }
2415
+ elsif ( $tmp=~/^([^=]+)=/ ) { $attr_key=$1; $quoted=0; $tmp=$'; next; }
2416
+ else { $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); }
2417
+ }
2418
+
2419
+ if ( $tmp=~/^[^,\\"]+/ ) { $attr_value .= $&; $tmp = $'; }
2420
+ if ( $tmp=~/^\\\\/ ) { $attr_value .= '\\\\'; $tmp = $'; next; }
2421
+ if ( $tmp=~/^\\"/ ) { $attr_value .= '\\"'; $tmp = $'; next; }
2422
+ if ( $tmp eq '' or ($tmp=~/^,/ && !$quoted) or $tmp=~/^"/ )
2423
+ {
2424
+ if ( $attr_key=~/^\s+/ or $attr_key=~/\s+$/ or $attr_value=~/^\s+/ or $attr_value=~/\s+$/ )
2425
+ {
2426
+ $self->warn("Leading or trailing space in attr_key-attr_value pairs is discouraged:\n\t[$attr_key] [$attr_value]\n\t$line\n");
2427
+ $attr_key =~ s/^\s+//;
2428
+ $attr_key =~ s/\s+$//;
2429
+ $attr_value =~ s/^\s+//;
2430
+ $attr_value =~ s/\s+$//;
2431
+ }
2432
+ $$rec{$attr_key} = $attr_value;
2433
+ $tmp = $';
2434
+ if ( $quoted && $tmp=~/^,/ ) { $tmp = $'; }
2435
+ $attr_key = $attr_value = $quoted = undef;
2436
+ next;
2437
+ }
2438
+ if ( $tmp=~/^,/ ) { $attr_value .= $&; $tmp = $'; next; }
2439
+ $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]);
2440
+ }
2441
+
2442
+ if ( $key ne 'PEDIGREE' && !exists($$rec{ID}) ) { $self->throw("Missing the ID tag in $line\n"); }
2443
+ if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' )
2444
+ {
2445
+ if ( !exists($$rec{Description}) ) { $self->throw("Missing the Description tag in $line\n"); }
2446
+ }
2447
+ if ( exists($$rec{Number}) && $$rec{Number} eq '-1' ) { $self->warn("The use of -1 for unknown number of values is deprecated, please use '.' instead.\n\t$line\n"); }
2448
+ if ( exists($$rec{Number}) && $$rec{Number} eq '.' ) { $$rec{Number}=-1; }
2449
+
2450
+ return $rec;
2451
+ }
2452
+
2453
+ sub Vcf4_0::validate_ref_field
2454
+ {
2455
+ my ($self,$ref) = @_;
2456
+ if ( !($ref=~/^[ACGTN]+$/) )
2457
+ {
2458
+ my $offending = $ref;
2459
+ $offending =~ s/[ACGTN]+//g;
2460
+ return "Expected combination of A,C,G,T,N for REF, got [$ref], the offending chars were [$offending]\n";
2461
+ }
2462
+ return undef;
2463
+ }
2464
+
2465
+ sub Vcf4_0::validate_alt_field
2466
+ {
2467
+ my ($self,$values,$ref) = @_;
2468
+
2469
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
2470
+
2471
+ my $ret = $self->_validate_alt_field($values,$ref);
2472
+ if ( $ret ) { return $ret; }
2473
+
2474
+ my $ref_len = length($ref);
2475
+ my $ref1 = substr($ref,0,1);
2476
+
2477
+ my @err;
2478
+ my $msg = '';
2479
+ for my $item (@$values)
2480
+ {
2481
+ if ( !($item=~/^[ACTGN]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
2482
+ if ( $item=~/^<[^<>\s]+>$/ ) { next; }
2483
+ if ( $ref_len==length($item) ) { next; }
2484
+ if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference.'; push @err,$item; next; }
2485
+ }
2486
+ if ( !@err ) { return undef; }
2487
+ return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
2488
+ }
2489
+
2490
+
2491
+ =head2 fill_ref_alt_mapping
2492
+
2493
+ About : A tool for merging VCFv4.0 records. The subroutine unifies the REFs and creates a mapping
2494
+ from the original haplotypes to the haplotypes based on the new REF. Consider the following
2495
+ example:
2496
+ REF ALT
2497
+ G GA
2498
+ GT G
2499
+ GT GA
2500
+ GT GAA
2501
+ GTC G
2502
+ G <DEL>
2503
+ my $map={G=>{GA=>1},GT=>{G=>1,GA=>1,GAA=>1},GTC=>{G=>1},G=>{'<DEL>'=>1}};
2504
+ my $new_ref=$vcf->fill_ref_alt_mapping($map);
2505
+
2506
+ The call returns GTC and $map is now
2507
+ G GA -> GTC GATC
2508
+ GT G -> GTC GC
2509
+ GT GA -> GTC GAC
2510
+ GT GAA -> GTC GAAC
2511
+ GTC G -> GTC G
2512
+ G <DEL> -> GTC <DEL>
2513
+ Args :
2514
+ Returns : New REF string and fills the hash with appropriate ALT.
2515
+
2516
+ =cut
2517
+
2518
+ sub Vcf4_0::fill_ref_alt_mapping
2519
+ {
2520
+ my ($self,$map) = @_;
2521
+
2522
+ my $max_len = 0;
2523
+ my $new_ref;
2524
+ for my $ref (keys %$map)
2525
+ {
2526
+ my $len = length($ref);
2527
+ if ( $max_len<$len )
2528
+ {
2529
+ $max_len = $len;
2530
+ $new_ref = $ref;
2531
+ }
2532
+ $$map{$ref}{$ref} = 1;
2533
+ }
2534
+ for my $ref (keys %$map)
2535
+ {
2536
+ my $rlen = length($ref);
2537
+ if ( substr($new_ref,0,$rlen) ne $ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
2538
+ for my $alt (keys %{$$map{$ref}})
2539
+ {
2540
+ # The second part of the regex is for VCF>4.0, but does no harm for v<=4.0
2541
+ if ( $alt=~/^<.+>$/ or $alt=~/\[|\]/ ) { $$map{$ref}{$alt} = $alt; next; }
2542
+ my $new = $alt;
2543
+ if ( $rlen<$max_len ) { $new .= substr($new_ref,$rlen); }
2544
+ $$map{$ref}{$alt} = $new;
2545
+ }
2546
+ }
2547
+ return $new_ref;
2548
+ }
2549
+
2550
+
2551
+
2552
+ sub Vcf4_0::event_type
2553
+ {
2554
+ my ($self,$rec,$allele) = @_;
2555
+
2556
+ my $ref = $rec;
2557
+ if ( ref($rec) eq 'HASH' )
2558
+ {
2559
+ if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
2560
+ $ref = $$rec{REF};
2561
+ }
2562
+
2563
+ if ( $allele=~/^<[^>]+>$/ )
2564
+ {
2565
+ if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['u',0,$allele]; }
2566
+ return ('u',0,$allele);
2567
+ }
2568
+ if ( $allele eq '.' )
2569
+ {
2570
+ if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['r',0,$ref]; }
2571
+ return ('r',0,$ref);
2572
+ }
2573
+
2574
+ my $reflen = length($ref);
2575
+ my $len = length($allele);
2576
+
2577
+ my $ht;
2578
+ my $type;
2579
+ if ( $len==$reflen )
2580
+ {
2581
+ # This can be a reference, a SNP, or multiple SNPs
2582
+ my $mism = 0;
2583
+ for (my $i=0; $i<$len; $i++)
2584
+ {
2585
+ if ( substr($ref,$i,1) ne substr($allele,$i,1) ) { $mism++; }
2586
+ }
2587
+ if ( $mism==0 ) { $type='r'; $len=0; }
2588
+ else { $type='s'; $len=$mism; }
2589
+ }
2590
+ else
2591
+ {
2592
+ ($len,$ht)=is_indel($ref,$allele);
2593
+ if ( $len )
2594
+ {
2595
+ # Indel
2596
+ $type = 'i';
2597
+ $allele = $ht;
2598
+ }
2599
+ else
2600
+ {
2601
+ $type = 'o'; $len = $len>$reflen ? $len-1 : $reflen-1;
2602
+ }
2603
+ }
2604
+
2605
+ if ( ref($rec) eq 'HASH' )
2606
+ {
2607
+ $$rec{_cached_events}{$allele} = [$type,$len,$allele];
2608
+ }
2609
+ return ($type,$len,$allele);
2610
+ }
2611
+
2612
+ # The sequences start at the same position, which simplifies things greatly.
2613
+ sub is_indel
2614
+ {
2615
+ my ($seq1,$seq2) = @_;
2616
+
2617
+ my $len1 = length($seq1);
2618
+ my $len2 = length($seq2);
2619
+ if ( $len1 eq $len2 ) { return (0,''); }
2620
+
2621
+ my ($del,$len,$LEN);
2622
+ if ( $len1<$len2 )
2623
+ {
2624
+ $len = $len1;
2625
+ $LEN = $len2;
2626
+ $del = 1;
2627
+ }
2628
+ else
2629
+ {
2630
+ $len = $len2;
2631
+ $LEN = $len1;
2632
+ $del = -1;
2633
+ my $tmp=$seq1; $seq1=$seq2; $seq2=$tmp;
2634
+ }
2635
+
2636
+ my $ileft;
2637
+ for ($ileft=0; $ileft<$len; $ileft++)
2638
+ {
2639
+ if ( substr($seq1,$ileft,1) ne substr($seq2,$ileft,1) ) { last; }
2640
+ }
2641
+ if ( $ileft==$len )
2642
+ {
2643
+ return ($del*($LEN-$len), substr($seq2,$ileft));
2644
+ }
2645
+
2646
+ my $iright;
2647
+ for ($iright=0; $iright<$len; $iright++)
2648
+ {
2649
+ if ( substr($seq1,$len-$iright,1) ne substr($seq2,$LEN-$iright,1) ) { last; }
2650
+ }
2651
+ if ( $iright+$ileft<=$len ) { return (0,''); }
2652
+
2653
+ return ($del*($LEN-$len),substr($seq2,$ileft,$LEN-$len));
2654
+ }
2655
+
2656
+
2657
+ #------------------------------------------------
2658
+ # Version 4.1 specific functions
2659
+
2660
+ =head1 VCFv4.1
2661
+
2662
+ VCFv4.1 specific functions
2663
+
2664
+ =cut
2665
+
2666
+ package Vcf4_1;
2667
+ use base qw(Vcf4_0);
2668
+
2669
+ sub new
2670
+ {
2671
+ my ($class,@args) = @_;
2672
+ my $self = $class->SUPER::new(@args);
2673
+ bless $self, ref($class) || $class;
2674
+
2675
+ $$self{_defaults} =
2676
+ {
2677
+ version => '4.1',
2678
+ drop_trailings => 1,
2679
+ filter_passed => 'PASS',
2680
+
2681
+ defaults =>
2682
+ {
2683
+ QUAL => '.',
2684
+ Flag => undef,
2685
+ GT => '.',
2686
+ default => '.',
2687
+ },
2688
+ reserved =>
2689
+ {
2690
+ FILTER => { 0=>1 },
2691
+ },
2692
+
2693
+ handlers =>
2694
+ {
2695
+ Integer => \&VcfReader::validate_int,
2696
+ Float => \&VcfReader::validate_float,
2697
+ Character => \&VcfReader::validate_char,
2698
+ String => undef,
2699
+ Flag => undef,
2700
+ },
2701
+
2702
+ regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i,
2703
+ regex_ins => qr/^[ACGTN]+$/i,
2704
+ regex_del => qr/^[ACGTN]+$/i,
2705
+ regex_gtsep => qr{[|/]}, # | /
2706
+ regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1
2707
+ regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0|<DEL:ME:ALU>
2708
+ };
2709
+
2710
+ $$self{ignore_missing_GT} = 1;
2711
+
2712
+ for my $key (keys %{$$self{_defaults}})
2713
+ {
2714
+ $$self{$key}=$$self{_defaults}{$key};
2715
+ }
2716
+
2717
+ return $self;
2718
+ }
2719
+
2720
+ sub Vcf4_1::validate_header
2721
+ {
2722
+ my ($self) = @_;
2723
+ my $lines = $self->get_header_line(key=>'reference');
2724
+ if ( !@$lines ) { $self->warn("The header tag 'reference' not present. (Not required but highly recommended.)\n"); }
2725
+ }
2726
+
2727
+ sub Vcf4_1::validate_line
2728
+ {
2729
+ my ($self,$line) = @_;
2730
+
2731
+ if ( !$$self{_contig_validated}{$$line{CHROM}} )
2732
+ {
2733
+ my $lines = $self->get_header_line(key=>'contig',ID=>$$line{CHROM});
2734
+ if ( !@$lines ) { $self->warn("The header tag 'contig' not present for CHROM=$$line{CHROM}. (Not required but highly recommended.)\n"); }
2735
+ $$self{_contig_validated}{$$line{CHROM}} = 1;
2736
+ }
2737
+
2738
+ # Is the ID composed of alphanumeric chars
2739
+ if ( !($$line{ID}=~/^\S+$/) ) { $self->warn("Expected non-whitespace ID at $$line{CHROM}:$$line{POS}, but got [$$line{ID}]\n"); }
2740
+ }
2741
+
2742
+ sub Vcf4_1::validate_alt_field
2743
+ {
2744
+ my ($self,$values,$ref) = @_;
2745
+
2746
+ if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
2747
+
2748
+ my $ret = $self->_validate_alt_field($values,$ref);
2749
+ if ( $ret ) { return $ret; }
2750
+
2751
+ my $ref_len = length($ref);
2752
+ my $ref1 = substr($ref,0,1);
2753
+
2754
+ my @err;
2755
+ my $msg = '';
2756
+ for my $item (@$values)
2757
+ {
2758
+ if ( $item=~/^(.*)\[(.+)\[(.*)$/ or $item=~/^(.*)\](.+)\](.*)$/ )
2759
+ {
2760
+ if ( $1 ne '' && $3 ne '' ) { $msg=', two replacement strings given (expected one)'; push @err,$item; next; }
2761
+ my $rpl;
2762
+ if ( $1 ne '' )
2763
+ {
2764
+ $rpl = $1;
2765
+ if ( $rpl ne '.' )
2766
+ {
2767
+ my $rref = substr($rpl,0,1);
2768
+ if ( $rref ne $ref1 ) { $msg=', the first base of the replacement string does not match the reference'; push @err,$item; next; }
2769
+ }
2770
+ }
2771
+ else
2772
+ {
2773
+ $rpl = $3;
2774
+ if ( $rpl ne '.' )
2775
+ {
2776
+ my $rref = substr($rpl,-1,1);
2777
+ if ( $rref ne $ref1 ) { $msg=', the last base of the replacement string does not match the reference'; push @err,$item; next; }
2778
+ }
2779
+ }
2780
+ my $pos = $2;
2781
+ if ( !($rpl=~/^[ACTGNacgtn]+$/) && $rpl ne '.' ) { $msg=', replacement string not valid (expected [ACTGNacgtn]+)'; push @err,$item; next; }
2782
+ if ( !($pos=~/^\S+:\d+$/) ) { $msg=', cannot parse sequence:position'; push @err,$item; next; }
2783
+ next;
2784
+ }
2785
+ if ( $item=~/^\.([ACTGNactgn])[ACTGNactgn]*$/ )
2786
+ {
2787
+ if ( $ref1 ne $1 ) { $msg=', first base does not match the reference'; push @err,$item; }
2788
+ next;
2789
+ }
2790
+ elsif ( $item=~/^[ACTGNactgn]*([ACTGNactgn])\.$/ )
2791
+ {
2792
+ if ( substr($ref,-1,1) ne $1 ) { $msg=', last base does not match the reference'; push @err,$item; }
2793
+ next;
2794
+ }
2795
+ if ( !($item=~/^[ACTGNactgn]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
2796
+ if ( $item=~/^<[^<>\s]+>$/ ) { next; }
2797
+ if ( $ref_len==length($item) ) { next; }
2798
+ if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference'; push @err,$item; next; }
2799
+ }
2800
+ if ( !@err ) { return undef; }
2801
+ return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg;
2802
+ }
2803
+
2804
+ sub Vcf4_1::next_data_hash
2805
+ {
2806
+ my ($self,@args) = @_;
2807
+
2808
+ my $out = $self->SUPER::next_data_hash(@args);
2809
+ if ( !defined $out ) { return $out; }
2810
+
2811
+ # Case-insensitive ALT and REF bases
2812
+ $$out{REF} = uc($$out{REF});
2813
+ my $nalt = @{$$out{ALT}};
2814
+ for (my $i=0; $i<$nalt; $i++)
2815
+ {
2816
+ if ( $$out{ALT}[$i]=~/^</ ) { next; }
2817
+ $$out{ALT}[$i] = uc($$out{ALT}[$i]);
2818
+ }
2819
+
2820
+ return $out;
2821
+ }
2822
+
2823
+ sub Vcf4_1::next_data_array
2824
+ {
2825
+ my ($self,@args) = @_;
2826
+
2827
+ my $out = $self->SUPER::next_data_array(@args);
2828
+ if ( !defined $out or $$self{assume_uppercase} ) { return $out; }
2829
+
2830
+ # Case-insensitive ALT and REF bases
2831
+ $$out[3] = uc($$out[3]);
2832
+ my $alt = $$out[4];
2833
+ $$out[4] = '';
2834
+ while ($alt=~/[^<>]+/)
2835
+ {
2836
+ $$out[4] .= $`;
2837
+ $$out[4] .= (length($`) && substr($`,-1,1) eq '<' ) ? $& : uc($&);
2838
+ $alt = $';
2839
+ if ( $alt=~/^<[^<>]+>/ ) { $$out[4] .= $&; $alt=$'; }
2840
+ }
2841
+
2842
+ return $out;
2843
+ }
2844
+
2845
+ sub Vcf4_1::event_type
2846
+ {
2847
+ my ($self,$rec,$allele) = @_;
2848
+ if ( $allele=~/\[|\]|^\..+|.+\.$/ ) { return 'b'; }
2849
+ return $self->SUPER::event_type($rec,$allele);
2850
+ }
2851
+
2852
+ 1;
2853
+