ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1099 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # Author: petr.danecek@sanger
4
+ #
5
+
6
+ use strict;
7
+ use warnings;
8
+ use Carp;
9
+ use Vcf;
10
+
11
+ my %filters =
12
+ (
13
+ MinAB => { dflt=>2, usage=>'INT', desc=>'Minimum number of alternate bases (INFO/DP4)', nick=>'a' },
14
+ SnpCluster => { dflt=>undef, usage=>'INT1,INT2', desc=>"Filters clusters of 'INT1' or more SNPs within a run of 'INT2' bases", nick=>'c' },
15
+ MinDP => { dflt=>2, usage=>'INT', desc=>"Minimum read depth (INFO/DP or INFO/DP4)", nick=>'d' },
16
+ MaxDP => { dflt=>10_000_000, usage=>'INT', desc=>"Maximum read depth (INFO/DP or INFO/DP4)", nick=>'D' },
17
+ MinMQ => { dflt=>10, usage=>'INT', desc=>"Minimum RMS mapping quality for SNPs (INFO/MQ)", nick=>'q' },
18
+ SnpGap => { dflt=>10, usage=>'INT', desc=>"SNP within INT bp around a gap to be filtered", nick=>'w' },
19
+ GapWin => { dflt=>3, usage=>'INT', desc=>"Window size for filtering adjacent gaps", nick=>'W' },
20
+ StrandBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for strand bias (INFO/PV4)", nick=>'1' },
21
+ BaseQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for baseQ bias (INFO/PV4)", nick=>'2' },
22
+ MapQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for mapQ bias (INFO/PV4)", nick=>'3' },
23
+ EndDistBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for end distance bias (INFO/PV4)", nick=>'4' },
24
+ RefN => { dflt=>'', usage=>'', desc=>"Reference base is N", nick=>'r' },
25
+ Qual => { dflt=>'10', usage=>'INT', desc=>"Minimum value of the QUAL field", nick=>'Q' },
26
+ VDB => { dflt=>'0.015', usage=>'FLOAT', desc=>"Minimum Variant Distance Bias (INFO/VDB)", nick=>'v' },
27
+ HWE => { dflt=>1e-4, usage=>'FLOAT', desc=>"Minimum P-value for HWE (plus F<0) (INFO/HWE and INFO/G3)", nick=>'H' },
28
+ );
29
+
30
+ my $opts = parse_params();
31
+ annotate($opts);
32
+
33
+ exit;
34
+
35
+ #--------------------------------
36
+
37
+ sub error
38
+ {
39
+ my (@msg) = @_;
40
+ if ( scalar @msg ) { confess @msg; }
41
+
42
+ my @filters;
43
+ for my $key (sort {lc($filters{$a}{nick}) cmp lc($filters{$b}{nick})} keys %filters)
44
+ {
45
+ push @filters, sprintf("\t%s, %-25s\t\t%s [%s]\n", $filters{$key}{nick},$key.' '.$filters{$key}{usage},$filters{$key}{desc},defined($filters{$key}{dflt})? $filters{$key}{dflt} : '');
46
+ }
47
+
48
+ print
49
+ "About: Annotates VCF file, adding filters or custom annotations. Requires tabix indexed file with annotations.\n",
50
+ " Currently annotates only the ID and INFO column, but it will be extended on popular demand.\n",
51
+ "Usage: cat in.vcf | vcf-annotate [OPTIONS] > out.vcf\n",
52
+ "Options:\n",
53
+ " -a, --annotations <file.gz> The tabix indexed file with the annotations: CHR\\tFROM[\\tTO][\\tVALUE]+.\n",
54
+ " -c, --columns <list> The list of columns in the annotation file, e.g. CHROM,FROM,TO,-,INFO/STR,INFO/GN. The dash\n",
55
+ " in this example indicates that the third column should be ignored. If TO is not\n",
56
+ " present, it is assumed that TO equals to FROM. When REF and ALT columns are present, only\n",
57
+ " matching lines are annotated.\n",
58
+ " -d, --description <file|string> Header annotation, e.g. key=INFO,ID=HM2,Number=0,Type=Flag,Description='HapMap2 membership'.\n",
59
+ " The descriptions can be read from a file, one annotation per line.\n",
60
+ " -f, --filter <file|list> Apply filters, list is in the format flt1=value/flt2/flt3=value/etc. If argument to -f is a file,\n",
61
+ " user-defined filters be applied. See User Defined Filters below.\n",
62
+ " -h, -?, --help This help message.\n",
63
+ "Filters:\n",
64
+ sprintf("\t+ %-25s\t\tApply all filters with default values (can be overriden, see the example below).\n",''),
65
+ sprintf("\t-X %-25s\t\tExclude the filter X\n",''),
66
+ join('',@filters),
67
+ "Examples:\n",
68
+ " zcat in.vcf.gz | vcf-annotate -a annotations.gz -d descriptions.txt -c FROM,TO,CHROM,ID,INFO/DP | bgzip -c >out.vcf.gz \n",
69
+ " zcat in.vcf.gz | vcf-annotate -f +/-a/c=3,10/q=3/d=5/-D -a annotations.gz -d descriptions.txt | bgzip -c >out.vcf.gz \n",
70
+ " zcat in.vcf.gz | vcf-annotate -a dbSNPv132.tab.gz -c CHROM,FROM,REF,ALT,ID,-,-,- | bgzip -c >out.vcf.gz \n",
71
+ "Where descriptions.txt contains:\n",
72
+ " key=INFO,ID=GN,Number=1,Type=String,Description='Gene Name'\n",
73
+ " key=INFO,ID=STR,Number=1,Type=Integer,Description='Strand'\n",
74
+ "The file dbSNPv132.tab.gz with dbSNP IDs can be downloaded from\n",
75
+ " ftp://ftp.sanger.ac.uk/pub/1000genomes/pd3/dbSNP/\n",
76
+ "\n",
77
+ "User-defined filters:\n",
78
+ " # The examples below are self-explanatory. Notice the use of the predefined\n",
79
+ " # variables (\$MATCH, \$PASS, \$FAIL) and methods (error).\n",
80
+ " #\n",
81
+ " # In this example, a minimum value of AF1=0.1 is required\n",
82
+ " {\n",
83
+ " tag => 'INFO/AF1', # The VCF tag to apply this filter on\n",
84
+ " name => 'MinAF', # The filter ID\n",
85
+ " desc => 'Minimum AF1 [0.01]', # Description for the VCF header\n",
86
+ " test => sub { return \$MATCH < 0.01 ? \$FAIL : \$PASS },\n",
87
+ " },\n",
88
+ " # Filter all indels (presence of INDEL tag is tested)\n",
89
+ " {\n",
90
+ " tag => 'INFO/INDEL',\n",
91
+ " apply_to => 'indels', # Can be one of SNPs, indels, all. Default: [All]\n",
92
+ " name => 'Indel',\n",
93
+ " desc => 'INDEL tag present',\n",
94
+ " test => sub { return \$FAIL },\n",
95
+ " },\n",
96
+ " # Only loci with enough reads supporting the variant will pass the filter\n",
97
+ " {\n",
98
+ " tag => 'INFO/DP4',\n",
99
+ " name => 'FewAlts',\n",
100
+ " desc => 'Too few reads supporting the variant',\n",
101
+ " apply_to => 'SNPs',\n",
102
+ " test => sub {\n",
103
+ " if ( !(\$MATCH =~ /^([^,]+),([^,]+),([^,]+),(.+)\$/) )\n",
104
+ " {\n",
105
+ " error(\"Could not parse INFO/DP4: \$CHROM:\$POS [\$MATCH]\");\n",
106
+ " }\n",
107
+ " if ( 0.1*(\$1+\$2) > \$3+\$4 ) { return \$PASS; }\n",
108
+ " return \$FAIL;\n",
109
+ " },\n",
110
+ " },\n",
111
+ "\n";
112
+ exit -1;
113
+ }
114
+
115
+ sub parse_params
116
+ {
117
+ my $opts = { args=>[$0, @ARGV], };
118
+ while (defined(my $arg=shift(@ARGV)))
119
+ {
120
+ if ( $arg eq '-d' || $arg eq '--description' )
121
+ {
122
+ my $desc = shift(@ARGV);
123
+ if ( -e $desc )
124
+ {
125
+ open(my $fh,'<',$desc) or error("$desc: $!");
126
+ while (my $line=<$fh>)
127
+ {
128
+ if ( $line=~/^\s*$/ or $line=~/^#/ ) { next; }
129
+ chomp($line);
130
+ push @{$$opts{desc}}, $line;
131
+ }
132
+ close($fh);
133
+ }
134
+ else
135
+ {
136
+ push @{$$opts{desc}}, $desc;
137
+ }
138
+ next;
139
+ }
140
+ if ( $arg eq '-f' || $arg eq '--filter' )
141
+ {
142
+ my $filter = shift(@ARGV);
143
+ parse_filters($opts,$filter);
144
+ next;
145
+ }
146
+ if ( $arg eq '-c' || $arg eq '--columns' )
147
+ {
148
+ my $cols = shift(@ARGV);
149
+ $$opts{cols} = [ split(/,/,$cols) ];
150
+ next;
151
+ }
152
+ if ( $arg eq '-a' || $arg eq '--annotations' ) { $$opts{annotations} = shift(@ARGV); next }
153
+ if ( $arg eq '-t' || $arg eq '--tag' ) { $$opts{tag} = shift(@ARGV); next }
154
+ if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
155
+ if ( -e $arg ) { $$opts{file}=$arg; next; }
156
+ error("Unknown parameter \"$arg\". Run -h for help.\n");
157
+ }
158
+ if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) )
159
+ {
160
+ if ( !exists($$opts{annotations}) ) { error("Missing the -a or -f option.\n") }
161
+ }
162
+ if ( exists($$opts{annotations}) && !exists($$opts{cols}) ) { error("Missing the -c option.\n"); }
163
+ return $opts;
164
+ }
165
+
166
+ sub parse_user_defined_filters
167
+ {
168
+ my ($opts,$str) = @_;
169
+ my $filters = [ do $str ];
170
+ for my $filter (@$filters)
171
+ {
172
+ if ( !exists($$filter{tag}) ) { error("Missing 'tag' key for one of the filters in $str\n"); }
173
+ if ( !($$filter{tag}=~m{^INFO/(.+)$}) ) { error("Currently only INFO tags supported. Could not parse the tag [$$filter{tag}]\n"); }
174
+ my $ID = $1;
175
+
176
+ $$filter{ID} = $ID;
177
+ $$filter{tag_re} = qr{$ID=?([^\t;]*)};
178
+
179
+ if ( !exists($$filter{name}) ) { error("Missing 'name' key for the filter [$$filter{tag}]\n"); }
180
+ if ( !exists($$filter{desc}) ) { error("Missing 'desc' key for the filter [$$filter{tag}]\n"); }
181
+
182
+ my $name = $$filter{name};
183
+ push @{$$opts{desc}}, "key=FILTER,ID=$name,Description='$$filter{desc}";
184
+ if ( !exists($$filter{apply_to}) or lc($$filter{apply_to}) eq 'all' )
185
+ {
186
+ $$opts{udef_filters}{'all'}{$name} = $filter;
187
+ $$opts{udef_filters}{'s'}{$name} = $filter;
188
+ $$opts{udef_filters}{'i'}{$name} = $filter;
189
+ }
190
+ elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'snps' )
191
+ {
192
+ $$opts{udef_filters}{'s'}{$name} = $filter;
193
+ $$opts{udef_filters_typecheck_needed} = 1;
194
+ }
195
+ elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'indels' )
196
+ {
197
+ $$opts{udef_filters}{'i'}{$name} = $filter;
198
+ $$opts{udef_filters_typecheck_needed} = 1;
199
+ }
200
+ }
201
+ }
202
+
203
+ sub parse_filters
204
+ {
205
+ my ($opts,$str) = @_;
206
+
207
+ if ( -e $str )
208
+ {
209
+ parse_user_defined_filters($opts,$str);
210
+ return;
211
+ }
212
+
213
+ my $has_filters = 0;
214
+ my $set_defaults = 0;
215
+ my @filters = split(m{/},$str);
216
+ for my $fltr (@filters)
217
+ {
218
+ if ( $fltr eq '+' ) { $set_defaults=1; last; }
219
+ }
220
+
221
+ my %mapping;
222
+ for my $flt (keys %filters)
223
+ {
224
+ if ( exists($mapping{$filters{$flt}{nick}}) ) { error("FIXME: the nick $filters{$flt}{nick} is not unique.\n"); }
225
+ $mapping{$filters{$flt}{nick}} = $flt;
226
+
227
+ if ( !defined($filters{$flt}{dflt}) ) { next; }
228
+ if ( $set_defaults )
229
+ {
230
+ $$opts{filters}{$flt} = $filters{$flt}{dflt};
231
+ }
232
+ }
233
+
234
+ for my $filter (@filters)
235
+ {
236
+ my ($key,$val) = split(/=/,$filter);
237
+ if ( $key eq '+' ) { next; }
238
+ my $to_be_deleted = 0;
239
+ if ( $key=~/^-(.+)$/ ) { $to_be_deleted=1; $key = $1; }
240
+ if ( !exists($filters{$key}) ) { $key = $mapping{$key}; }
241
+ if ( !exists($filters{$key}) && !exists($mapping{$key}) ) { error("The filter [$key] not recognised.\n"); }
242
+ if ( $to_be_deleted ) { delete($$opts{filters}{$key}); next; }
243
+
244
+ if ( $key eq 'c' || $key eq 'SnpCluster' )
245
+ {
246
+ ($$opts{SnpCluster_count},$$opts{SnpCluster_win}) = split(/,/,$val);
247
+
248
+ # Simple sanity check
249
+ if ( $$opts{SnpCluster_count}>$$opts{SnpCluster_win} )
250
+ {
251
+ error("Did you really mean snpCluster=$$opts{SnpCluster_count},$$opts{SnpCluster_win}? The win (INT2) must be bigger or equal to count (INT1).");
252
+ }
253
+ $$opts{SnpCluster_buffer} = [];
254
+ push @{$$opts{desc}}, "key=FILTER,ID=SnpCluster,Description='$filters{SnpCluster}{desc} [win=$$opts{SnpCluster_win},count=$$opts{SnpCluster_count}]'";
255
+ $has_filters = 1;
256
+ next;
257
+ }
258
+
259
+ $$opts{filters}{$key} = $val;
260
+ $has_filters = 1;
261
+ }
262
+ for my $key (keys %{$$opts{filters}})
263
+ {
264
+ push @{$$opts{desc}}, "key=FILTER,ID=$key,Description='$filters{$key}{desc}" . (defined $$opts{filters}{$key} ? " [$$opts{filters}{$key}]'" : "'");
265
+ }
266
+ if ( !$has_filters && !scalar keys %{$$opts{filters}} ) { delete($$opts{filters}); }
267
+ }
268
+
269
+
270
+ # Convert text descriptions given on command line to hashes which will be
271
+ # passed to Vcf::add_header_line
272
+ sub parse_descriptions
273
+ {
274
+ my ($descs) = @_;
275
+ my @out;
276
+ for my $str (@$descs)
277
+ {
278
+ my $desc = {};
279
+ my $tmp = $str;
280
+ while ($tmp)
281
+ {
282
+ my ($key,$value);
283
+ if ( $tmp=~/^([^=]+)=["']([^\"]+)["']/ ) { $key=$1; $value=$2; }
284
+ elsif ( $tmp=~/^([^=]+)=([^,"]+)/ && $1 eq 'Description' )
285
+ {
286
+ # The command line eats the quotes
287
+ $key=$1; $value=$2.$';
288
+ $$desc{$key} = $value;
289
+ last;
290
+ }
291
+ elsif ( $tmp=~/^([^=]+)=([^,"]+)/ )
292
+ {
293
+ $key=$1; $value=$2;
294
+ }
295
+ else { error(qq[Could not parse the description: [$str]\n]); }
296
+ $$desc{$key} = $value;
297
+
298
+ $tmp = $';
299
+ if ( $tmp=~/^,/ ) { $tmp = $'; }
300
+
301
+ }
302
+ if ( !exists($$desc{ID}) ) { error("No ID in description? [$str]\n"); }
303
+ push @out, $desc;
304
+ }
305
+ return \@out;
306
+ }
307
+
308
+ # Create mapping from the annotation IDs to column indexes. The mapping is used
309
+ # to determine which columns should be used from the annotation file. The
310
+ # following structure is returned:
311
+ # {
312
+ # CHROM => col_idx,
313
+ # FROM => col_idx,
314
+ # TO => col_idx,
315
+ # annots =>
316
+ # [
317
+ # { col=>col_idx, id=>annot_id, vcf_col=>vcf_column, is_flag=>0 },
318
+ # ]
319
+ # }
320
+ # If {annots}{is_flag} is nonzero, "annot_id" will be written to VCF instead of "annot_id=value".
321
+ # Currently only one VCF column (INFO) is supported.
322
+ #
323
+ sub parse_columns
324
+ {
325
+ my ($cols,$descriptions) = @_;
326
+
327
+ my %desc = ();
328
+ my %out = ( annots=>[] );
329
+
330
+ if ( !defined $cols ) { return \%out; }
331
+
332
+ for my $d (@$descriptions)
333
+ {
334
+ $desc{$$d{key}.'/'.$$d{ID}} = $d;
335
+ }
336
+
337
+ for (my $i=0; $i<@$cols; $i++)
338
+ {
339
+ my $col = $$cols[$i];
340
+
341
+ if ( $col eq '-' ) { next; }
342
+ elsif ( $col eq 'CHROM' ) { $out{$col}=$i; }
343
+ elsif ( $col eq 'FROM' ) { $out{$col}=$i; }
344
+ elsif ( $col eq 'POS' ) { $out{'FROM'}=$i; }
345
+ elsif ( $col eq 'TO' ) { $out{$col}=$i; }
346
+ elsif ( $col eq 'ID' ) { $out{$col}=$i; }
347
+ elsif ( $col eq 'REF' ) { $out{$col}=$i; }
348
+ elsif ( $col eq 'ALT' ) { $out{$col}=$i; }
349
+ else
350
+ {
351
+ if ( !exists($desc{$col}) && exists($desc{"INFO/$col"}) )
352
+ {
353
+ print STDERR qq[The description for "$col" does not exist, assuming "INFO/$col"\n];
354
+ $col = "INFO/$col";
355
+ }
356
+
357
+ if ( !exists($desc{$col}))
358
+ {
359
+ error("Missing the -d parameter for the column [$col]\n");
360
+ }
361
+ if ( !($col=~m{^(.+)/(.+)$}) ) { error("Could not parse the column [$col].\n"); }
362
+ my $key = $1;
363
+ my $id = $2;
364
+ my $rec = { col=>$i, id=>$id, vcf_col=>$key, is_flag=>($desc{$col}{Type} eq 'Flag' ? 1 : 0) };
365
+ push @{$out{annots}}, $rec;
366
+ if ( $key ne 'INFO' ) { error("TODO: other than INFO columns\n"); }
367
+ }
368
+ }
369
+ if ( !exists($out{CHROM}) ) { $out{CHROM}=0; }
370
+ if ( !exists($out{FROM}) ) { $out{FROM}=1; }
371
+ if ( !exists($out{TO}) ) { $out{TO}=$out{FROM}; }
372
+ if ( exists($out{REF}) && !exists($out{ALT}) or !exists($out{REF}) && exists($out{ALT}) ) { error("Expected both REF and ALT columns in the annotation file.\n"); }
373
+ return \%out;
374
+ }
375
+
376
+ sub annotate
377
+ {
378
+ my ($opts) = @_;
379
+
380
+ # Init the variables
381
+ my $descs = parse_descriptions($$opts{desc});
382
+ my $cols = parse_columns($$opts{cols},$descs);
383
+
384
+ # Open VCF file and add all required header lines
385
+ my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN);
386
+ $vcf->parse_header();
387
+ for my $desc (@$descs)
388
+ {
389
+ $vcf->add_header_line($desc,silent=>1);
390
+ }
391
+ $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp');
392
+ print $vcf->format_header();
393
+
394
+ my ($prev_chr,$prev_pos,$annot_from,$annot_to,$annot_line);
395
+ my @annots = @{$$cols{annots}};
396
+ my $id_col = exists($$cols{ID}) ? $$cols{ID} : undef;
397
+ my $from_col = $$cols{FROM};
398
+ my $to_col = $$cols{TO};
399
+ my $ref_col = exists($$cols{REF}) ? $$cols{REF} : undef;
400
+ my $alt_col = exists($$cols{ALT}) ? $$cols{ALT} : undef;
401
+
402
+ # Initialize the annotation reader
403
+ my $reader;
404
+ if ( exists($$opts{annotations}) )
405
+ {
406
+ $reader = Reader->new(file=>$$opts{annotations});
407
+ my $line = $vcf->next_line();
408
+ if ( !defined $line )
409
+ {
410
+ # VCF file is empty
411
+ undef $reader;
412
+ }
413
+ else
414
+ {
415
+ my @rec = split(/\t/,$line);
416
+ $prev_chr = $rec[0];
417
+ $prev_pos = $rec[1];
418
+ $vcf->_unread_line($line);
419
+ $reader->open(region=>$prev_chr);
420
+ }
421
+ }
422
+
423
+ while (defined $reader)
424
+ {
425
+ # Read next annotation group, i.e. all records with the same position (or overlapping in case of intervals)
426
+ my (@annot_lines,$annot_prev_from,$annot_prev_to);
427
+ while ($reader)
428
+ {
429
+ my $annot_line = $reader->next_line();
430
+ if ( !defined $annot_line ) { last; }
431
+ my $annot_from = $$annot_line[$from_col];
432
+ my $annot_to = $$annot_line[$to_col];
433
+ if ( !@annot_lines )
434
+ {
435
+ push @annot_lines, $annot_line;
436
+ $annot_prev_from = $annot_from;
437
+ $annot_prev_to = $annot_to;
438
+ next;
439
+ }
440
+ if ( $annot_from <= $annot_prev_to or $annot_to <= $annot_prev_to )
441
+ {
442
+ push @annot_lines, $annot_line;
443
+ if ( $annot_prev_to < $annot_to ) { $annot_prev_to = $annot_to; }
444
+ next;
445
+ }
446
+ $reader->unread_line($annot_line);
447
+ last;
448
+ }
449
+
450
+ # Now loop through the VCF records
451
+ my $line;
452
+ while ($line = $vcf->next_line())
453
+ {
454
+ my @rec = split(/\t/,$line);
455
+ my $chr = $rec[0];
456
+ my $pos = $rec[1];
457
+ chomp($rec[-1]);
458
+
459
+ if ( $chr ne $prev_chr )
460
+ {
461
+ $vcf->_unread_line($line);
462
+ $prev_chr = $chr;
463
+ $reader->open(region=>$prev_chr);
464
+ last;
465
+ }
466
+
467
+ # Quick position-based check: Is there an annotation for this record?
468
+ if ( !defined $annot_prev_from or $pos < $annot_prev_from )
469
+ {
470
+ output_line($opts,\@rec);
471
+ next;
472
+ }
473
+ if ( $pos > $annot_prev_to )
474
+ {
475
+ $vcf->_unread_line($line);
476
+ last;
477
+ }
478
+
479
+ # Initialize the REF,ALT-based check
480
+ my ($ref,%alt);
481
+ if ( defined $alt_col )
482
+ {
483
+ $ref = $rec[3];
484
+ %alt = map { $_=>1 } split(/,/,$rec[4]);
485
+ }
486
+
487
+ # Now fill the annotations; Existing annotations with the same tag will be overwritten
488
+ my %values = ();
489
+ my %ids = ();
490
+ for my $annot_line (@annot_lines)
491
+ {
492
+ # Skip annotation lines which are not relevant to this VCF record
493
+ if ( $$annot_line[$from_col] > $pos or $$annot_line[$to_col] < $pos ) { next; }
494
+ if ( defined $alt_col && $$annot_line[$ref_col] ne '.' )
495
+ {
496
+ if ( $$annot_line[$ref_col] ne $ref ) { next; }
497
+ if ( !exists($alt{$$annot_line[$alt_col]}) ) { next; }
498
+ }
499
+ for my $info (@annots)
500
+ {
501
+ my $val = $$annot_line[$$info{col}];
502
+ chomp($val);
503
+
504
+ if ( $val eq '' or $val eq '.' ) { $val=undef; } # Existing annotation should be removed
505
+ elsif ( $$info{is_flag} )
506
+ {
507
+ if ( $val ) { $val=''; } # Flag annotation should be added
508
+ else { $val=undef; } # Flag annotation should be removed
509
+ }
510
+
511
+ # Only single undef values can be present are overriden if defined in other records
512
+ if ( !defined $val && exists($values{$$info{id}}) ) { next; }
513
+ elsif ( exists($values{$$info{id}}) && !defined $values{$$info{id}}[0] )
514
+ {
515
+ $values{$$info{id}}[0] = $val;
516
+ next;
517
+ }
518
+ push @{$values{$$info{id}}}, $val;
519
+ }
520
+ if ( defined $id_col && $$annot_line[$id_col] ne '' ) { $ids{$$annot_line[$id_col]} = 1; }
521
+ }
522
+ if ( scalar keys %ids ) { $rec[2] = join(';', keys %ids); }
523
+ if ( scalar keys %values )
524
+ {
525
+ for my $key (keys %values)
526
+ {
527
+ # Cannot use join on undef values
528
+ $values{$key} = scalar @{$values{$key}} == 1 ? $values{$key}[0] : join(',', @{$values{$key}});
529
+ }
530
+ $rec[7] = $vcf->add_info_field($rec[7],%values);
531
+ }
532
+ output_line($opts,\@rec);
533
+ }
534
+ if ( !defined $line ) { last; }
535
+ }
536
+
537
+ # Finish the VCF, no annotations for this part
538
+ while (my $line=$vcf->next_line)
539
+ {
540
+ my @rec = split(/\t/,$line);
541
+ chomp($rec[-1]);
542
+ output_line($opts,\@rec);
543
+ }
544
+
545
+ # Output any lines left in the buffer
546
+ output_line($opts);
547
+ }
548
+
549
+
550
+ # Stage the lines and then apply filtering if requested, otherwise just print the line
551
+ sub output_line
552
+ {
553
+ my ($opts,$line) = @_;
554
+
555
+ if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) )
556
+ {
557
+ # No filters requested, print the line
558
+ print_line($line);
559
+ return;
560
+ }
561
+
562
+ if ( defined $line )
563
+ {
564
+ # Local filters return the line back immediately
565
+ if ( scalar keys %{$$opts{filters}} )
566
+ {
567
+ $line = apply_local_filters($opts,$line);
568
+ }
569
+ if ( exists($$opts{udef_filters}) )
570
+ {
571
+ $line = apply_user_defined_filters($opts,$line);
572
+ }
573
+ }
574
+
575
+ # Staging filters may return nothing or multiple lines. If $line is not defined, they will
576
+ # empty the buffers
577
+ my @lines;
578
+ if ( exists($$opts{filters}{SnpGap}) )
579
+ {
580
+ @lines = apply_snpgap_filter($opts,$line);
581
+ if ( defined $line && !scalar @lines ) { return; }
582
+ }
583
+ elsif ( defined $line ) { @lines=($line); }
584
+
585
+ if ( exists($$opts{filters}{GapWin}) )
586
+ {
587
+ my @tmp;
588
+ if ( !defined $line ) { push @lines,undef; }
589
+ for my $line (@lines)
590
+ {
591
+ push @tmp, apply_gapwin_filter($opts,$line);
592
+ }
593
+ @lines = @tmp;
594
+ }
595
+
596
+ if ( exists($$opts{SnpCluster_count}) )
597
+ {
598
+ my @tmp;
599
+ if ( !defined $line ) { push @lines,undef; }
600
+ for my $line (@lines)
601
+ {
602
+ push @tmp, apply_snpcluster_filter($opts,$line);
603
+ }
604
+ @lines = @tmp;
605
+ }
606
+
607
+ for my $line (@lines)
608
+ {
609
+ print_line($line);
610
+ }
611
+ }
612
+
613
+ sub apply_user_defined_filters
614
+ {
615
+ my ($opts,$line) = @_;
616
+
617
+ our($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF);
618
+ $CHROM = $$line[0];
619
+ $POS = $$line[1];
620
+ $FAIL = 1;
621
+ $PASS = 0;
622
+ $RECORD = $line;
623
+ $VCF = $$opts{vcf};
624
+
625
+ my %filters = ();
626
+ if ( $$opts{udef_filters_typecheck_needed} )
627
+ {
628
+ # Check if the line has an indel, SNP or both
629
+ for my $alt (split(/,/,$$line[4]))
630
+ {
631
+ my ($type,$len,$ht) = $$opts{vcf}->event_type($$line[3],$alt);
632
+ if ( exists($$opts{udef_filters}{$type}) )
633
+ {
634
+ %filters = ( %filters, %{$$opts{udef_filters}{$type}} );
635
+ }
636
+ }
637
+ # Return if the line does not have the wanted variant type
638
+ if ( !scalar %filters ) { return $line; }
639
+ }
640
+ else
641
+ {
642
+ %filters = %{$$opts{udef_filters}{all}};
643
+ }
644
+
645
+ my %apply;
646
+ for my $filter (values %filters)
647
+ {
648
+ if ( !($$line[7]=~$$filter{tag_re}) ) { next; }
649
+ $MATCH = $1;
650
+ $apply{ $$filter{name} } = &{$$filter{test}} == $PASS ? 0 : 1;
651
+ }
652
+ if ( scalar keys %apply )
653
+ {
654
+ $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply);
655
+ }
656
+
657
+ return $line;
658
+ }
659
+
660
+ sub apply_local_filters
661
+ {
662
+ my ($opts,$line) = @_;
663
+
664
+ if ( !defined $line ) { return; }
665
+
666
+ my $filters = $$opts{filters};
667
+ my %apply;
668
+
669
+ my ($dp,$dp_alt,$mq,$vdb);
670
+ if ( $$line[7]=~/DP4=(\d+),(\d+),(\d+),(\d+)/i )
671
+ {
672
+ $dp = $1 + $2 + $3 + $4;
673
+ $dp_alt = $3 + $4;
674
+ }
675
+ if ( $$line[7]=~/DP=(\d+)/i ) { $dp = $1; }
676
+ if ( $$line[7]=~/MQ=(\d+)/i ) { $mq = $1; }
677
+ if ( $$line[7]=~/VDB=([^;,\t]+)/i ) { $vdb = $1; }
678
+
679
+ if ( exists($$filters{RefN}) )
680
+ {
681
+ $apply{RefN} = ($$line[3]=~/N/) ? 1 : 0;
682
+ }
683
+ if ( exists($$filters{Qual}) && $$line[5] ne '.' )
684
+ {
685
+ $apply{Qual} = $$line[5] < $$filters{Qual} ? 1 : 0;
686
+ }
687
+ if ( exists($$filters{HWE}) && $$line[7]=~/G3=([^,]+),([^,]+),/ )
688
+ {
689
+ my ($rr,$ra);
690
+ $rr = $1;
691
+ $ra = $2;
692
+ $apply{HWE} = 0;
693
+ if ( $$line[7]=~/HWE=([^;\t]+)/ && $1<$$filters{HWE} )
694
+ {
695
+ my $p = 2*$rr + $ra;
696
+ if ( $p>0 && $p<1 && $ra/($p*(1-$p))>1 )
697
+ {
698
+ $apply{HWE} = 1;
699
+ }
700
+ }
701
+ }
702
+ if ( defined $dp_alt )
703
+ {
704
+ if ( exists($$filters{MinAB}) )
705
+ {
706
+ $apply{MinAB} = $dp_alt < $$filters{MinAB} ? 1 : 0;
707
+ }
708
+ }
709
+ if ( defined $vdb )
710
+ {
711
+ $apply{VDB} = $vdb < $$filters{VDB} ? 1 : 0;
712
+ }
713
+ if ( defined $dp )
714
+ {
715
+ if ( exists($$filters{MinDP}) )
716
+ {
717
+ $apply{MinDP} = $dp < $$filters{MinDP} ? 1 : 0;
718
+ }
719
+ if ( exists($$filters{MaxDP}) )
720
+ {
721
+ $apply{MaxDP} = $dp > $$filters{MaxDP} ? 1 : 0;
722
+ }
723
+ }
724
+ if ( defined $mq )
725
+ {
726
+ if ( exists($$filters{MinMQ}) )
727
+ {
728
+ $apply{MinMQ} = $mq < $$filters{MinMQ} ? 1 : 0;
729
+ }
730
+ }
731
+ if ( $$line[7]=~/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ )
732
+ {
733
+ if ( exists($$filters{StrandBias}) )
734
+ {
735
+ $apply{StrandBias} = $1 < $$filters{StrandBias} ? 1 : 0;
736
+ }
737
+ if ( exists($$filters{BaseQualBias}) )
738
+ {
739
+ $apply{BaseQualBias} = $2 < $$filters{BaseQualBias} ? 1 : 0;
740
+ }
741
+ if ( exists($$filters{MapQualBias}) )
742
+ {
743
+ $apply{MapQualBias} = $3 < $$filters{MapQualBias} ? 1 : 0;
744
+ }
745
+ if ( exists($$filters{EndDistBias}) )
746
+ {
747
+ $apply{EndDistBias} = $4 < $$filters{EndDistBias} ? 1 : 0;
748
+ }
749
+ }
750
+ if ( scalar keys %apply )
751
+ {
752
+ $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply);
753
+ }
754
+ return $line;
755
+ }
756
+
757
+ sub apply_snpgap_filter
758
+ {
759
+ my ($opts,$line) = @_;
760
+ if ( !exists($$opts{SnpGap_buffer}) ) { $$opts{SnpGap_buffer}=[]; }
761
+
762
+ my $vcf = $$opts{vcf};
763
+ my $win = $$opts{filters}{SnpGap};
764
+ my $buffer = $$opts{SnpGap_buffer};
765
+ my ($indel_chr,$indel_pos,$to);
766
+
767
+ if ( defined $line )
768
+ {
769
+ # There may be multiple variants, look for an indel. Anything what is not ref can be filtered.
770
+ my $is_indel = 0;
771
+ my $can_be_filtered = 0;
772
+ for my $alt (split(/,/,$$line[4]))
773
+ {
774
+ my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
775
+ if ( $type eq 'i' )
776
+ {
777
+ $is_indel = 1;
778
+ $indel_chr = $$line[0];
779
+ $indel_pos = $$line[1]+1;
780
+ }
781
+ elsif ( $type ne 'r' ) { $can_be_filtered = 1; }
782
+ }
783
+ # The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not
784
+ # correct as the indel can begin anywhere in the VCF4.x record with
785
+ # respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA
786
+ # ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove
787
+ # some valid SNPs.
788
+ $to = $is_indel ? $$line[1]+length($$line[3])-1 : $$line[1];
789
+ push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, can_be_filtered=>$can_be_filtered, is_indel=>$is_indel };
790
+ }
791
+
792
+ my $n = @$buffer;
793
+
794
+ # Is the new line an indel? If yes, check the distance to all previous lines
795
+ if ( defined $indel_chr )
796
+ {
797
+ for (my $i=0; $i<$n-1; $i++)
798
+ {
799
+ my $buf = $$buffer[$i];
800
+ if ( $$buf{chr} ne $indel_chr ) { next; }
801
+ if ( !$$buf{can_be_filtered} ) { next; }
802
+ if ( $$buf{is_indel} ) { next; }
803
+ if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; }
804
+ }
805
+ }
806
+
807
+ if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} )
808
+ {
809
+ # There are not enough rows in the buffer: the SnpGap window spans them all. Wait until there is more rows
810
+ # or a new chromosome
811
+ return ();
812
+ }
813
+
814
+ # 'Look-behind' filtering was done above, now comes 'look-ahead' filtering
815
+ my $indel_to;
816
+ for (my $i=0; $i<$n; $i++)
817
+ {
818
+ my $buf = $$buffer[$i];
819
+ if ( $$buf{is_indel} )
820
+ {
821
+ $indel_to = $$buf{to};
822
+ $indel_chr = $$buf{chr};
823
+ next;
824
+ }
825
+ if ( !defined $indel_to ) { next; }
826
+ if ( !$$buf{can_be_filtered} ) { next; }
827
+ if ( $$buf{chr} ne $indel_chr )
828
+ {
829
+ undef $indel_to;
830
+ next;
831
+ }
832
+ if ( $$buf{from}<=$indel_to+$win ) { $$buf{exclude}=1; }
833
+ }
834
+
835
+ # Output. If no $line was given, output everything
836
+ $to = $$buffer[-1]{from}-$win;
837
+ my $chr = $$buffer[-1]{chr};
838
+ my @out;
839
+ while (@$buffer)
840
+ {
841
+ if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; }
842
+
843
+ my $buf = shift(@$buffer);
844
+ if ( $$buf{exclude} )
845
+ {
846
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>1);
847
+ }
848
+ else
849
+ {
850
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>0);
851
+ }
852
+ push @out,$$buf{line};
853
+ }
854
+ return @out;
855
+ }
856
+
857
+
858
+ sub apply_gapwin_filter
859
+ {
860
+ my ($opts,$line) = @_;
861
+ if ( !exists($$opts{GapWin_buffer}) ) { $$opts{GapWin_buffer}=[]; }
862
+
863
+ my $vcf = $$opts{vcf};
864
+ my $win = $$opts{filters}{GapWin};
865
+ my $buffer = $$opts{GapWin_buffer};
866
+ my ($indel_chr,$indel_pos,$to);
867
+
868
+ if ( defined $line )
869
+ {
870
+ # There may be multiple variants, only indels can be filtered
871
+ my $is_indel = 0;
872
+ for my $alt (split(/,/,$$line[4]))
873
+ {
874
+ my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
875
+ if ( $type eq 'i' )
876
+ {
877
+ $is_indel = 1;
878
+ $indel_chr = $$line[0];
879
+ $indel_pos = $$line[1] + 1;
880
+ }
881
+ }
882
+ # The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not
883
+ # correct as the indel can begin anywhere in the VCF4.x record with
884
+ # respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA
885
+ # ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove
886
+ # some valid SNPs.
887
+ $to = $is_indel ? $$line[1]+length($$line[3])-1 : $$line[1];
888
+ push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, is_indel=>$is_indel };
889
+ }
890
+
891
+ my $n = @$buffer;
892
+
893
+ # Is the new line an indel? If yes, check the distance to all previous lines
894
+ if ( defined $indel_chr )
895
+ {
896
+ for (my $i=0; $i<$n-1; $i++)
897
+ {
898
+ my $buf = $$buffer[$i];
899
+ if ( $$buf{chr} ne $indel_chr ) { next; }
900
+ if ( !$$buf{is_indel} ) { next; }
901
+ if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; $$buffer[-1]{exclude}=1; }
902
+ }
903
+ }
904
+
905
+ if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} )
906
+ {
907
+ # There are not enough rows in the buffer: the GapWin window spans them all. Wait until there is more rows
908
+ # or a new chromosome
909
+ return ();
910
+ }
911
+
912
+ # Output. If no $line was given, output everything
913
+ $to = $$buffer[-1]{from}-$win;
914
+ my $chr = $$buffer[-1]{chr};
915
+ my @out;
916
+ while (@$buffer)
917
+ {
918
+ if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; }
919
+
920
+ my $buf = shift(@$buffer);
921
+ if ( $$buf{exclude} )
922
+ {
923
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>1);
924
+ }
925
+ else
926
+ {
927
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>0);
928
+ }
929
+ push @out,$$buf{line};
930
+ }
931
+ return @out;
932
+ }
933
+
934
+
935
+ sub apply_snpcluster_filter
936
+ {
937
+ my ($opts,$line) = @_;
938
+
939
+ my $buffer = $$opts{SnpCluster_buffer};
940
+ my $n = @$buffer;
941
+
942
+ # The buffer is empty and the line contains only reference alleles, print directly
943
+ if ( $n==0 && defined $line && $$line[4] eq '.' )
944
+ {
945
+ $$line[6] = $$opts{vcf}->add_filter($$line[6],'SnpCluster'=>0);
946
+ return $line;
947
+ }
948
+
949
+ # Store the line in buffer and check how many lines can be printed
950
+ my $to; # All lines up to and including this index will be printed
951
+ my $win = $$opts{SnpCluster_win};
952
+ if ( defined $line )
953
+ {
954
+ # Exclude REF (and maybe also other filters?) form SnpCluster
955
+ my $can_be_filtered = $$line[4] eq '.' ? 0 : 1;
956
+ push @$buffer, { line=>$line, chr=>$$line[0], pos=>$$line[1], can_be_filtered=>$can_be_filtered, in_cluster=>0 };
957
+ $n++;
958
+
959
+ # Does the buffer hold enough lines now?
960
+ my $last_chr = $$buffer[-1]{chr};
961
+ my $last_pos = $$buffer[-1]{pos};
962
+ for (my $i=$n-1; $i>=0; $i--)
963
+ {
964
+ my $buf = $$buffer[$i];
965
+ if ( $$buf{chr} ne $last_chr ) { $to=$i; last; }
966
+ if ( $last_pos - $$buf{pos} >= $win ) { $to=$i; last; }
967
+ }
968
+
969
+ if ( !defined $to ) { return; }
970
+ }
971
+ if ( !defined $to ) { $to=$n-1; }
972
+
973
+ # Calculate the number of variants within the window
974
+ my $count = 0;
975
+ my $max_count = $$opts{SnpCluster_count};
976
+ my $start_chr = $$buffer[0]{chr};
977
+ my $start_pos = $$buffer[0]{pos};
978
+ my $idx;
979
+ for ($idx=0; $idx<$n; $idx++)
980
+ {
981
+ my $buf = $$buffer[$idx];
982
+ if ( $$buf{chr} ne $start_chr ) { last; }
983
+ if ( $$buf{pos} - $win >= $start_pos ) { last; }
984
+ if ( $$buf{can_be_filtered} ) { $count++; }
985
+ }
986
+
987
+ # If a SNP cluster was found, set the in_cluster flag for all relevant sites.
988
+ # The buffer will be flushed and the orphans would pass unnoticed.
989
+ if ( $count>=$max_count )
990
+ {
991
+ for (my $i=0; $i<$idx; $i++)
992
+ {
993
+ if ( $$buffer[$i]{can_be_filtered} ) { $$buffer[$i]{in_cluster}=1; }
994
+ }
995
+ }
996
+
997
+ # Now output the lines, adding or removing the filter
998
+ my @out = ();
999
+ for (my $i=0; $i<=$to; $i++)
1000
+ {
1001
+ my $buf = shift(@$buffer);
1002
+ if ( $$buf{in_cluster} )
1003
+ {
1004
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>1);
1005
+ }
1006
+ else
1007
+ {
1008
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0);
1009
+ }
1010
+ push @out,$$buf{line};
1011
+ }
1012
+
1013
+ # Output all non-variant lines at the beggining of the buffer
1014
+ while (@$buffer)
1015
+ {
1016
+ if ( $$buffer[0]{can_be_filtered} ) { last; }
1017
+ my $buf = shift(@$buffer);
1018
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0);
1019
+ push @out,$$buf{line};
1020
+ }
1021
+ return @out;
1022
+ }
1023
+
1024
+ sub print_line
1025
+ {
1026
+ my ($line) = @_;
1027
+ if ( defined $line ) { print join("\t",@$line) . "\n"; }
1028
+ }
1029
+
1030
+
1031
+
1032
+ #---------------------------------
1033
+
1034
+ package Reader;
1035
+
1036
+ use strict;
1037
+ use warnings;
1038
+ use Carp;
1039
+
1040
+ sub new
1041
+ {
1042
+ my ($class,@args) = @_;
1043
+ my $self = @args ? {@args} : {};
1044
+ bless $self, ref($class) || $class;
1045
+ if ( !$$self{delim} ) { $$self{delim} = qr/\t/; }
1046
+ if ( !$$self{chr} ) { $$self{chr} = 0; } # the index of the chromosome column (indexed from 0)
1047
+ if ( !$$self{from} ) { $$self{from} = 1; } # the index of the from column
1048
+ if ( !$$self{to} ) { $$self{to} = 2; } # the index of the to column
1049
+ return $self;
1050
+ }
1051
+
1052
+ sub throw
1053
+ {
1054
+ my ($self,@msg) = @_;
1055
+ confess @msg;
1056
+ }
1057
+
1058
+ sub open
1059
+ {
1060
+ my ($self,%args) = @_;
1061
+ if ( !$$self{file} ) { return; }
1062
+ $self->close();
1063
+ open($$self{fh},"tabix $$self{file} $args{region} |") or $self->throw("tabix $$self{file}: $!");
1064
+ }
1065
+
1066
+ sub close
1067
+ {
1068
+ my ($self) = @_;
1069
+ if ( !$$self{fh} ) { return; }
1070
+ close($$self{fh});
1071
+ delete($$self{fh});
1072
+ delete($$self{buffer});
1073
+ }
1074
+
1075
+ sub unread_line
1076
+ {
1077
+ my ($self,$line) = @_;
1078
+ unshift @{$$self{buffer}}, $line;
1079
+ return;
1080
+ }
1081
+
1082
+ sub next_line
1083
+ {
1084
+ my ($self) = @_;
1085
+ if ( !$$self{fh} ) { return undef; } # Run in dummy mode
1086
+ if ( $$self{buffer} && @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
1087
+ my $line;
1088
+ # Skip comments
1089
+ while (1)
1090
+ {
1091
+ $line = readline($$self{fh});
1092
+ if ( !defined $line ) { return undef; }
1093
+ if ( $line=~/^#/ ) { next; }
1094
+ last;
1095
+ }
1096
+ my @items = split($$self{delim},$line);
1097
+ return \@items;
1098
+ }
1099
+