ngs_server 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1099 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # Author: petr.danecek@sanger
4
+ #
5
+
6
+ use strict;
7
+ use warnings;
8
+ use Carp;
9
+ use Vcf;
10
+
11
+ my %filters =
12
+ (
13
+ MinAB => { dflt=>2, usage=>'INT', desc=>'Minimum number of alternate bases (INFO/DP4)', nick=>'a' },
14
+ SnpCluster => { dflt=>undef, usage=>'INT1,INT2', desc=>"Filters clusters of 'INT1' or more SNPs within a run of 'INT2' bases", nick=>'c' },
15
+ MinDP => { dflt=>2, usage=>'INT', desc=>"Minimum read depth (INFO/DP or INFO/DP4)", nick=>'d' },
16
+ MaxDP => { dflt=>10_000_000, usage=>'INT', desc=>"Maximum read depth (INFO/DP or INFO/DP4)", nick=>'D' },
17
+ MinMQ => { dflt=>10, usage=>'INT', desc=>"Minimum RMS mapping quality for SNPs (INFO/MQ)", nick=>'q' },
18
+ SnpGap => { dflt=>10, usage=>'INT', desc=>"SNP within INT bp around a gap to be filtered", nick=>'w' },
19
+ GapWin => { dflt=>3, usage=>'INT', desc=>"Window size for filtering adjacent gaps", nick=>'W' },
20
+ StrandBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for strand bias (INFO/PV4)", nick=>'1' },
21
+ BaseQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for baseQ bias (INFO/PV4)", nick=>'2' },
22
+ MapQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for mapQ bias (INFO/PV4)", nick=>'3' },
23
+ EndDistBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for end distance bias (INFO/PV4)", nick=>'4' },
24
+ RefN => { dflt=>'', usage=>'', desc=>"Reference base is N", nick=>'r' },
25
+ Qual => { dflt=>'10', usage=>'INT', desc=>"Minimum value of the QUAL field", nick=>'Q' },
26
+ VDB => { dflt=>'0.015', usage=>'FLOAT', desc=>"Minimum Variant Distance Bias (INFO/VDB)", nick=>'v' },
27
+ HWE => { dflt=>1e-4, usage=>'FLOAT', desc=>"Minimum P-value for HWE (plus F<0) (INFO/HWE and INFO/G3)", nick=>'H' },
28
+ );
29
+
30
+ my $opts = parse_params();
31
+ annotate($opts);
32
+
33
+ exit;
34
+
35
+ #--------------------------------
36
+
37
+ sub error
38
+ {
39
+ my (@msg) = @_;
40
+ if ( scalar @msg ) { confess @msg; }
41
+
42
+ my @filters;
43
+ for my $key (sort {lc($filters{$a}{nick}) cmp lc($filters{$b}{nick})} keys %filters)
44
+ {
45
+ push @filters, sprintf("\t%s, %-25s\t\t%s [%s]\n", $filters{$key}{nick},$key.' '.$filters{$key}{usage},$filters{$key}{desc},defined($filters{$key}{dflt})? $filters{$key}{dflt} : '');
46
+ }
47
+
48
+ print
49
+ "About: Annotates VCF file, adding filters or custom annotations. Requires tabix indexed file with annotations.\n",
50
+ " Currently annotates only the ID and INFO column, but it will be extended on popular demand.\n",
51
+ "Usage: cat in.vcf | vcf-annotate [OPTIONS] > out.vcf\n",
52
+ "Options:\n",
53
+ " -a, --annotations <file.gz> The tabix indexed file with the annotations: CHR\\tFROM[\\tTO][\\tVALUE]+.\n",
54
+ " -c, --columns <list> The list of columns in the annotation file, e.g. CHROM,FROM,TO,-,INFO/STR,INFO/GN. The dash\n",
55
+ " in this example indicates that the third column should be ignored. If TO is not\n",
56
+ " present, it is assumed that TO equals to FROM. When REF and ALT columns are present, only\n",
57
+ " matching lines are annotated.\n",
58
+ " -d, --description <file|string> Header annotation, e.g. key=INFO,ID=HM2,Number=0,Type=Flag,Description='HapMap2 membership'.\n",
59
+ " The descriptions can be read from a file, one annotation per line.\n",
60
+ " -f, --filter <file|list> Apply filters, list is in the format flt1=value/flt2/flt3=value/etc. If argument to -f is a file,\n",
61
+ " user-defined filters be applied. See User Defined Filters below.\n",
62
+ " -h, -?, --help This help message.\n",
63
+ "Filters:\n",
64
+ sprintf("\t+ %-25s\t\tApply all filters with default values (can be overriden, see the example below).\n",''),
65
+ sprintf("\t-X %-25s\t\tExclude the filter X\n",''),
66
+ join('',@filters),
67
+ "Examples:\n",
68
+ " zcat in.vcf.gz | vcf-annotate -a annotations.gz -d descriptions.txt -c FROM,TO,CHROM,ID,INFO/DP | bgzip -c >out.vcf.gz \n",
69
+ " zcat in.vcf.gz | vcf-annotate -f +/-a/c=3,10/q=3/d=5/-D -a annotations.gz -d descriptions.txt | bgzip -c >out.vcf.gz \n",
70
+ " zcat in.vcf.gz | vcf-annotate -a dbSNPv132.tab.gz -c CHROM,FROM,REF,ALT,ID,-,-,- | bgzip -c >out.vcf.gz \n",
71
+ "Where descriptions.txt contains:\n",
72
+ " key=INFO,ID=GN,Number=1,Type=String,Description='Gene Name'\n",
73
+ " key=INFO,ID=STR,Number=1,Type=Integer,Description='Strand'\n",
74
+ "The file dbSNPv132.tab.gz with dbSNP IDs can be downloaded from\n",
75
+ " ftp://ftp.sanger.ac.uk/pub/1000genomes/pd3/dbSNP/\n",
76
+ "\n",
77
+ "User-defined filters:\n",
78
+ " # The examples below are self-explanatory. Notice the use of the predefined\n",
79
+ " # variables (\$MATCH, \$PASS, \$FAIL) and methods (error).\n",
80
+ " #\n",
81
+ " # In this example, a minimum value of AF1=0.1 is required\n",
82
+ " {\n",
83
+ " tag => 'INFO/AF1', # The VCF tag to apply this filter on\n",
84
+ " name => 'MinAF', # The filter ID\n",
85
+ " desc => 'Minimum AF1 [0.01]', # Description for the VCF header\n",
86
+ " test => sub { return \$MATCH < 0.01 ? \$FAIL : \$PASS },\n",
87
+ " },\n",
88
+ " # Filter all indels (presence of INDEL tag is tested)\n",
89
+ " {\n",
90
+ " tag => 'INFO/INDEL',\n",
91
+ " apply_to => 'indels', # Can be one of SNPs, indels, all. Default: [All]\n",
92
+ " name => 'Indel',\n",
93
+ " desc => 'INDEL tag present',\n",
94
+ " test => sub { return \$FAIL },\n",
95
+ " },\n",
96
+ " # Only loci with enough reads supporting the variant will pass the filter\n",
97
+ " {\n",
98
+ " tag => 'INFO/DP4',\n",
99
+ " name => 'FewAlts',\n",
100
+ " desc => 'Too few reads supporting the variant',\n",
101
+ " apply_to => 'SNPs',\n",
102
+ " test => sub {\n",
103
+ " if ( !(\$MATCH =~ /^([^,]+),([^,]+),([^,]+),(.+)\$/) )\n",
104
+ " {\n",
105
+ " error(\"Could not parse INFO/DP4: \$CHROM:\$POS [\$MATCH]\");\n",
106
+ " }\n",
107
+ " if ( 0.1*(\$1+\$2) > \$3+\$4 ) { return \$PASS; }\n",
108
+ " return \$FAIL;\n",
109
+ " },\n",
110
+ " },\n",
111
+ "\n";
112
+ exit -1;
113
+ }
114
+
115
+ sub parse_params
116
+ {
117
+ my $opts = { args=>[$0, @ARGV], };
118
+ while (defined(my $arg=shift(@ARGV)))
119
+ {
120
+ if ( $arg eq '-d' || $arg eq '--description' )
121
+ {
122
+ my $desc = shift(@ARGV);
123
+ if ( -e $desc )
124
+ {
125
+ open(my $fh,'<',$desc) or error("$desc: $!");
126
+ while (my $line=<$fh>)
127
+ {
128
+ if ( $line=~/^\s*$/ or $line=~/^#/ ) { next; }
129
+ chomp($line);
130
+ push @{$$opts{desc}}, $line;
131
+ }
132
+ close($fh);
133
+ }
134
+ else
135
+ {
136
+ push @{$$opts{desc}}, $desc;
137
+ }
138
+ next;
139
+ }
140
+ if ( $arg eq '-f' || $arg eq '--filter' )
141
+ {
142
+ my $filter = shift(@ARGV);
143
+ parse_filters($opts,$filter);
144
+ next;
145
+ }
146
+ if ( $arg eq '-c' || $arg eq '--columns' )
147
+ {
148
+ my $cols = shift(@ARGV);
149
+ $$opts{cols} = [ split(/,/,$cols) ];
150
+ next;
151
+ }
152
+ if ( $arg eq '-a' || $arg eq '--annotations' ) { $$opts{annotations} = shift(@ARGV); next }
153
+ if ( $arg eq '-t' || $arg eq '--tag' ) { $$opts{tag} = shift(@ARGV); next }
154
+ if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
155
+ if ( -e $arg ) { $$opts{file}=$arg; next; }
156
+ error("Unknown parameter \"$arg\". Run -h for help.\n");
157
+ }
158
+ if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) )
159
+ {
160
+ if ( !exists($$opts{annotations}) ) { error("Missing the -a or -f option.\n") }
161
+ }
162
+ if ( exists($$opts{annotations}) && !exists($$opts{cols}) ) { error("Missing the -c option.\n"); }
163
+ return $opts;
164
+ }
165
+
166
+ sub parse_user_defined_filters
167
+ {
168
+ my ($opts,$str) = @_;
169
+ my $filters = [ do $str ];
170
+ for my $filter (@$filters)
171
+ {
172
+ if ( !exists($$filter{tag}) ) { error("Missing 'tag' key for one of the filters in $str\n"); }
173
+ if ( !($$filter{tag}=~m{^INFO/(.+)$}) ) { error("Currently only INFO tags supported. Could not parse the tag [$$filter{tag}]\n"); }
174
+ my $ID = $1;
175
+
176
+ $$filter{ID} = $ID;
177
+ $$filter{tag_re} = qr{$ID=?([^\t;]*)};
178
+
179
+ if ( !exists($$filter{name}) ) { error("Missing 'name' key for the filter [$$filter{tag}]\n"); }
180
+ if ( !exists($$filter{desc}) ) { error("Missing 'desc' key for the filter [$$filter{tag}]\n"); }
181
+
182
+ my $name = $$filter{name};
183
+ push @{$$opts{desc}}, "key=FILTER,ID=$name,Description='$$filter{desc}";
184
+ if ( !exists($$filter{apply_to}) or lc($$filter{apply_to}) eq 'all' )
185
+ {
186
+ $$opts{udef_filters}{'all'}{$name} = $filter;
187
+ $$opts{udef_filters}{'s'}{$name} = $filter;
188
+ $$opts{udef_filters}{'i'}{$name} = $filter;
189
+ }
190
+ elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'snps' )
191
+ {
192
+ $$opts{udef_filters}{'s'}{$name} = $filter;
193
+ $$opts{udef_filters_typecheck_needed} = 1;
194
+ }
195
+ elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'indels' )
196
+ {
197
+ $$opts{udef_filters}{'i'}{$name} = $filter;
198
+ $$opts{udef_filters_typecheck_needed} = 1;
199
+ }
200
+ }
201
+ }
202
+
203
+ sub parse_filters
204
+ {
205
+ my ($opts,$str) = @_;
206
+
207
+ if ( -e $str )
208
+ {
209
+ parse_user_defined_filters($opts,$str);
210
+ return;
211
+ }
212
+
213
+ my $has_filters = 0;
214
+ my $set_defaults = 0;
215
+ my @filters = split(m{/},$str);
216
+ for my $fltr (@filters)
217
+ {
218
+ if ( $fltr eq '+' ) { $set_defaults=1; last; }
219
+ }
220
+
221
+ my %mapping;
222
+ for my $flt (keys %filters)
223
+ {
224
+ if ( exists($mapping{$filters{$flt}{nick}}) ) { error("FIXME: the nick $filters{$flt}{nick} is not unique.\n"); }
225
+ $mapping{$filters{$flt}{nick}} = $flt;
226
+
227
+ if ( !defined($filters{$flt}{dflt}) ) { next; }
228
+ if ( $set_defaults )
229
+ {
230
+ $$opts{filters}{$flt} = $filters{$flt}{dflt};
231
+ }
232
+ }
233
+
234
+ for my $filter (@filters)
235
+ {
236
+ my ($key,$val) = split(/=/,$filter);
237
+ if ( $key eq '+' ) { next; }
238
+ my $to_be_deleted = 0;
239
+ if ( $key=~/^-(.+)$/ ) { $to_be_deleted=1; $key = $1; }
240
+ if ( !exists($filters{$key}) ) { $key = $mapping{$key}; }
241
+ if ( !exists($filters{$key}) && !exists($mapping{$key}) ) { error("The filter [$key] not recognised.\n"); }
242
+ if ( $to_be_deleted ) { delete($$opts{filters}{$key}); next; }
243
+
244
+ if ( $key eq 'c' || $key eq 'SnpCluster' )
245
+ {
246
+ ($$opts{SnpCluster_count},$$opts{SnpCluster_win}) = split(/,/,$val);
247
+
248
+ # Simple sanity check
249
+ if ( $$opts{SnpCluster_count}>$$opts{SnpCluster_win} )
250
+ {
251
+ error("Did you really mean snpCluster=$$opts{SnpCluster_count},$$opts{SnpCluster_win}? The win (INT2) must be bigger or equal to count (INT1).");
252
+ }
253
+ $$opts{SnpCluster_buffer} = [];
254
+ push @{$$opts{desc}}, "key=FILTER,ID=SnpCluster,Description='$filters{SnpCluster}{desc} [win=$$opts{SnpCluster_win},count=$$opts{SnpCluster_count}]'";
255
+ $has_filters = 1;
256
+ next;
257
+ }
258
+
259
+ $$opts{filters}{$key} = $val;
260
+ $has_filters = 1;
261
+ }
262
+ for my $key (keys %{$$opts{filters}})
263
+ {
264
+ push @{$$opts{desc}}, "key=FILTER,ID=$key,Description='$filters{$key}{desc}" . (defined $$opts{filters}{$key} ? " [$$opts{filters}{$key}]'" : "'");
265
+ }
266
+ if ( !$has_filters && !scalar keys %{$$opts{filters}} ) { delete($$opts{filters}); }
267
+ }
268
+
269
+
270
+ # Convert text descriptions given on command line to hashes which will be
271
+ # passed to Vcf::add_header_line
272
+ sub parse_descriptions
273
+ {
274
+ my ($descs) = @_;
275
+ my @out;
276
+ for my $str (@$descs)
277
+ {
278
+ my $desc = {};
279
+ my $tmp = $str;
280
+ while ($tmp)
281
+ {
282
+ my ($key,$value);
283
+ if ( $tmp=~/^([^=]+)=["']([^\"]+)["']/ ) { $key=$1; $value=$2; }
284
+ elsif ( $tmp=~/^([^=]+)=([^,"]+)/ && $1 eq 'Description' )
285
+ {
286
+ # The command line eats the quotes
287
+ $key=$1; $value=$2.$';
288
+ $$desc{$key} = $value;
289
+ last;
290
+ }
291
+ elsif ( $tmp=~/^([^=]+)=([^,"]+)/ )
292
+ {
293
+ $key=$1; $value=$2;
294
+ }
295
+ else { error(qq[Could not parse the description: [$str]\n]); }
296
+ $$desc{$key} = $value;
297
+
298
+ $tmp = $';
299
+ if ( $tmp=~/^,/ ) { $tmp = $'; }
300
+
301
+ }
302
+ if ( !exists($$desc{ID}) ) { error("No ID in description? [$str]\n"); }
303
+ push @out, $desc;
304
+ }
305
+ return \@out;
306
+ }
307
+
308
+ # Create mapping from the annotation IDs to column indexes. The mapping is used
309
+ # to determine which columns should be used from the annotation file. The
310
+ # following structure is returned:
311
+ # {
312
+ # CHROM => col_idx,
313
+ # FROM => col_idx,
314
+ # TO => col_idx,
315
+ # annots =>
316
+ # [
317
+ # { col=>col_idx, id=>annot_id, vcf_col=>vcf_column, is_flag=>0 },
318
+ # ]
319
+ # }
320
+ # If {annots}{is_flag} is nonzero, "annot_id" will be written to VCF instead of "annot_id=value".
321
+ # Currently only one VCF column (INFO) is supported.
322
+ #
323
+ sub parse_columns
324
+ {
325
+ my ($cols,$descriptions) = @_;
326
+
327
+ my %desc = ();
328
+ my %out = ( annots=>[] );
329
+
330
+ if ( !defined $cols ) { return \%out; }
331
+
332
+ for my $d (@$descriptions)
333
+ {
334
+ $desc{$$d{key}.'/'.$$d{ID}} = $d;
335
+ }
336
+
337
+ for (my $i=0; $i<@$cols; $i++)
338
+ {
339
+ my $col = $$cols[$i];
340
+
341
+ if ( $col eq '-' ) { next; }
342
+ elsif ( $col eq 'CHROM' ) { $out{$col}=$i; }
343
+ elsif ( $col eq 'FROM' ) { $out{$col}=$i; }
344
+ elsif ( $col eq 'POS' ) { $out{'FROM'}=$i; }
345
+ elsif ( $col eq 'TO' ) { $out{$col}=$i; }
346
+ elsif ( $col eq 'ID' ) { $out{$col}=$i; }
347
+ elsif ( $col eq 'REF' ) { $out{$col}=$i; }
348
+ elsif ( $col eq 'ALT' ) { $out{$col}=$i; }
349
+ else
350
+ {
351
+ if ( !exists($desc{$col}) && exists($desc{"INFO/$col"}) )
352
+ {
353
+ print STDERR qq[The description for "$col" does not exist, assuming "INFO/$col"\n];
354
+ $col = "INFO/$col";
355
+ }
356
+
357
+ if ( !exists($desc{$col}))
358
+ {
359
+ error("Missing the -d parameter for the column [$col]\n");
360
+ }
361
+ if ( !($col=~m{^(.+)/(.+)$}) ) { error("Could not parse the column [$col].\n"); }
362
+ my $key = $1;
363
+ my $id = $2;
364
+ my $rec = { col=>$i, id=>$id, vcf_col=>$key, is_flag=>($desc{$col}{Type} eq 'Flag' ? 1 : 0) };
365
+ push @{$out{annots}}, $rec;
366
+ if ( $key ne 'INFO' ) { error("TODO: other than INFO columns\n"); }
367
+ }
368
+ }
369
+ if ( !exists($out{CHROM}) ) { $out{CHROM}=0; }
370
+ if ( !exists($out{FROM}) ) { $out{FROM}=1; }
371
+ if ( !exists($out{TO}) ) { $out{TO}=$out{FROM}; }
372
+ if ( exists($out{REF}) && !exists($out{ALT}) or !exists($out{REF}) && exists($out{ALT}) ) { error("Expected both REF and ALT columns in the annotation file.\n"); }
373
+ return \%out;
374
+ }
375
+
376
+ sub annotate
377
+ {
378
+ my ($opts) = @_;
379
+
380
+ # Init the variables
381
+ my $descs = parse_descriptions($$opts{desc});
382
+ my $cols = parse_columns($$opts{cols},$descs);
383
+
384
+ # Open VCF file and add all required header lines
385
+ my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN);
386
+ $vcf->parse_header();
387
+ for my $desc (@$descs)
388
+ {
389
+ $vcf->add_header_line($desc,silent=>1);
390
+ }
391
+ $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp');
392
+ print $vcf->format_header();
393
+
394
+ my ($prev_chr,$prev_pos,$annot_from,$annot_to,$annot_line);
395
+ my @annots = @{$$cols{annots}};
396
+ my $id_col = exists($$cols{ID}) ? $$cols{ID} : undef;
397
+ my $from_col = $$cols{FROM};
398
+ my $to_col = $$cols{TO};
399
+ my $ref_col = exists($$cols{REF}) ? $$cols{REF} : undef;
400
+ my $alt_col = exists($$cols{ALT}) ? $$cols{ALT} : undef;
401
+
402
+ # Initialize the annotation reader
403
+ my $reader;
404
+ if ( exists($$opts{annotations}) )
405
+ {
406
+ $reader = Reader->new(file=>$$opts{annotations});
407
+ my $line = $vcf->next_line();
408
+ if ( !defined $line )
409
+ {
410
+ # VCF file is empty
411
+ undef $reader;
412
+ }
413
+ else
414
+ {
415
+ my @rec = split(/\t/,$line);
416
+ $prev_chr = $rec[0];
417
+ $prev_pos = $rec[1];
418
+ $vcf->_unread_line($line);
419
+ $reader->open(region=>$prev_chr);
420
+ }
421
+ }
422
+
423
+ while (defined $reader)
424
+ {
425
+ # Read next annotation group, i.e. all records with the same position (or overlapping in case of intervals)
426
+ my (@annot_lines,$annot_prev_from,$annot_prev_to);
427
+ while ($reader)
428
+ {
429
+ my $annot_line = $reader->next_line();
430
+ if ( !defined $annot_line ) { last; }
431
+ my $annot_from = $$annot_line[$from_col];
432
+ my $annot_to = $$annot_line[$to_col];
433
+ if ( !@annot_lines )
434
+ {
435
+ push @annot_lines, $annot_line;
436
+ $annot_prev_from = $annot_from;
437
+ $annot_prev_to = $annot_to;
438
+ next;
439
+ }
440
+ if ( $annot_from <= $annot_prev_to or $annot_to <= $annot_prev_to )
441
+ {
442
+ push @annot_lines, $annot_line;
443
+ if ( $annot_prev_to < $annot_to ) { $annot_prev_to = $annot_to; }
444
+ next;
445
+ }
446
+ $reader->unread_line($annot_line);
447
+ last;
448
+ }
449
+
450
+ # Now loop through the VCF records
451
+ my $line;
452
+ while ($line = $vcf->next_line())
453
+ {
454
+ my @rec = split(/\t/,$line);
455
+ my $chr = $rec[0];
456
+ my $pos = $rec[1];
457
+ chomp($rec[-1]);
458
+
459
+ if ( $chr ne $prev_chr )
460
+ {
461
+ $vcf->_unread_line($line);
462
+ $prev_chr = $chr;
463
+ $reader->open(region=>$prev_chr);
464
+ last;
465
+ }
466
+
467
+ # Quick position-based check: Is there an annotation for this record?
468
+ if ( !defined $annot_prev_from or $pos < $annot_prev_from )
469
+ {
470
+ output_line($opts,\@rec);
471
+ next;
472
+ }
473
+ if ( $pos > $annot_prev_to )
474
+ {
475
+ $vcf->_unread_line($line);
476
+ last;
477
+ }
478
+
479
+ # Initialize the REF,ALT-based check
480
+ my ($ref,%alt);
481
+ if ( defined $alt_col )
482
+ {
483
+ $ref = $rec[3];
484
+ %alt = map { $_=>1 } split(/,/,$rec[4]);
485
+ }
486
+
487
+ # Now fill the annotations; Existing annotations with the same tag will be overwritten
488
+ my %values = ();
489
+ my %ids = ();
490
+ for my $annot_line (@annot_lines)
491
+ {
492
+ # Skip annotation lines which are not relevant to this VCF record
493
+ if ( $$annot_line[$from_col] > $pos or $$annot_line[$to_col] < $pos ) { next; }
494
+ if ( defined $alt_col && $$annot_line[$ref_col] ne '.' )
495
+ {
496
+ if ( $$annot_line[$ref_col] ne $ref ) { next; }
497
+ if ( !exists($alt{$$annot_line[$alt_col]}) ) { next; }
498
+ }
499
+ for my $info (@annots)
500
+ {
501
+ my $val = $$annot_line[$$info{col}];
502
+ chomp($val);
503
+
504
+ if ( $val eq '' or $val eq '.' ) { $val=undef; } # Existing annotation should be removed
505
+ elsif ( $$info{is_flag} )
506
+ {
507
+ if ( $val ) { $val=''; } # Flag annotation should be added
508
+ else { $val=undef; } # Flag annotation should be removed
509
+ }
510
+
511
+ # Only single undef values can be present are overriden if defined in other records
512
+ if ( !defined $val && exists($values{$$info{id}}) ) { next; }
513
+ elsif ( exists($values{$$info{id}}) && !defined $values{$$info{id}}[0] )
514
+ {
515
+ $values{$$info{id}}[0] = $val;
516
+ next;
517
+ }
518
+ push @{$values{$$info{id}}}, $val;
519
+ }
520
+ if ( defined $id_col && $$annot_line[$id_col] ne '' ) { $ids{$$annot_line[$id_col]} = 1; }
521
+ }
522
+ if ( scalar keys %ids ) { $rec[2] = join(';', keys %ids); }
523
+ if ( scalar keys %values )
524
+ {
525
+ for my $key (keys %values)
526
+ {
527
+ # Cannot use join on undef values
528
+ $values{$key} = scalar @{$values{$key}} == 1 ? $values{$key}[0] : join(',', @{$values{$key}});
529
+ }
530
+ $rec[7] = $vcf->add_info_field($rec[7],%values);
531
+ }
532
+ output_line($opts,\@rec);
533
+ }
534
+ if ( !defined $line ) { last; }
535
+ }
536
+
537
+ # Finish the VCF, no annotations for this part
538
+ while (my $line=$vcf->next_line)
539
+ {
540
+ my @rec = split(/\t/,$line);
541
+ chomp($rec[-1]);
542
+ output_line($opts,\@rec);
543
+ }
544
+
545
+ # Output any lines left in the buffer
546
+ output_line($opts);
547
+ }
548
+
549
+
550
+ # Stage the lines and then apply filtering if requested, otherwise just print the line
551
+ sub output_line
552
+ {
553
+ my ($opts,$line) = @_;
554
+
555
+ if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) )
556
+ {
557
+ # No filters requested, print the line
558
+ print_line($line);
559
+ return;
560
+ }
561
+
562
+ if ( defined $line )
563
+ {
564
+ # Local filters return the line back immediately
565
+ if ( scalar keys %{$$opts{filters}} )
566
+ {
567
+ $line = apply_local_filters($opts,$line);
568
+ }
569
+ if ( exists($$opts{udef_filters}) )
570
+ {
571
+ $line = apply_user_defined_filters($opts,$line);
572
+ }
573
+ }
574
+
575
+ # Staging filters may return nothing or multiple lines. If $line is not defined, they will
576
+ # empty the buffers
577
+ my @lines;
578
+ if ( exists($$opts{filters}{SnpGap}) )
579
+ {
580
+ @lines = apply_snpgap_filter($opts,$line);
581
+ if ( defined $line && !scalar @lines ) { return; }
582
+ }
583
+ elsif ( defined $line ) { @lines=($line); }
584
+
585
+ if ( exists($$opts{filters}{GapWin}) )
586
+ {
587
+ my @tmp;
588
+ if ( !defined $line ) { push @lines,undef; }
589
+ for my $line (@lines)
590
+ {
591
+ push @tmp, apply_gapwin_filter($opts,$line);
592
+ }
593
+ @lines = @tmp;
594
+ }
595
+
596
+ if ( exists($$opts{SnpCluster_count}) )
597
+ {
598
+ my @tmp;
599
+ if ( !defined $line ) { push @lines,undef; }
600
+ for my $line (@lines)
601
+ {
602
+ push @tmp, apply_snpcluster_filter($opts,$line);
603
+ }
604
+ @lines = @tmp;
605
+ }
606
+
607
+ for my $line (@lines)
608
+ {
609
+ print_line($line);
610
+ }
611
+ }
612
+
613
+ sub apply_user_defined_filters
614
+ {
615
+ my ($opts,$line) = @_;
616
+
617
+ our($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF);
618
+ $CHROM = $$line[0];
619
+ $POS = $$line[1];
620
+ $FAIL = 1;
621
+ $PASS = 0;
622
+ $RECORD = $line;
623
+ $VCF = $$opts{vcf};
624
+
625
+ my %filters = ();
626
+ if ( $$opts{udef_filters_typecheck_needed} )
627
+ {
628
+ # Check if the line has an indel, SNP or both
629
+ for my $alt (split(/,/,$$line[4]))
630
+ {
631
+ my ($type,$len,$ht) = $$opts{vcf}->event_type($$line[3],$alt);
632
+ if ( exists($$opts{udef_filters}{$type}) )
633
+ {
634
+ %filters = ( %filters, %{$$opts{udef_filters}{$type}} );
635
+ }
636
+ }
637
+ # Return if the line does not have the wanted variant type
638
+ if ( !scalar %filters ) { return $line; }
639
+ }
640
+ else
641
+ {
642
+ %filters = %{$$opts{udef_filters}{all}};
643
+ }
644
+
645
+ my %apply;
646
+ for my $filter (values %filters)
647
+ {
648
+ if ( !($$line[7]=~$$filter{tag_re}) ) { next; }
649
+ $MATCH = $1;
650
+ $apply{ $$filter{name} } = &{$$filter{test}} == $PASS ? 0 : 1;
651
+ }
652
+ if ( scalar keys %apply )
653
+ {
654
+ $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply);
655
+ }
656
+
657
+ return $line;
658
+ }
659
+
660
+ sub apply_local_filters
661
+ {
662
+ my ($opts,$line) = @_;
663
+
664
+ if ( !defined $line ) { return; }
665
+
666
+ my $filters = $$opts{filters};
667
+ my %apply;
668
+
669
+ my ($dp,$dp_alt,$mq,$vdb);
670
+ if ( $$line[7]=~/DP4=(\d+),(\d+),(\d+),(\d+)/i )
671
+ {
672
+ $dp = $1 + $2 + $3 + $4;
673
+ $dp_alt = $3 + $4;
674
+ }
675
+ if ( $$line[7]=~/DP=(\d+)/i ) { $dp = $1; }
676
+ if ( $$line[7]=~/MQ=(\d+)/i ) { $mq = $1; }
677
+ if ( $$line[7]=~/VDB=([^;,\t]+)/i ) { $vdb = $1; }
678
+
679
+ if ( exists($$filters{RefN}) )
680
+ {
681
+ $apply{RefN} = ($$line[3]=~/N/) ? 1 : 0;
682
+ }
683
+ if ( exists($$filters{Qual}) && $$line[5] ne '.' )
684
+ {
685
+ $apply{Qual} = $$line[5] < $$filters{Qual} ? 1 : 0;
686
+ }
687
+ if ( exists($$filters{HWE}) && $$line[7]=~/G3=([^,]+),([^,]+),/ )
688
+ {
689
+ my ($rr,$ra);
690
+ $rr = $1;
691
+ $ra = $2;
692
+ $apply{HWE} = 0;
693
+ if ( $$line[7]=~/HWE=([^;\t]+)/ && $1<$$filters{HWE} )
694
+ {
695
+ my $p = 2*$rr + $ra;
696
+ if ( $p>0 && $p<1 && $ra/($p*(1-$p))>1 )
697
+ {
698
+ $apply{HWE} = 1;
699
+ }
700
+ }
701
+ }
702
+ if ( defined $dp_alt )
703
+ {
704
+ if ( exists($$filters{MinAB}) )
705
+ {
706
+ $apply{MinAB} = $dp_alt < $$filters{MinAB} ? 1 : 0;
707
+ }
708
+ }
709
+ if ( defined $vdb )
710
+ {
711
+ $apply{VDB} = $vdb < $$filters{VDB} ? 1 : 0;
712
+ }
713
+ if ( defined $dp )
714
+ {
715
+ if ( exists($$filters{MinDP}) )
716
+ {
717
+ $apply{MinDP} = $dp < $$filters{MinDP} ? 1 : 0;
718
+ }
719
+ if ( exists($$filters{MaxDP}) )
720
+ {
721
+ $apply{MaxDP} = $dp > $$filters{MaxDP} ? 1 : 0;
722
+ }
723
+ }
724
+ if ( defined $mq )
725
+ {
726
+ if ( exists($$filters{MinMQ}) )
727
+ {
728
+ $apply{MinMQ} = $mq < $$filters{MinMQ} ? 1 : 0;
729
+ }
730
+ }
731
+ if ( $$line[7]=~/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ )
732
+ {
733
+ if ( exists($$filters{StrandBias}) )
734
+ {
735
+ $apply{StrandBias} = $1 < $$filters{StrandBias} ? 1 : 0;
736
+ }
737
+ if ( exists($$filters{BaseQualBias}) )
738
+ {
739
+ $apply{BaseQualBias} = $2 < $$filters{BaseQualBias} ? 1 : 0;
740
+ }
741
+ if ( exists($$filters{MapQualBias}) )
742
+ {
743
+ $apply{MapQualBias} = $3 < $$filters{MapQualBias} ? 1 : 0;
744
+ }
745
+ if ( exists($$filters{EndDistBias}) )
746
+ {
747
+ $apply{EndDistBias} = $4 < $$filters{EndDistBias} ? 1 : 0;
748
+ }
749
+ }
750
+ if ( scalar keys %apply )
751
+ {
752
+ $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply);
753
+ }
754
+ return $line;
755
+ }
756
+
757
+ sub apply_snpgap_filter
758
+ {
759
+ my ($opts,$line) = @_;
760
+ if ( !exists($$opts{SnpGap_buffer}) ) { $$opts{SnpGap_buffer}=[]; }
761
+
762
+ my $vcf = $$opts{vcf};
763
+ my $win = $$opts{filters}{SnpGap};
764
+ my $buffer = $$opts{SnpGap_buffer};
765
+ my ($indel_chr,$indel_pos,$to);
766
+
767
+ if ( defined $line )
768
+ {
769
+ # There may be multiple variants, look for an indel. Anything what is not ref can be filtered.
770
+ my $is_indel = 0;
771
+ my $can_be_filtered = 0;
772
+ for my $alt (split(/,/,$$line[4]))
773
+ {
774
+ my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
775
+ if ( $type eq 'i' )
776
+ {
777
+ $is_indel = 1;
778
+ $indel_chr = $$line[0];
779
+ $indel_pos = $$line[1]+1;
780
+ }
781
+ elsif ( $type ne 'r' ) { $can_be_filtered = 1; }
782
+ }
783
+ # The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not
784
+ # correct as the indel can begin anywhere in the VCF4.x record with
785
+ # respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA
786
+ # ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove
787
+ # some valid SNPs.
788
+ $to = $is_indel ? $$line[1]+length($$line[3])-1 : $$line[1];
789
+ push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, can_be_filtered=>$can_be_filtered, is_indel=>$is_indel };
790
+ }
791
+
792
+ my $n = @$buffer;
793
+
794
+ # Is the new line an indel? If yes, check the distance to all previous lines
795
+ if ( defined $indel_chr )
796
+ {
797
+ for (my $i=0; $i<$n-1; $i++)
798
+ {
799
+ my $buf = $$buffer[$i];
800
+ if ( $$buf{chr} ne $indel_chr ) { next; }
801
+ if ( !$$buf{can_be_filtered} ) { next; }
802
+ if ( $$buf{is_indel} ) { next; }
803
+ if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; }
804
+ }
805
+ }
806
+
807
+ if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} )
808
+ {
809
+ # There are not enough rows in the buffer: the SnpGap window spans them all. Wait until there is more rows
810
+ # or a new chromosome
811
+ return ();
812
+ }
813
+
814
+ # 'Look-behind' filtering was done above, now comes 'look-ahead' filtering
815
+ my $indel_to;
816
+ for (my $i=0; $i<$n; $i++)
817
+ {
818
+ my $buf = $$buffer[$i];
819
+ if ( $$buf{is_indel} )
820
+ {
821
+ $indel_to = $$buf{to};
822
+ $indel_chr = $$buf{chr};
823
+ next;
824
+ }
825
+ if ( !defined $indel_to ) { next; }
826
+ if ( !$$buf{can_be_filtered} ) { next; }
827
+ if ( $$buf{chr} ne $indel_chr )
828
+ {
829
+ undef $indel_to;
830
+ next;
831
+ }
832
+ if ( $$buf{from}<=$indel_to+$win ) { $$buf{exclude}=1; }
833
+ }
834
+
835
+ # Output. If no $line was given, output everything
836
+ $to = $$buffer[-1]{from}-$win;
837
+ my $chr = $$buffer[-1]{chr};
838
+ my @out;
839
+ while (@$buffer)
840
+ {
841
+ if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; }
842
+
843
+ my $buf = shift(@$buffer);
844
+ if ( $$buf{exclude} )
845
+ {
846
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>1);
847
+ }
848
+ else
849
+ {
850
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>0);
851
+ }
852
+ push @out,$$buf{line};
853
+ }
854
+ return @out;
855
+ }
856
+
857
+
858
+ sub apply_gapwin_filter
859
+ {
860
+ my ($opts,$line) = @_;
861
+ if ( !exists($$opts{GapWin_buffer}) ) { $$opts{GapWin_buffer}=[]; }
862
+
863
+ my $vcf = $$opts{vcf};
864
+ my $win = $$opts{filters}{GapWin};
865
+ my $buffer = $$opts{GapWin_buffer};
866
+ my ($indel_chr,$indel_pos,$to);
867
+
868
+ if ( defined $line )
869
+ {
870
+ # There may be multiple variants, only indels can be filtered
871
+ my $is_indel = 0;
872
+ for my $alt (split(/,/,$$line[4]))
873
+ {
874
+ my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
875
+ if ( $type eq 'i' )
876
+ {
877
+ $is_indel = 1;
878
+ $indel_chr = $$line[0];
879
+ $indel_pos = $$line[1] + 1;
880
+ }
881
+ }
882
+ # The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not
883
+ # correct as the indel can begin anywhere in the VCF4.x record with
884
+ # respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA
885
+ # ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove
886
+ # some valid SNPs.
887
+ $to = $is_indel ? $$line[1]+length($$line[3])-1 : $$line[1];
888
+ push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, is_indel=>$is_indel };
889
+ }
890
+
891
+ my $n = @$buffer;
892
+
893
+ # Is the new line an indel? If yes, check the distance to all previous lines
894
+ if ( defined $indel_chr )
895
+ {
896
+ for (my $i=0; $i<$n-1; $i++)
897
+ {
898
+ my $buf = $$buffer[$i];
899
+ if ( $$buf{chr} ne $indel_chr ) { next; }
900
+ if ( !$$buf{is_indel} ) { next; }
901
+ if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; $$buffer[-1]{exclude}=1; }
902
+ }
903
+ }
904
+
905
+ if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} )
906
+ {
907
+ # There are not enough rows in the buffer: the GapWin window spans them all. Wait until there is more rows
908
+ # or a new chromosome
909
+ return ();
910
+ }
911
+
912
+ # Output. If no $line was given, output everything
913
+ $to = $$buffer[-1]{from}-$win;
914
+ my $chr = $$buffer[-1]{chr};
915
+ my @out;
916
+ while (@$buffer)
917
+ {
918
+ if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; }
919
+
920
+ my $buf = shift(@$buffer);
921
+ if ( $$buf{exclude} )
922
+ {
923
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>1);
924
+ }
925
+ else
926
+ {
927
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>0);
928
+ }
929
+ push @out,$$buf{line};
930
+ }
931
+ return @out;
932
+ }
933
+
934
+
935
+ sub apply_snpcluster_filter
936
+ {
937
+ my ($opts,$line) = @_;
938
+
939
+ my $buffer = $$opts{SnpCluster_buffer};
940
+ my $n = @$buffer;
941
+
942
+ # The buffer is empty and the line contains only reference alleles, print directly
943
+ if ( $n==0 && defined $line && $$line[4] eq '.' )
944
+ {
945
+ $$line[6] = $$opts{vcf}->add_filter($$line[6],'SnpCluster'=>0);
946
+ return $line;
947
+ }
948
+
949
+ # Store the line in buffer and check how many lines can be printed
950
+ my $to; # All lines up to and including this index will be printed
951
+ my $win = $$opts{SnpCluster_win};
952
+ if ( defined $line )
953
+ {
954
+ # Exclude REF (and maybe also other filters?) form SnpCluster
955
+ my $can_be_filtered = $$line[4] eq '.' ? 0 : 1;
956
+ push @$buffer, { line=>$line, chr=>$$line[0], pos=>$$line[1], can_be_filtered=>$can_be_filtered, in_cluster=>0 };
957
+ $n++;
958
+
959
+ # Does the buffer hold enough lines now?
960
+ my $last_chr = $$buffer[-1]{chr};
961
+ my $last_pos = $$buffer[-1]{pos};
962
+ for (my $i=$n-1; $i>=0; $i--)
963
+ {
964
+ my $buf = $$buffer[$i];
965
+ if ( $$buf{chr} ne $last_chr ) { $to=$i; last; }
966
+ if ( $last_pos - $$buf{pos} >= $win ) { $to=$i; last; }
967
+ }
968
+
969
+ if ( !defined $to ) { return; }
970
+ }
971
+ if ( !defined $to ) { $to=$n-1; }
972
+
973
+ # Calculate the number of variants within the window
974
+ my $count = 0;
975
+ my $max_count = $$opts{SnpCluster_count};
976
+ my $start_chr = $$buffer[0]{chr};
977
+ my $start_pos = $$buffer[0]{pos};
978
+ my $idx;
979
+ for ($idx=0; $idx<$n; $idx++)
980
+ {
981
+ my $buf = $$buffer[$idx];
982
+ if ( $$buf{chr} ne $start_chr ) { last; }
983
+ if ( $$buf{pos} - $win >= $start_pos ) { last; }
984
+ if ( $$buf{can_be_filtered} ) { $count++; }
985
+ }
986
+
987
+ # If a SNP cluster was found, set the in_cluster flag for all relevant sites.
988
+ # The buffer will be flushed and the orphans would pass unnoticed.
989
+ if ( $count>=$max_count )
990
+ {
991
+ for (my $i=0; $i<$idx; $i++)
992
+ {
993
+ if ( $$buffer[$i]{can_be_filtered} ) { $$buffer[$i]{in_cluster}=1; }
994
+ }
995
+ }
996
+
997
+ # Now output the lines, adding or removing the filter
998
+ my @out = ();
999
+ for (my $i=0; $i<=$to; $i++)
1000
+ {
1001
+ my $buf = shift(@$buffer);
1002
+ if ( $$buf{in_cluster} )
1003
+ {
1004
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>1);
1005
+ }
1006
+ else
1007
+ {
1008
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0);
1009
+ }
1010
+ push @out,$$buf{line};
1011
+ }
1012
+
1013
+ # Output all non-variant lines at the beggining of the buffer
1014
+ while (@$buffer)
1015
+ {
1016
+ if ( $$buffer[0]{can_be_filtered} ) { last; }
1017
+ my $buf = shift(@$buffer);
1018
+ $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0);
1019
+ push @out,$$buf{line};
1020
+ }
1021
+ return @out;
1022
+ }
1023
+
1024
+ sub print_line
1025
+ {
1026
+ my ($line) = @_;
1027
+ if ( defined $line ) { print join("\t",@$line) . "\n"; }
1028
+ }
1029
+
1030
+
1031
+
1032
+ #---------------------------------
1033
+
1034
+ package Reader;
1035
+
1036
+ use strict;
1037
+ use warnings;
1038
+ use Carp;
1039
+
1040
+ sub new
1041
+ {
1042
+ my ($class,@args) = @_;
1043
+ my $self = @args ? {@args} : {};
1044
+ bless $self, ref($class) || $class;
1045
+ if ( !$$self{delim} ) { $$self{delim} = qr/\t/; }
1046
+ if ( !$$self{chr} ) { $$self{chr} = 0; } # the index of the chromosome column (indexed from 0)
1047
+ if ( !$$self{from} ) { $$self{from} = 1; } # the index of the from column
1048
+ if ( !$$self{to} ) { $$self{to} = 2; } # the index of the to column
1049
+ return $self;
1050
+ }
1051
+
1052
+ sub throw
1053
+ {
1054
+ my ($self,@msg) = @_;
1055
+ confess @msg;
1056
+ }
1057
+
1058
+ sub open
1059
+ {
1060
+ my ($self,%args) = @_;
1061
+ if ( !$$self{file} ) { return; }
1062
+ $self->close();
1063
+ open($$self{fh},"tabix $$self{file} $args{region} |") or $self->throw("tabix $$self{file}: $!");
1064
+ }
1065
+
1066
+ sub close
1067
+ {
1068
+ my ($self) = @_;
1069
+ if ( !$$self{fh} ) { return; }
1070
+ close($$self{fh});
1071
+ delete($$self{fh});
1072
+ delete($$self{buffer});
1073
+ }
1074
+
1075
+ sub unread_line
1076
+ {
1077
+ my ($self,$line) = @_;
1078
+ unshift @{$$self{buffer}}, $line;
1079
+ return;
1080
+ }
1081
+
1082
+ sub next_line
1083
+ {
1084
+ my ($self) = @_;
1085
+ if ( !$$self{fh} ) { return undef; } # Run in dummy mode
1086
+ if ( $$self{buffer} && @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
1087
+ my $line;
1088
+ # Skip comments
1089
+ while (1)
1090
+ {
1091
+ $line = readline($$self{fh});
1092
+ if ( !defined $line ) { return undef; }
1093
+ if ( $line=~/^#/ ) { next; }
1094
+ last;
1095
+ }
1096
+ my @items = split($$self{delim},$line);
1097
+ return \@items;
1098
+ }
1099
+