ngs_server 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1215 @@
1
+ /*
2
+ * vcf_file_filters.cpp
3
+ *
4
+ * Created on: Aug 28, 2009
5
+ * Author: Adam Auton
6
+ * ($Revision: 148 $)
7
+ */
8
+
9
+ #include "vcf_file.h"
10
+
11
+ void vcf_file::apply_filters(const parameters &params)
12
+ {
13
+ printLOG("Applying Required Filters.\n");
14
+ // Apply all filters in turn.
15
+ filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_file, params.indv_exclude_file);
16
+ filter_sites(params.snps_to_keep, params.snps_to_keep_file, params.snps_to_exclude_file);
17
+ filter_sites_by_filter_status(params.site_filter_flags_to_exclude, params.site_filter_flags_to_keep, params.remove_all_filtered_sites);
18
+ filter_sites_by_position(params.chr_to_keep, params.start_pos, params.end_pos);
19
+ filter_sites_by_positions(params.positions_file);
20
+ filter_sites_by_BED_file(params.BED_file, params.BED_exclude);
21
+ filter_sites_by_number_of_alleles(params.min_alleles, params.max_alleles);
22
+ filter_sites_by_INFO_flags(params.site_INFO_flags_to_remove, params.site_INFO_flags_to_keep);
23
+ filter_sites_by_quality(params.min_quality);
24
+ filter_sites_by_mean_depth(params.min_mean_depth, params.max_mean_depth);
25
+ filter_sites_by_mask(params.mask_file, params.invert_mask, params.min_kept_mask_value);
26
+ filter_individuals_by_mean_depth(params.min_indv_mean_depth, params.max_indv_mean_depth);
27
+ if (params.phased_only == true)
28
+ {
29
+ filter_individuals_by_phase();
30
+ filter_sites_by_phase();
31
+ }
32
+ filter_genotypes_by_quality(params.min_genotype_quality);
33
+ filter_genotypes_by_depth(params.min_genotype_depth, params.max_genotype_depth);
34
+ filter_genotypes_by_filter_flag(params.geno_filter_flags_to_exclude, params.remove_all_filtered_genotypes);
35
+ filter_individuals_by_call_rate(params.min_indv_call_rate);
36
+ filter_individuals_randomly(params.max_N_indv);
37
+ filter_sites_by_frequency_and_call_rate(params.min_maf, params.max_maf, params.min_non_ref_af, params.max_non_ref_af, params.min_site_call_rate);
38
+ filter_sites_by_allele_count(params.min_mac, params.max_mac, params.min_non_ref_ac, params.max_non_ref_ac, params.max_missing_call_count);
39
+ filter_sites_by_HWE_pvalue(params.min_HWE_pvalue);
40
+ filter_sites_by_thinning(params.min_interSNP_distance);
41
+ }
42
+
43
+ void vcf_file::filter_genotypes_by_quality(double min_genotype_quality)
44
+ {
45
+ // Filter genotypes by quality
46
+ if ((min_genotype_quality <= 0) || (has_genotypes == false))
47
+ return;
48
+
49
+ if (has_genotypes == false)
50
+ error("Require Genotypes in VCF file in order to filter genotypes by Quality.");
51
+
52
+ printLOG("Filtering out Genotypes with Quality less than " + dbl2str(min_genotype_quality,0) + "\n");
53
+ string vcf_line;
54
+ vcf_entry e(N_indv);
55
+ for (unsigned int s=0; s<N_entries; s++)
56
+ {
57
+ if (include_entry[s] == false)
58
+ continue;
59
+
60
+ get_vcf_entry(s, vcf_line);
61
+ e.reset(vcf_line);
62
+ e.parse_genotype_entries(false, true);
63
+ e.filter_genotypes_by_quality(include_genotype[s], min_genotype_quality);
64
+ }
65
+ }
66
+
67
+ void vcf_file::filter_genotypes_by_depth(int min_depth, int max_depth)
68
+ {
69
+ // Filter genotypes by depth
70
+ if ((min_depth <= 0) && (max_depth == numeric_limits<int>::max()))
71
+ return;
72
+ if (has_genotypes == false)
73
+ error("Require Genotypes in VCF file in order to filter genotypes by Depth.");
74
+
75
+ printLOG("Filtering out Genotypes with Depth less than " + dbl2str(min_depth,0) + " and greater than " + dbl2str(max_depth, 0) + "\n");
76
+ string vcf_line;
77
+ vcf_entry e(N_indv);
78
+ for (unsigned int s=0; s<N_entries; s++)
79
+ {
80
+ if (include_entry[s] == false)
81
+ continue;
82
+
83
+ get_vcf_entry(s, vcf_line);
84
+ e.reset(vcf_line);
85
+ e.parse_genotype_entries(false, false, true);
86
+ e.filter_genotypes_by_depth(include_genotype[s], min_depth, max_depth);
87
+ }
88
+ }
89
+
90
+ void vcf_file::filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all)
91
+ {
92
+ // Filter genotypes by Filter Flags
93
+ if ((remove_all == false) && (filter_flags_to_remove.size() == 0))
94
+ return;
95
+ if (remove_all == true)
96
+ printLOG("Filtering out all genotypes with FILTER flag.\n");
97
+ else
98
+ printLOG("Filtering out genotypes by Filter Status.\n");
99
+
100
+ if (has_genotypes == false)
101
+ error("Require Genotypes in VCF file in order to filter genotypes by Filter Flag.");
102
+
103
+ string vcf_line;
104
+ vcf_entry e(N_indv);
105
+ for (unsigned int s=0; s<N_entries; s++)
106
+ {
107
+ if (include_entry[s] == false)
108
+ continue;
109
+
110
+ get_vcf_entry(s, vcf_line);
111
+ e.reset(vcf_line);
112
+ e.parse_genotype_entries(false, false, false, true);
113
+ e.filter_genotypes_by_filter_status(include_genotype[s], filter_flags_to_remove, remove_all);
114
+ }
115
+ }
116
+
117
+
118
+ void vcf_file::filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude)
119
+ {
120
+ // Filter individuals by user provided lists
121
+ if (keep_then_exclude)
122
+ {
123
+ filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
124
+ filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
125
+ }
126
+ else
127
+ {
128
+ filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
129
+ filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
130
+ }
131
+ }
132
+
133
+ void vcf_file::filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename)
134
+ {
135
+ // Filter individuals by user provided list
136
+ if ((indv_to_keep_filename == "") && (indv_to_keep.size() == 0))
137
+ return;
138
+ printLOG("Keeping individuals in 'keep' list\n");
139
+ set<string> indv_to_keep_copy = indv_to_keep;
140
+ if (indv_to_keep_filename != "")
141
+ {
142
+ ifstream infile(indv_to_keep_filename.c_str());
143
+ if (!infile.is_open())
144
+ error("Could not open Individual file:" + indv_to_keep_filename, 1);
145
+ string line;
146
+ string tmp_indv;
147
+ stringstream ss;
148
+ while (!infile.eof())
149
+ {
150
+ getline(infile, line);
151
+ ss.str(line);
152
+ ss >> tmp_indv;
153
+ indv_to_keep_copy.insert(tmp_indv);
154
+ ss.clear();
155
+ }
156
+ infile.close();
157
+ }
158
+
159
+ for (unsigned int ui=0; ui<N_indv; ui++)
160
+ {
161
+ if (include_indv[ui] == false)
162
+ continue;
163
+ if (indv_to_keep_copy.find(indv[ui]) == indv_to_keep_copy.end())
164
+ include_indv[ui] = false;
165
+ }
166
+ }
167
+
168
+ void vcf_file::filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename)
169
+ {
170
+ // Filter individuals by user provided list
171
+ if ((indv_to_exclude_filename == "") && (indv_to_exclude.size() == 0))
172
+ return;
173
+ printLOG("Excluding individuals in 'exclude' list\n");
174
+ set<string> indv_to_exclude_copy = indv_to_exclude;
175
+ if (indv_to_exclude_filename != "")
176
+ {
177
+ ifstream infile(indv_to_exclude_filename.c_str());
178
+ if (!infile.is_open())
179
+ {
180
+ error("Could not open Individual file:" + indv_to_exclude_filename, 1);
181
+ }
182
+ string line;
183
+ string tmp_indv;
184
+ stringstream ss;
185
+ while (!infile.eof())
186
+ {
187
+ getline(infile, line);
188
+ ss.str(line);
189
+ ss >> tmp_indv;
190
+ indv_to_exclude_copy.insert(tmp_indv);
191
+ ss.clear();
192
+ }
193
+ infile.close();
194
+ }
195
+ for (unsigned int ui=0; ui<N_indv; ui++)
196
+ {
197
+ if (include_indv[ui] == false)
198
+ continue;
199
+ if (indv_to_exclude_copy.find(indv[ui]) != indv_to_exclude_copy.end())
200
+ include_indv[ui] = false;
201
+ }
202
+ }
203
+
204
+ void vcf_file::filter_individuals_by_call_rate(double min_call_rate)
205
+ {
206
+ // Filter individuals by call rate
207
+ if (min_call_rate <= 0.0)
208
+ return;
209
+
210
+ if (has_genotypes == false)
211
+ error("Require Genotypes in VCF file in order to filter individuals by call rate.");
212
+
213
+ printLOG("Filtering individuals by call rate\n");
214
+
215
+ unsigned int ui;
216
+ pair<int, int> genotype;
217
+ vector<int> N_sites_included(N_indv, 0);
218
+ vector<int> N_missing(N_indv, 0);
219
+ string vcf_line;
220
+ vcf_entry e(N_indv);
221
+ for (unsigned int s=0; s<N_entries; s++)
222
+ {
223
+ if (include_entry[s] == false)
224
+ continue;
225
+
226
+ get_vcf_entry(s, vcf_line);
227
+ e.reset(vcf_line);
228
+ for (ui=0; ui<N_indv; ui++)
229
+ {
230
+ if (include_indv[ui] == false)
231
+ continue;
232
+
233
+ if (include_genotype[s][ui] == true)
234
+ {
235
+ e.parse_genotype_entry(ui, true);
236
+ e.get_indv_GENOTYPE_ids(ui, genotype);
237
+ if (genotype.first != -1)
238
+ {
239
+ N_missing[ui]++;
240
+ }
241
+ N_sites_included[ui]++;
242
+ }
243
+ }
244
+ }
245
+
246
+ for (ui=0; ui<N_indv; ui++)
247
+ {
248
+ if (include_indv[ui] == false)
249
+ continue;
250
+
251
+ double call_rate = N_missing[ui] / (double)N_sites_included[ui];
252
+ if (call_rate < min_call_rate)
253
+ include_indv[ui] = false;
254
+ }
255
+ }
256
+
257
+ void vcf_file::filter_individuals_by_mean_depth(double min_mean_depth, double max_mean_depth)
258
+ {
259
+ // Filter individuals by mean depth across sites
260
+ if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits<double>::max()))
261
+ return;
262
+
263
+ if (has_genotypes == false)
264
+ error("Require Genotypes in VCF file in order to filter individuals by mean depth");
265
+
266
+ printLOG("Filtering individuals by mean depth\n");
267
+ unsigned int ui;
268
+
269
+ vector<int> N_sites_included(N_indv, 0);
270
+ vector<double> depth_sum(N_indv,0.0);
271
+ int depth;
272
+ string vcf_line;
273
+ vcf_entry e(N_indv);
274
+ for (unsigned int s=0; s<N_entries; s++)
275
+ {
276
+ if (include_entry[s] == false)
277
+ continue;
278
+
279
+ get_vcf_entry(s, vcf_line);
280
+ e.reset(vcf_line);
281
+
282
+ for (ui=0; ui<N_indv; ui++)
283
+ {
284
+ if (include_indv[ui] == false)
285
+ continue;
286
+ if (include_genotype[s][ui] == true)
287
+ {
288
+ e.parse_genotype_entry(ui, false, false, true);
289
+ depth = e.get_indv_DEPTH(ui);
290
+ if (depth >= 0)
291
+ {
292
+ depth_sum[ui] += depth;
293
+ N_sites_included[ui]++;
294
+ }
295
+ }
296
+ }
297
+ }
298
+
299
+ for (ui=0; ui<N_indv; ui++)
300
+ {
301
+ if (include_indv[ui] == false)
302
+ continue;
303
+ double mean_depth = depth_sum[ui] / N_sites_included[ui];
304
+ if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth))
305
+ include_indv[ui] = false;
306
+ }
307
+ }
308
+
309
+ void vcf_file::filter_individuals_by_phase()
310
+ {
311
+ // Filter individuals that are completely unphased.
312
+ // TODO: Alter this to allow for a max/min level of unphased-ness.
313
+ printLOG("Filtering Unphased Individuals\n");
314
+
315
+ if (has_genotypes == false)
316
+ error("Require Genotypes in VCF file to filter by Phase.");
317
+
318
+ unsigned int ui, s;
319
+ vector<unsigned int> indv_count(N_indv, 0);
320
+ vector<unsigned int> indv_count_unphased(N_indv, 0);
321
+ string vcf_line;
322
+ vcf_entry e(N_indv);
323
+ for (s=0; s<N_entries; s++)
324
+ {
325
+ if (include_entry[s] == false)
326
+ continue;
327
+
328
+ get_vcf_entry(s, vcf_line);
329
+ e.reset(vcf_line);
330
+
331
+ for (ui=0; ui<N_indv; ui++)
332
+ {
333
+ if (include_indv[ui] == false)
334
+ continue;
335
+
336
+ e.parse_genotype_entry(ui, true);
337
+
338
+ indv_count[ui]++;
339
+ if (e.get_indv_PHASE(ui) != '|')
340
+ indv_count_unphased[ui]++;
341
+ }
342
+ }
343
+
344
+ for (ui=0; ui<N_indv; ui++)
345
+ {
346
+ if (include_indv[ui] == false)
347
+ continue;
348
+
349
+ if (indv_count_unphased[ui] == indv_count[ui])
350
+ {
351
+ include_indv[ui] = false;
352
+ }
353
+ }
354
+ }
355
+
356
+ void vcf_file::filter_individuals_randomly(int max_N_indv)
357
+ {
358
+ // Filter individuals randomly until have a random subset
359
+ if (max_N_indv < 0)
360
+ return;
361
+ printLOG("Filtering Individuals Randomly\n");
362
+
363
+ if (has_genotypes == false)
364
+ error("Require Genotypes in VCF file filter individuals.");
365
+
366
+ unsigned int N_kept_indv = N_kept_individuals();
367
+
368
+ srand ( time(NULL) );
369
+ vector<unsigned int> keep_index(N_kept_indv);
370
+ int count = 0;
371
+ for (unsigned int ui=0; ui<N_indv; ui++)
372
+ {
373
+ if (include_indv[ui] == true)
374
+ {
375
+ keep_index[count] = ui;
376
+ count++;
377
+ }
378
+ }
379
+
380
+ random_shuffle(keep_index.begin(), keep_index.end()); // Get a random order
381
+ keep_index.resize(min(max_N_indv, (signed)keep_index.size())); // Only keep a subset
382
+
383
+ for (unsigned int ui=0; ui<N_indv; ui++)
384
+ {
385
+ if (include_indv[ui] == false)
386
+ continue;
387
+ bool found = false;
388
+ for (unsigned int uj=0; uj<keep_index.size(); uj++)
389
+ {
390
+ if (keep_index[uj] == ui)
391
+ {
392
+ found = true;
393
+ }
394
+ }
395
+ if (found == false)
396
+ include_indv[ui] = false;
397
+ }
398
+ }
399
+
400
+
401
+ void vcf_file::filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude)
402
+ {
403
+ // Filter sites by user provided lists
404
+ if (keep_then_exclude)
405
+ {
406
+ filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
407
+ filter_sites_to_exclude(snps_to_exclude_file);
408
+ }
409
+ else
410
+ {
411
+ filter_sites_to_exclude(snps_to_exclude_file);
412
+ filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
413
+ }
414
+ }
415
+
416
+ void vcf_file::filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file)
417
+ {
418
+ // Filter sites by user provided list
419
+ if ((snps_to_keep.size() == 0) && (snps_to_keep_file == ""))
420
+ return;
421
+
422
+ set<string> local_snps_to_keep = snps_to_keep;
423
+
424
+ printLOG("Keeping sites by user-supplied list\n");
425
+
426
+ if (snps_to_keep_file != "")
427
+ {
428
+ ifstream in(snps_to_keep_file.c_str());
429
+ string tmp;
430
+ if (!in.is_open())
431
+ {
432
+ error("Could not open SNPs to Keep file" + snps_to_keep_file, 0);
433
+ }
434
+ while (!in.eof())
435
+ {
436
+ in >> tmp;
437
+ local_snps_to_keep.insert(tmp);
438
+ in.ignore(numeric_limits<streamsize>::max(), '\n');
439
+ }
440
+
441
+ in.close();
442
+ }
443
+
444
+ string vcf_line;
445
+ for (unsigned int s=0; s<N_entries; s++)
446
+ {
447
+ if (include_entry[s] == false)
448
+ continue;
449
+
450
+ get_vcf_entry(s, vcf_line);
451
+ vcf_entry e(N_indv, vcf_line);
452
+ e.parse_basic_entry();
453
+ if (local_snps_to_keep.find(e.get_ID()) == local_snps_to_keep.end())
454
+ include_entry[s] = false;
455
+ }
456
+ }
457
+
458
+ void vcf_file::filter_sites_to_exclude(const string &snps_to_exclude_file)
459
+ {
460
+ // Filter sites by user provided list
461
+ if (snps_to_exclude_file == "")
462
+ return;
463
+
464
+ printLOG("Excluding sites by user-supplied list\n");
465
+
466
+ set<string> snps_to_exclude;
467
+ if (snps_to_exclude_file != "")
468
+ {
469
+ ifstream in(snps_to_exclude_file.c_str());
470
+ string tmp;
471
+ if (!in.is_open())
472
+ {
473
+ error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0);
474
+ }
475
+ while (!in.eof())
476
+ {
477
+ in >> tmp;
478
+ snps_to_exclude.insert(tmp);
479
+ in.ignore(numeric_limits<streamsize>::max(), '\n');
480
+ }
481
+ in.close();
482
+ }
483
+
484
+ string vcf_line;
485
+ for (unsigned int s=0; s<N_entries; s++)
486
+ {
487
+ if (include_entry[s] == false)
488
+ continue;
489
+
490
+ get_vcf_entry(s, vcf_line);
491
+ vcf_entry e(N_indv, vcf_line);
492
+ e.parse_basic_entry();
493
+ if (snps_to_exclude.find(e.get_ID()) != snps_to_exclude.end())
494
+ include_entry[s] = false;
495
+ }
496
+ }
497
+
498
+ void vcf_file::filter_sites_by_quality(double min_quality)
499
+ {
500
+ // Filter sites by quality
501
+ if (min_quality < 0)
502
+ return;
503
+
504
+ printLOG("Filtering sites with Quality less than " + dbl2str(min_quality,0) + "\n");
505
+
506
+ unsigned int s;
507
+ string vcf_line;
508
+ for (s=0; s<N_entries; s++)
509
+ {
510
+ if (include_entry[s] == false)
511
+ continue;
512
+ get_vcf_entry(s, vcf_line);
513
+ vcf_entry e(N_indv, vcf_line);
514
+ e.parse_basic_entry(true);
515
+ string alt_allele = e.get_ALT_allele(0);
516
+ // The QUAL field has different definitions depending on the state of the
517
+ // alternative allele. Here I treat them separately, although in this case
518
+ // it is unnecessary.
519
+ if ((alt_allele == ".") || (alt_allele == ""))
520
+ { // The case that the alternative allele is unknown
521
+ // QUAL is -10log_10 p(variant)
522
+ if (e.get_QUAL() < min_quality)
523
+ include_entry[s] = false;
524
+ }
525
+ else
526
+ { // The normal case
527
+ // QUAL is -10log_10 p(no variant)
528
+ if (e.get_QUAL() < min_quality)
529
+ include_entry[s] = false;
530
+ }
531
+ }
532
+ }
533
+
534
+ void vcf_file::filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth)
535
+ {
536
+ // Filter sites by mean depth
537
+ if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits<double>::max()))
538
+ return;
539
+
540
+ if (has_genotypes == false)
541
+ error("Require Genotypes in VCF file in order to filter sites by mean depth");
542
+
543
+ printLOG("Filtering sites by mean depth\n");
544
+ int depth;
545
+
546
+ string vcf_line;
547
+ for (unsigned int s=0; s<N_entries; s++)
548
+ {
549
+ if (include_entry[s] == false)
550
+ continue;
551
+
552
+ get_vcf_entry(s, vcf_line);
553
+ vcf_entry e(N_indv, vcf_line);
554
+
555
+ unsigned int N_indv_included = 0;
556
+ double depth_sum = 0.0;
557
+ for (unsigned int ui=0; ui<N_indv; ui++)
558
+ {
559
+ if (include_indv[ui] == false)
560
+ continue;
561
+
562
+ if (include_genotype[s][ui] == true)
563
+ {
564
+ e.parse_genotype_entry(ui, false, false, true);
565
+ depth = e.get_indv_DEPTH(ui);
566
+ if (depth >= 0)
567
+ {
568
+ depth_sum += depth;
569
+ }
570
+ N_indv_included++;
571
+ }
572
+ }
573
+ double mean_depth = depth_sum / N_indv_included;
574
+
575
+ if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth))
576
+ include_entry[s] = false;
577
+ }
578
+ }
579
+
580
+ void vcf_file::filter_sites_by_position(const string &chr, int start_pos, int end_pos)
581
+ {
582
+ // Filter sites by user provided position range
583
+ if ((chr == "") || ((start_pos == -1) && (end_pos==numeric_limits<int>::max())))
584
+ return;
585
+ printLOG("Filtering sites by chromosome and/or position\n");
586
+ string vcf_line;
587
+ string chrom; int pos1;
588
+ for (unsigned int s=0; s<N_entries; s++)
589
+ {
590
+ if (include_entry[s] == false)
591
+ continue;
592
+ //get_vcf_entry(s, vcf_line);
593
+ //vcf_entry e(N_indv, vcf_line);
594
+ //e.parse_basic_entry();
595
+ set_filepos(entry_file_locations[s]);
596
+ read_CHROM_and_POS_only(chrom, pos1);
597
+ if (chrom == chr)
598
+ {
599
+ if ((pos1 < start_pos) || (pos1 > end_pos))
600
+ include_entry[s] = false;
601
+ }
602
+ else
603
+ include_entry[s] = false;
604
+ }
605
+ }
606
+
607
+ void vcf_file::filter_sites_by_positions(const string &positions_file)
608
+ {
609
+ // Filter sites by a user defined file containing a list of positions
610
+ if (positions_file == "")
611
+ return;
612
+ printLOG("Filtering sites by Positions file\n");
613
+ ifstream BED(positions_file.c_str());
614
+ if (!BED.is_open())
615
+ error("Could not open Positions file: " + positions_file);
616
+
617
+ string chr;
618
+ int pos1;
619
+ int idx;
620
+ unsigned int N_chr=0;
621
+ map<string,int> chr_to_idx;
622
+ vector< set<int > > lims;
623
+ stringstream ss;
624
+ string line;
625
+ // Skip header
626
+ BED.ignore(numeric_limits<streamsize>::max(), '\n');
627
+ while (!BED.eof())
628
+ {
629
+ getline(BED, line);
630
+ if (line[0] == '#')
631
+ continue;
632
+
633
+ ss.clear();
634
+ ss.str(line);
635
+ ss >> chr >> pos1;
636
+
637
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
638
+ {
639
+ N_chr++;
640
+ chr_to_idx[chr] = (N_chr-1);
641
+ lims.resize(N_chr);
642
+ }
643
+
644
+ idx = chr_to_idx[chr];
645
+ lims[idx].insert(pos1);
646
+ }
647
+ BED.close();
648
+
649
+ string vcf_line;
650
+ for (unsigned int s=0; s<N_entries; s++)
651
+ {
652
+ if (include_entry[s] == false)
653
+ continue;
654
+ //get_vcf_entry(s, vcf_line);
655
+ //vcf_entry e(N_indv, vcf_line);
656
+ //e.parse_basic_entry();
657
+ //e.get_CHROM(chr);
658
+ set_filepos(entry_file_locations[s]);
659
+ read_CHROM_and_POS_only(chr, pos1);
660
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
661
+ include_entry[s] = false;
662
+ else
663
+ {
664
+ //pos1 = e.get_POS();
665
+ idx = chr_to_idx[chr];
666
+ bool found=false;
667
+
668
+ if (lims[idx].find(pos1) != lims[idx].end())
669
+ found = true;
670
+
671
+ if (found == false)
672
+ include_entry[s] = false;
673
+ }
674
+ }
675
+ }
676
+
677
+ void vcf_file::filter_sites_by_BED_file(const string &bed_file, bool BED_exclude)
678
+ {
679
+ // Filter sites depending on positions in a BED file.
680
+ if (bed_file == "")
681
+ return;
682
+ printLOG("Filtering sites by BED file\n");
683
+ ifstream BED(bed_file.c_str());
684
+ if (!BED.is_open())
685
+ error("Could not open BED file: " + bed_file);
686
+
687
+ string chr;
688
+ int pos1, pos2;
689
+ int idx;
690
+ unsigned int N_chr=0;
691
+ map<string,int> chr_to_idx;
692
+ vector< deque<pair<int,int> > > lims;
693
+ BED.ignore(numeric_limits<streamsize>::max(), '\n'); // Ignore header
694
+ while (!BED.eof())
695
+ {
696
+ BED >> chr >> pos1 >> pos2;
697
+ BED.ignore(numeric_limits<streamsize>::max(), '\n');
698
+
699
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
700
+ {
701
+ N_chr++;
702
+ chr_to_idx[chr] = (N_chr-1);
703
+ lims.resize(N_chr);
704
+ }
705
+
706
+ idx = chr_to_idx[chr];
707
+ lims[idx].push_back(make_pair(pos1,pos2));
708
+ }
709
+ BED.close();
710
+
711
+ for (unsigned int ui=0; ui<lims.size(); ui++)
712
+ sort(lims[ui].begin(), lims[ui].end());
713
+
714
+ pair<int,int> range;
715
+ string vcf_line;
716
+ vector<unsigned int> min_ui(lims.size(), 0);
717
+ for (unsigned int s=0; s<N_entries; s++)
718
+ {
719
+ if (include_entry[s] == false)
720
+ continue;
721
+ //get_vcf_entry(s, vcf_line);
722
+ //vcf_entry e(N_indv, vcf_line);
723
+ //e.parse_basic_entry();
724
+ //e.get_CHROM(chr);
725
+ set_filepos(entry_file_locations[s]);
726
+ read_CHROM_and_POS_only(chr, pos1);
727
+ if (BED_exclude == false)
728
+ { // Exclude sites not in BED file
729
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
730
+ include_entry[s] = false;
731
+ else
732
+ {
733
+ idx = chr_to_idx[chr];
734
+ bool found=false;
735
+ unsigned int max_ui = lims[idx].size();
736
+ for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
737
+ { // No need to start this loop at zero every time...
738
+ if ((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second))
739
+ {
740
+ found=true;
741
+ break;
742
+ }
743
+ else if (pos1 > lims[idx][ui].second)
744
+ min_ui[idx] = ui+1;
745
+ }
746
+ if (found == false)
747
+ include_entry[s] = false;
748
+ }
749
+ }
750
+ else
751
+ { // Exclude sites in BED file
752
+ if (chr_to_idx.find(chr) != chr_to_idx.end())
753
+ {
754
+ idx = chr_to_idx[chr];
755
+ bool found=false;
756
+ unsigned int max_ui = lims[idx].size();
757
+ for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
758
+ { // No need to start this loop at zero every time...
759
+ if ((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second))
760
+ {
761
+ found=true;
762
+ break;
763
+ }
764
+ else if (pos1 > lims[idx][ui].second)
765
+ min_ui[idx] = ui+1;
766
+ }
767
+ if (found == true)
768
+ include_entry[s] = false;
769
+ }
770
+ }
771
+ }
772
+ }
773
+
774
+ void vcf_file::filter_sites_by_mask(const string &mask_file, bool invert_mask, int min_kept_mask_value)
775
+ {
776
+ // Filter sites on the basis of a fasta-like mask file.
777
+ if (mask_file == "")
778
+ return;
779
+ if (invert_mask == false)
780
+ printLOG("Filtering sites by mask file\n");
781
+ else
782
+ printLOG("Filtering sites by inverted mask file\n");
783
+ ifstream mask(mask_file.c_str());
784
+ if (!mask.is_open())
785
+ error("Could not open mask file: " + mask_file);
786
+
787
+ string line;
788
+ string next_chr="", vcf_line;
789
+ unsigned int next_pos = 0;
790
+ unsigned int next_s = 0;
791
+
792
+ unsigned int current_pos = 1;
793
+ string current_header = "";
794
+ bool keep;
795
+ while (!mask.eof())
796
+ {
797
+ getline(mask, line);
798
+ line.erase( line.find_last_not_of(" \t") + 1);
799
+
800
+ if (line[0] == '>')
801
+ { // Header
802
+ current_header = line.substr(1, line.find_first_of(" \t")-1);
803
+ current_pos = 1;
804
+ for (unsigned int s=0; s<N_entries; s++)
805
+ {
806
+ if (include_entry[s] == true)
807
+ {
808
+ get_vcf_entry(s, vcf_line);
809
+ vcf_entry e(N_indv, vcf_line);
810
+ e.parse_basic_entry();
811
+ e.get_CHROM(next_chr);
812
+ if (next_chr == current_header)
813
+ {
814
+ next_pos = (unsigned)e.get_POS();
815
+ next_s = s;
816
+ break;
817
+ }
818
+ else
819
+ {
820
+ include_entry[s] = false;
821
+ }
822
+ }
823
+ }
824
+ }
825
+ else
826
+ {
827
+ if ((current_pos + line.size() >= next_pos) && (next_chr == current_header))
828
+ {
829
+ for (unsigned int ui=0; ui<line.size(); ui++)
830
+ {
831
+ if (current_pos + ui == next_pos)
832
+ {
833
+ char mask_base = line[ui]-48;
834
+ keep = (mask_base <= min_kept_mask_value);
835
+ if (invert_mask == true)
836
+ keep = !keep;
837
+
838
+ if (keep == false)
839
+ {
840
+ include_entry[next_s] = false;
841
+ }
842
+
843
+ next_s += 1;
844
+ for (unsigned int s=next_s; s<N_entries; s++)
845
+ {
846
+ if (include_entry[s] == true)
847
+ {
848
+ get_vcf_entry(s, vcf_line);
849
+ vcf_entry e(N_indv, vcf_line);
850
+ e.parse_basic_entry();
851
+ e.get_CHROM(next_chr);
852
+ next_pos = (unsigned)e.get_POS();
853
+ next_s = s;
854
+ break;
855
+ }
856
+ }
857
+ }
858
+ }
859
+ }
860
+ current_pos += line.size();
861
+ }
862
+ }
863
+ mask.close();
864
+
865
+ // Remaining sites aren't covered by mask, so exclude
866
+ for (unsigned int s=next_s; s<N_entries; s++)
867
+ {
868
+ include_entry[s] = false;
869
+ }
870
+ }
871
+
872
+
873
+ void vcf_file::filter_sites_by_number_of_alleles(int min_alleles, int max_alleles)
874
+ {
875
+ // Filter sites by the number of alleles (e.g. 2 for bi-allelic)
876
+ if ((min_alleles <= 0) && (max_alleles == numeric_limits<int>::max()))
877
+ return;
878
+ printLOG("Filtering sites by number of alleles\n");
879
+
880
+ int N_alleles;
881
+ string vcf_line;
882
+ for (unsigned int s=0; s<N_entries; s++)
883
+ {
884
+ if (include_entry[s] == false)
885
+ continue;
886
+
887
+ get_vcf_entry(s, vcf_line);
888
+ vcf_entry e(N_indv, vcf_line);
889
+ e.parse_basic_entry(true);
890
+ N_alleles = e.get_N_alleles();
891
+ if ((N_alleles < min_alleles) || (N_alleles > max_alleles))
892
+ {
893
+ include_entry[s] = false;
894
+ }
895
+ }
896
+ }
897
+
898
+ void vcf_file::filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate)
899
+ {
900
+ // Filter sites so that all allele frequencies are between limits
901
+ if ((min_maf <= 0.0) && (max_maf >= 1.0) && (min_site_call_rate <= 0) && (min_non_ref_af <= 0.0) && (max_non_ref_af >= 1.0))
902
+ return;
903
+
904
+ if (has_genotypes == false)
905
+ error("Require Genotypes in VCF file to filter by frequency and/or call rate");
906
+
907
+ printLOG("Filtering sites by allele frequency and call rate\n");
908
+
909
+ unsigned int N_alleles;
910
+ unsigned int N_non_missing_chr;
911
+
912
+ string vcf_line;
913
+ vcf_entry e(N_indv);
914
+ for (unsigned int s=0; s<N_entries; s++)
915
+ {
916
+ if (include_entry[s] == false)
917
+ continue;
918
+
919
+ get_vcf_entry(s, vcf_line);
920
+ e.reset(vcf_line);
921
+ e.parse_basic_entry(true);
922
+ e.parse_genotype_entries(true);
923
+ N_alleles = e.get_N_alleles();
924
+
925
+ vector<int> allele_counts;
926
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
927
+
928
+ double freq;
929
+ double maf=numeric_limits<double>::max();
930
+ for (unsigned int ui=0; ui<N_alleles; ui++)
931
+ {
932
+ freq = allele_counts[ui] / (double)N_non_missing_chr;
933
+ freq = min(freq, 1.0 - freq);
934
+
935
+ maf = min(maf, freq);
936
+ if ((ui > 0) && ((freq < min_non_ref_af) || (freq > max_non_ref_af)))
937
+ include_entry[s] = false;
938
+ }
939
+
940
+
941
+ if ((maf < min_maf) || (maf > max_maf))
942
+ include_entry[s] = false;
943
+
944
+ //unsigned int N_geno_included = e.get_N_chr();
945
+ double call_rate = N_non_missing_chr / double(e.get_N_chr(include_indv, include_genotype[s]));
946
+
947
+ if (call_rate < min_site_call_rate)
948
+ include_entry[s] = false;
949
+ }
950
+ }
951
+
952
+
953
+
954
+ void vcf_file::filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double max_missing_call_count)
955
+ {
956
+ if ((min_mac <= 0) && (max_mac == numeric_limits<int>::max()) &&
957
+ (min_non_ref_ac <= 0) && (max_non_ref_ac == numeric_limits<int>::max()) &&
958
+ (max_missing_call_count == numeric_limits<int>::max()))
959
+ return;
960
+
961
+ // Filter sites so that all allele counts are between limits
962
+ if (has_genotypes == false)
963
+ error("Require Genotypes in VCF file to filter by allele counts and/or missing data");
964
+
965
+ printLOG("Filtering sites by allele count and missing data\n");
966
+
967
+ unsigned int N_alleles, N_chr, N_non_missing_chr;
968
+
969
+ string vcf_line;
970
+ vcf_entry e(N_indv);
971
+ for (unsigned int s=0; s<N_entries; s++)
972
+ {
973
+ if (include_entry[s] == false)
974
+ continue;
975
+
976
+ get_vcf_entry(s, vcf_line);
977
+ e.reset(vcf_line);
978
+ e.parse_basic_entry(true);
979
+ e.parse_genotype_entries(true);
980
+ N_alleles = e.get_N_alleles();
981
+
982
+ vector<int> allele_counts;
983
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
984
+ N_chr = e.get_N_chr(include_indv, include_genotype[s]);
985
+
986
+ int mac = numeric_limits<int>::max();
987
+ for (unsigned int ui=0; ui<N_alleles; ui++)
988
+ {
989
+ mac = min(allele_counts[ui], mac);
990
+ if ((ui > 0) && ((allele_counts[ui] < min_non_ref_ac) || (allele_counts[ui] > max_non_ref_ac)))
991
+ include_entry[s] = false;
992
+ }
993
+
994
+ if ((mac < min_mac) || (mac > max_mac))
995
+ include_entry[s] = false;
996
+
997
+ if ((N_chr-N_non_missing_chr) > max_missing_call_count)
998
+ include_entry[s] = false;
999
+ }
1000
+ }
1001
+
1002
+
1003
+ void vcf_file::filter_sites_by_HWE_pvalue(double min_HWE_pvalue)
1004
+ {
1005
+ // Filter sites by HWE p-value
1006
+ if (min_HWE_pvalue <= 0)
1007
+ return;
1008
+
1009
+ if (has_genotypes == false)
1010
+ error("Require Genotypes in VCF file to filter sites by HWE.");
1011
+
1012
+ // Note this assumes Biallelic SNPs.
1013
+ printLOG("Filtering sites by HWE p-value (only including bi-allelic sites)\n");
1014
+
1015
+ unsigned int b11, b12, b22;
1016
+ double p;
1017
+ string vcf_line;
1018
+ for (unsigned int s=0; s<N_entries; s++)
1019
+ {
1020
+ if (include_entry[s] == false)
1021
+ continue;
1022
+
1023
+ get_vcf_entry(s, vcf_line);
1024
+ vcf_entry e(N_indv, vcf_line);
1025
+
1026
+ e.parse_basic_entry(true);
1027
+ e.parse_genotype_entries(true);
1028
+
1029
+ e.get_genotype_counts(include_indv, include_genotype[s], b11, b12, b22);
1030
+ p = vcf_entry::SNPHWE(b12, b11, b22);
1031
+
1032
+ if (p < min_HWE_pvalue)
1033
+ include_entry[s] = false;
1034
+ }
1035
+ }
1036
+
1037
+ void vcf_file::filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all)
1038
+ {
1039
+ // Filter sites by entries in the FILTER field.
1040
+ if ((remove_all == false) && (filter_flags_to_remove.size() == 0) && (filter_flags_to_keep.size() == 0))
1041
+ return;
1042
+
1043
+ printLOG("Filtering sites by FILTER Status.\n");
1044
+
1045
+ vector<string> FILTERs;
1046
+ string vcf_line;
1047
+ unsigned int N_to_remove = filter_flags_to_remove.size();
1048
+ unsigned int N_to_keep = filter_flags_to_keep.size();
1049
+ for (unsigned int s=0; s<N_entries; s++)
1050
+ {
1051
+ if (include_entry[s] == false)
1052
+ continue;
1053
+
1054
+ get_vcf_entry(s, vcf_line);
1055
+ vcf_entry e(N_indv, vcf_line);
1056
+
1057
+ e.parse_basic_entry(false, true);
1058
+
1059
+ e.get_FILTER_vector(FILTERs);
1060
+
1061
+ if (N_to_keep > 0)
1062
+ {
1063
+ bool keep = false;
1064
+ for (unsigned int ui=0; ui<FILTERs.size(); ui++)
1065
+ if (filter_flags_to_keep.find(FILTERs[ui]) != filter_flags_to_keep.end())
1066
+ {
1067
+ keep = true; break;
1068
+ }
1069
+
1070
+ include_entry[s] = keep;
1071
+ }
1072
+
1073
+ if (include_entry[s]==false)
1074
+ continue;
1075
+
1076
+ if ((remove_all == true) && (FILTERs.size() > 0))
1077
+ include_entry[s] = false;
1078
+ else if (N_to_remove > 0)
1079
+ {
1080
+ for (unsigned int ui=0; ui<FILTERs.size(); ui++)
1081
+ if (filter_flags_to_remove.find(FILTERs[ui]) != filter_flags_to_remove.end())
1082
+ include_entry[s] = false;
1083
+ }
1084
+ }
1085
+ }
1086
+
1087
+ void vcf_file::filter_sites_by_phase()
1088
+ {
1089
+ // Filter out sites with unphased entries
1090
+ // TODO: Alter this to allow for a max/min level of unphased-ness.
1091
+ printLOG("Filtering Sites with Unphased Genotypes\n");
1092
+ string vcf_line;
1093
+ vcf_entry e(N_indv);
1094
+
1095
+ for (unsigned int s=0; s<N_entries; s++)
1096
+ {
1097
+ if (include_entry[s] == false)
1098
+ continue;
1099
+
1100
+ unsigned int count = 0;
1101
+ unsigned int count_unphased = 0;
1102
+ get_vcf_entry(s, vcf_line);
1103
+ e.reset(vcf_line);
1104
+
1105
+ for (unsigned int ui=0; ui<N_indv; ui++)
1106
+ {
1107
+ if (include_indv[ui] == false)
1108
+ continue;
1109
+
1110
+ e.parse_genotype_entry(ui, true);
1111
+
1112
+ count++;
1113
+ if (e.get_indv_PHASE(ui) != '|')
1114
+ count_unphased++;
1115
+ }
1116
+
1117
+ if (count_unphased > 0)
1118
+ include_entry[s] = false;
1119
+ }
1120
+ }
1121
+
1122
+ void vcf_file::filter_sites_by_thinning(int min_SNP_distance)
1123
+ {
1124
+ // Filter sites so that no two SNPs are within some minimum distance
1125
+ if (min_SNP_distance < 1)
1126
+ return;
1127
+ printLOG("Filtering sites so that no two sites are within " + int2str(min_SNP_distance) + "bp\n");
1128
+
1129
+ string vcf_line;
1130
+ vcf_entry e(N_indv);
1131
+ map<string, int> CHROM_to_idx;
1132
+ string CHROM, last_CHROM="";
1133
+ int POS, last_POS = -1;
1134
+ int distance_from_last_SNP;
1135
+
1136
+ for (unsigned int s=0; s<N_entries; s++)
1137
+ {
1138
+ if (include_entry[s] == false)
1139
+ continue;
1140
+
1141
+ //get_vcf_entry(s, vcf_line);
1142
+ //e.reset(vcf_line);
1143
+ //e.parse_basic_entry();
1144
+
1145
+ //CHROM = e.get_CHROM();
1146
+ //POS = e.get_POS();
1147
+ set_filepos(entry_file_locations[s]);
1148
+ read_CHROM_and_POS_only(CHROM, POS);
1149
+ if (CHROM == last_CHROM)
1150
+ {
1151
+ distance_from_last_SNP = POS - last_POS;
1152
+ if (distance_from_last_SNP < min_SNP_distance)
1153
+ include_entry[s] = false;
1154
+ }
1155
+ if (include_entry[s] == true)
1156
+ last_POS = POS;
1157
+ last_CHROM = CHROM;
1158
+ }
1159
+ }
1160
+
1161
+
1162
+ void vcf_file::filter_sites_by_INFO_flags(const set<string> &flags_to_remove, const set<string> &flags_to_keep)
1163
+ {
1164
+ // Filter sites by entries in the INFO field.
1165
+ if ((flags_to_remove.size() == 0) && (flags_to_keep.size() == 0))
1166
+ return;
1167
+
1168
+ printLOG("Filtering sites by INFO flags.\n");
1169
+
1170
+ vector<string> INFOs;
1171
+ string vcf_line;
1172
+ string value;
1173
+ unsigned int N_to_remove = flags_to_remove.size();
1174
+ unsigned int N_to_keep = flags_to_keep.size();
1175
+ for (unsigned int s=0; s<N_entries; s++)
1176
+ {
1177
+ if (include_entry[s] == false)
1178
+ continue;
1179
+
1180
+ get_vcf_entry(s, vcf_line);
1181
+ vcf_entry e(N_indv, vcf_line);
1182
+
1183
+ e.parse_basic_entry(false, false, true);
1184
+
1185
+ if (N_to_keep > 0)
1186
+ {
1187
+ bool keep = false;
1188
+ for (set<string>::iterator it=flags_to_keep.begin(); it != flags_to_keep.end(); ++it)
1189
+ {
1190
+ value = e.get_INFO_value(*it);
1191
+ if (value == "1")
1192
+ keep = true;
1193
+ }
1194
+
1195
+ include_entry[s] = keep;
1196
+ }
1197
+
1198
+ if (include_entry[s]==false)
1199
+ continue;
1200
+
1201
+ if (N_to_remove > 0)
1202
+ {
1203
+ for (set<string>::iterator it=flags_to_remove.begin(); it != flags_to_remove.end(); ++it)
1204
+ {
1205
+ value = e.get_INFO_value(*it);
1206
+ if (value == "1")
1207
+ {
1208
+ include_entry[s] = false;
1209
+ continue;
1210
+ }
1211
+ }
1212
+ }
1213
+ }
1214
+ }
1215
+