ngs_server 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. data/bin/ngs_server +72 -50
  2. data/ext/bamtools/extconf.rb +3 -3
  3. data/ext/vcftools/Makefile +28 -0
  4. data/ext/vcftools/README.txt +36 -0
  5. data/ext/vcftools/cpp/.svn/all-wcprops +125 -0
  6. data/ext/vcftools/cpp/.svn/dir-prop-base +6 -0
  7. data/ext/vcftools/cpp/.svn/entries +708 -0
  8. data/ext/vcftools/cpp/.svn/text-base/Makefile.svn-base +46 -0
  9. data/ext/vcftools/cpp/.svn/text-base/dgeev.cpp.svn-base +146 -0
  10. data/ext/vcftools/cpp/.svn/text-base/dgeev.h.svn-base +43 -0
  11. data/ext/vcftools/cpp/.svn/text-base/output_log.cpp.svn-base +79 -0
  12. data/ext/vcftools/cpp/.svn/text-base/output_log.h.svn-base +34 -0
  13. data/ext/vcftools/cpp/.svn/text-base/parameters.cpp.svn-base +535 -0
  14. data/ext/vcftools/cpp/.svn/text-base/parameters.h.svn-base +154 -0
  15. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.cpp.svn-base +497 -0
  16. data/ext/vcftools/cpp/.svn/text-base/vcf_entry.h.svn-base +190 -0
  17. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_getters.cpp.svn-base +421 -0
  18. data/ext/vcftools/cpp/.svn/text-base/vcf_entry_setters.cpp.svn-base +482 -0
  19. data/ext/vcftools/cpp/.svn/text-base/vcf_file.cpp.svn-base +495 -0
  20. data/ext/vcftools/cpp/.svn/text-base/vcf_file.h.svn-base +184 -0
  21. data/ext/vcftools/cpp/.svn/text-base/vcf_file_diff.cpp.svn-base +1282 -0
  22. data/ext/vcftools/cpp/.svn/text-base/vcf_file_filters.cpp.svn-base +1215 -0
  23. data/ext/vcftools/cpp/.svn/text-base/vcf_file_format_convert.cpp.svn-base +1138 -0
  24. data/ext/vcftools/cpp/.svn/text-base/vcf_file_index.cpp.svn-base +171 -0
  25. data/ext/vcftools/cpp/.svn/text-base/vcf_file_output.cpp.svn-base +3012 -0
  26. data/ext/vcftools/cpp/.svn/text-base/vcftools.cpp.svn-base +107 -0
  27. data/ext/vcftools/cpp/.svn/text-base/vcftools.h.svn-base +25 -0
  28. data/ext/vcftools/cpp/Makefile +46 -0
  29. data/ext/vcftools/cpp/dgeev.cpp +146 -0
  30. data/ext/vcftools/cpp/dgeev.h +43 -0
  31. data/ext/vcftools/cpp/output_log.cpp +79 -0
  32. data/ext/vcftools/cpp/output_log.h +34 -0
  33. data/ext/vcftools/cpp/parameters.cpp +535 -0
  34. data/ext/vcftools/cpp/parameters.h +154 -0
  35. data/ext/vcftools/cpp/vcf_entry.cpp +497 -0
  36. data/ext/vcftools/cpp/vcf_entry.h +190 -0
  37. data/ext/vcftools/cpp/vcf_entry_getters.cpp +421 -0
  38. data/ext/vcftools/cpp/vcf_entry_setters.cpp +482 -0
  39. data/ext/vcftools/cpp/vcf_file.cpp +495 -0
  40. data/ext/vcftools/cpp/vcf_file.h +184 -0
  41. data/ext/vcftools/cpp/vcf_file_diff.cpp +1282 -0
  42. data/ext/vcftools/cpp/vcf_file_filters.cpp +1215 -0
  43. data/ext/vcftools/cpp/vcf_file_format_convert.cpp +1138 -0
  44. data/ext/vcftools/cpp/vcf_file_index.cpp +171 -0
  45. data/ext/vcftools/cpp/vcf_file_output.cpp +3012 -0
  46. data/ext/vcftools/cpp/vcftools.cpp +107 -0
  47. data/ext/vcftools/cpp/vcftools.h +25 -0
  48. data/ext/vcftools/examples/.svn/all-wcprops +185 -0
  49. data/ext/vcftools/examples/.svn/dir-prop-base +6 -0
  50. data/ext/vcftools/examples/.svn/entries +1048 -0
  51. data/ext/vcftools/examples/.svn/prop-base/perl-api-1.pl.svn-base +5 -0
  52. data/ext/vcftools/examples/.svn/text-base/annotate-test.vcf.svn-base +37 -0
  53. data/ext/vcftools/examples/.svn/text-base/annotate.out.svn-base +23 -0
  54. data/ext/vcftools/examples/.svn/text-base/annotate.txt.svn-base +7 -0
  55. data/ext/vcftools/examples/.svn/text-base/annotate2.out.svn-base +52 -0
  56. data/ext/vcftools/examples/.svn/text-base/annotate3.out.svn-base +23 -0
  57. data/ext/vcftools/examples/.svn/text-base/cmp-test-a-3.3.vcf.svn-base +12 -0
  58. data/ext/vcftools/examples/.svn/text-base/cmp-test-a.vcf.svn-base +12 -0
  59. data/ext/vcftools/examples/.svn/text-base/cmp-test-b-3.3.vcf.svn-base +12 -0
  60. data/ext/vcftools/examples/.svn/text-base/cmp-test-b.vcf.svn-base +12 -0
  61. data/ext/vcftools/examples/.svn/text-base/cmp-test.out.svn-base +53 -0
  62. data/ext/vcftools/examples/.svn/text-base/concat-a.vcf.svn-base +21 -0
  63. data/ext/vcftools/examples/.svn/text-base/concat-b.vcf.svn-base +13 -0
  64. data/ext/vcftools/examples/.svn/text-base/concat-c.vcf.svn-base +19 -0
  65. data/ext/vcftools/examples/.svn/text-base/concat.out.svn-base +39 -0
  66. data/ext/vcftools/examples/.svn/text-base/invalid-4.0.vcf.svn-base +31 -0
  67. data/ext/vcftools/examples/.svn/text-base/isec-n2-test.vcf.out.svn-base +19 -0
  68. data/ext/vcftools/examples/.svn/text-base/merge-test-a.vcf.svn-base +17 -0
  69. data/ext/vcftools/examples/.svn/text-base/merge-test-b.vcf.svn-base +17 -0
  70. data/ext/vcftools/examples/.svn/text-base/merge-test-c.vcf.svn-base +15 -0
  71. data/ext/vcftools/examples/.svn/text-base/merge-test.vcf.out.svn-base +31 -0
  72. data/ext/vcftools/examples/.svn/text-base/perl-api-1.pl.svn-base +46 -0
  73. data/ext/vcftools/examples/.svn/text-base/query-test.out.svn-base +6 -0
  74. data/ext/vcftools/examples/.svn/text-base/shuffle-test.vcf.svn-base +12 -0
  75. data/ext/vcftools/examples/.svn/text-base/subset.SNPs.out.svn-base +10 -0
  76. data/ext/vcftools/examples/.svn/text-base/subset.indels.out.svn-base +18 -0
  77. data/ext/vcftools/examples/.svn/text-base/subset.vcf.svn-base +21 -0
  78. data/ext/vcftools/examples/.svn/text-base/valid-3.3.vcf.svn-base +30 -0
  79. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.stats.svn-base +104 -0
  80. data/ext/vcftools/examples/.svn/text-base/valid-4.0.vcf.svn-base +34 -0
  81. data/ext/vcftools/examples/.svn/text-base/valid-4.1.vcf.svn-base +37 -0
  82. data/ext/vcftools/examples/annotate-test.vcf +37 -0
  83. data/ext/vcftools/examples/annotate.out +23 -0
  84. data/ext/vcftools/examples/annotate.txt +7 -0
  85. data/ext/vcftools/examples/annotate2.out +52 -0
  86. data/ext/vcftools/examples/annotate3.out +23 -0
  87. data/ext/vcftools/examples/cmp-test-a-3.3.vcf +12 -0
  88. data/ext/vcftools/examples/cmp-test-a.vcf +12 -0
  89. data/ext/vcftools/examples/cmp-test-b-3.3.vcf +12 -0
  90. data/ext/vcftools/examples/cmp-test-b.vcf +12 -0
  91. data/ext/vcftools/examples/cmp-test.out +53 -0
  92. data/ext/vcftools/examples/concat-a.vcf +21 -0
  93. data/ext/vcftools/examples/concat-b.vcf +13 -0
  94. data/ext/vcftools/examples/concat-c.vcf +19 -0
  95. data/ext/vcftools/examples/concat.out +39 -0
  96. data/ext/vcftools/examples/invalid-4.0.vcf +31 -0
  97. data/ext/vcftools/examples/isec-n2-test.vcf.out +19 -0
  98. data/ext/vcftools/examples/merge-test-a.vcf +17 -0
  99. data/ext/vcftools/examples/merge-test-b.vcf +17 -0
  100. data/ext/vcftools/examples/merge-test-c.vcf +15 -0
  101. data/ext/vcftools/examples/merge-test.vcf.out +31 -0
  102. data/ext/vcftools/examples/perl-api-1.pl +46 -0
  103. data/ext/vcftools/examples/query-test.out +6 -0
  104. data/ext/vcftools/examples/shuffle-test.vcf +12 -0
  105. data/ext/vcftools/examples/subset.SNPs.out +10 -0
  106. data/ext/vcftools/examples/subset.indels.out +18 -0
  107. data/ext/vcftools/examples/subset.vcf +21 -0
  108. data/ext/vcftools/examples/valid-3.3.vcf +30 -0
  109. data/ext/vcftools/examples/valid-4.0.vcf +34 -0
  110. data/ext/vcftools/examples/valid-4.0.vcf.stats +104 -0
  111. data/ext/vcftools/examples/valid-4.1.vcf +37 -0
  112. data/ext/vcftools/extconf.rb +2 -0
  113. data/ext/vcftools/perl/.svn/all-wcprops +149 -0
  114. data/ext/vcftools/perl/.svn/entries +844 -0
  115. data/ext/vcftools/perl/.svn/prop-base/fill-aa.svn-base +5 -0
  116. data/ext/vcftools/perl/.svn/prop-base/fill-an-ac.svn-base +5 -0
  117. data/ext/vcftools/perl/.svn/prop-base/fill-ref-md5.svn-base +5 -0
  118. data/ext/vcftools/perl/.svn/prop-base/tab-to-vcf.svn-base +5 -0
  119. data/ext/vcftools/perl/.svn/prop-base/test.t.svn-base +5 -0
  120. data/ext/vcftools/perl/.svn/prop-base/vcf-annotate.svn-base +5 -0
  121. data/ext/vcftools/perl/.svn/prop-base/vcf-compare.svn-base +5 -0
  122. data/ext/vcftools/perl/.svn/prop-base/vcf-concat.svn-base +5 -0
  123. data/ext/vcftools/perl/.svn/prop-base/vcf-convert.svn-base +5 -0
  124. data/ext/vcftools/perl/.svn/prop-base/vcf-fix-newlines.svn-base +5 -0
  125. data/ext/vcftools/perl/.svn/prop-base/vcf-isec.svn-base +5 -0
  126. data/ext/vcftools/perl/.svn/prop-base/vcf-merge.svn-base +5 -0
  127. data/ext/vcftools/perl/.svn/prop-base/vcf-query.svn-base +5 -0
  128. data/ext/vcftools/perl/.svn/prop-base/vcf-shuffle-cols.svn-base +5 -0
  129. data/ext/vcftools/perl/.svn/prop-base/vcf-sort.svn-base +5 -0
  130. data/ext/vcftools/perl/.svn/prop-base/vcf-stats.svn-base +5 -0
  131. data/ext/vcftools/perl/.svn/prop-base/vcf-subset.svn-base +5 -0
  132. data/ext/vcftools/perl/.svn/prop-base/vcf-to-tab.svn-base +5 -0
  133. data/ext/vcftools/perl/.svn/prop-base/vcf-validator.svn-base +5 -0
  134. data/ext/vcftools/perl/.svn/text-base/ChangeLog.svn-base +84 -0
  135. data/ext/vcftools/perl/.svn/text-base/FaSlice.pm.svn-base +214 -0
  136. data/ext/vcftools/perl/.svn/text-base/Makefile.svn-base +12 -0
  137. data/ext/vcftools/perl/.svn/text-base/Vcf.pm.svn-base +2853 -0
  138. data/ext/vcftools/perl/.svn/text-base/VcfStats.pm.svn-base +681 -0
  139. data/ext/vcftools/perl/.svn/text-base/fill-aa.svn-base +103 -0
  140. data/ext/vcftools/perl/.svn/text-base/fill-an-ac.svn-base +56 -0
  141. data/ext/vcftools/perl/.svn/text-base/fill-ref-md5.svn-base +204 -0
  142. data/ext/vcftools/perl/.svn/text-base/tab-to-vcf.svn-base +92 -0
  143. data/ext/vcftools/perl/.svn/text-base/test.t.svn-base +376 -0
  144. data/ext/vcftools/perl/.svn/text-base/vcf-annotate.svn-base +1099 -0
  145. data/ext/vcftools/perl/.svn/text-base/vcf-compare.svn-base +1193 -0
  146. data/ext/vcftools/perl/.svn/text-base/vcf-concat.svn-base +310 -0
  147. data/ext/vcftools/perl/.svn/text-base/vcf-convert.svn-base +180 -0
  148. data/ext/vcftools/perl/.svn/text-base/vcf-fix-newlines.svn-base +97 -0
  149. data/ext/vcftools/perl/.svn/text-base/vcf-isec.svn-base +660 -0
  150. data/ext/vcftools/perl/.svn/text-base/vcf-merge.svn-base +577 -0
  151. data/ext/vcftools/perl/.svn/text-base/vcf-query.svn-base +272 -0
  152. data/ext/vcftools/perl/.svn/text-base/vcf-shuffle-cols.svn-base +89 -0
  153. data/ext/vcftools/perl/.svn/text-base/vcf-sort.svn-base +79 -0
  154. data/ext/vcftools/perl/.svn/text-base/vcf-stats.svn-base +160 -0
  155. data/ext/vcftools/perl/.svn/text-base/vcf-subset.svn-base +206 -0
  156. data/ext/vcftools/perl/.svn/text-base/vcf-to-tab.svn-base +112 -0
  157. data/ext/vcftools/perl/.svn/text-base/vcf-validator.svn-base +145 -0
  158. data/ext/vcftools/perl/ChangeLog +84 -0
  159. data/ext/vcftools/perl/FaSlice.pm +214 -0
  160. data/ext/vcftools/perl/Makefile +12 -0
  161. data/ext/vcftools/perl/Vcf.pm +2853 -0
  162. data/ext/vcftools/perl/VcfStats.pm +681 -0
  163. data/ext/vcftools/perl/fill-aa +103 -0
  164. data/ext/vcftools/perl/fill-an-ac +56 -0
  165. data/ext/vcftools/perl/fill-ref-md5 +204 -0
  166. data/ext/vcftools/perl/tab-to-vcf +92 -0
  167. data/ext/vcftools/perl/test.t +376 -0
  168. data/ext/vcftools/perl/vcf-annotate +1099 -0
  169. data/ext/vcftools/perl/vcf-compare +1193 -0
  170. data/ext/vcftools/perl/vcf-concat +310 -0
  171. data/ext/vcftools/perl/vcf-convert +180 -0
  172. data/ext/vcftools/perl/vcf-fix-newlines +97 -0
  173. data/ext/vcftools/perl/vcf-isec +660 -0
  174. data/ext/vcftools/perl/vcf-merge +577 -0
  175. data/ext/vcftools/perl/vcf-query +286 -0
  176. data/ext/vcftools/perl/vcf-shuffle-cols +89 -0
  177. data/ext/vcftools/perl/vcf-sort +79 -0
  178. data/ext/vcftools/perl/vcf-stats +160 -0
  179. data/ext/vcftools/perl/vcf-subset +206 -0
  180. data/ext/vcftools/perl/vcf-to-tab +112 -0
  181. data/ext/vcftools/perl/vcf-validator +145 -0
  182. data/ext/vcftools/website/.svn/all-wcprops +41 -0
  183. data/ext/vcftools/website/.svn/entries +238 -0
  184. data/ext/vcftools/website/.svn/prop-base/VCF-poster.pdf.svn-base +5 -0
  185. data/ext/vcftools/website/.svn/prop-base/favicon.ico.svn-base +5 -0
  186. data/ext/vcftools/website/.svn/prop-base/favicon.png.svn-base +5 -0
  187. data/ext/vcftools/website/.svn/text-base/Makefile.svn-base +6 -0
  188. data/ext/vcftools/website/.svn/text-base/README.svn-base +2 -0
  189. data/ext/vcftools/website/.svn/text-base/VCF-poster.pdf.svn-base +0 -0
  190. data/ext/vcftools/website/.svn/text-base/default.css.svn-base +250 -0
  191. data/ext/vcftools/website/.svn/text-base/favicon.ico.svn-base +0 -0
  192. data/ext/vcftools/website/.svn/text-base/favicon.png.svn-base +0 -0
  193. data/ext/vcftools/website/Makefile +6 -0
  194. data/ext/vcftools/website/README +2 -0
  195. data/ext/vcftools/website/VCF-poster.pdf +0 -0
  196. data/ext/vcftools/website/default.css +250 -0
  197. data/ext/vcftools/website/favicon.ico +0 -0
  198. data/ext/vcftools/website/favicon.png +0 -0
  199. data/ext/vcftools/website/img/.svn/all-wcprops +53 -0
  200. data/ext/vcftools/website/img/.svn/entries +300 -0
  201. data/ext/vcftools/website/img/.svn/prop-base/bg.gif.svn-base +5 -0
  202. data/ext/vcftools/website/img/.svn/prop-base/bgcode.gif.svn-base +5 -0
  203. data/ext/vcftools/website/img/.svn/prop-base/bgcontainer.gif.svn-base +5 -0
  204. data/ext/vcftools/website/img/.svn/prop-base/bgul.gif.svn-base +5 -0
  205. data/ext/vcftools/website/img/.svn/prop-base/header.gif.svn-base +5 -0
  206. data/ext/vcftools/website/img/.svn/prop-base/li.gif.svn-base +5 -0
  207. data/ext/vcftools/website/img/.svn/prop-base/quote.gif.svn-base +5 -0
  208. data/ext/vcftools/website/img/.svn/prop-base/search.gif.svn-base +5 -0
  209. data/ext/vcftools/website/img/.svn/text-base/bg.gif.svn-base +0 -0
  210. data/ext/vcftools/website/img/.svn/text-base/bgcode.gif.svn-base +0 -0
  211. data/ext/vcftools/website/img/.svn/text-base/bgcontainer.gif.svn-base +0 -0
  212. data/ext/vcftools/website/img/.svn/text-base/bgul.gif.svn-base +0 -0
  213. data/ext/vcftools/website/img/.svn/text-base/header.gif.svn-base +0 -0
  214. data/ext/vcftools/website/img/.svn/text-base/li.gif.svn-base +0 -0
  215. data/ext/vcftools/website/img/.svn/text-base/quote.gif.svn-base +0 -0
  216. data/ext/vcftools/website/img/.svn/text-base/search.gif.svn-base +0 -0
  217. data/ext/vcftools/website/img/bg.gif +0 -0
  218. data/ext/vcftools/website/img/bgcode.gif +0 -0
  219. data/ext/vcftools/website/img/bgcontainer.gif +0 -0
  220. data/ext/vcftools/website/img/bgul.gif +0 -0
  221. data/ext/vcftools/website/img/header.gif +0 -0
  222. data/ext/vcftools/website/img/li.gif +0 -0
  223. data/ext/vcftools/website/img/quote.gif +0 -0
  224. data/ext/vcftools/website/img/search.gif +0 -0
  225. data/ext/vcftools/website/src/.svn/all-wcprops +53 -0
  226. data/ext/vcftools/website/src/.svn/entries +300 -0
  227. data/ext/vcftools/website/src/.svn/text-base/docs.inc.svn-base +202 -0
  228. data/ext/vcftools/website/src/.svn/text-base/index.inc.svn-base +52 -0
  229. data/ext/vcftools/website/src/.svn/text-base/index.php.svn-base +80 -0
  230. data/ext/vcftools/website/src/.svn/text-base/license.inc.svn-base +27 -0
  231. data/ext/vcftools/website/src/.svn/text-base/links.inc.svn-base +13 -0
  232. data/ext/vcftools/website/src/.svn/text-base/options.inc.svn-base +654 -0
  233. data/ext/vcftools/website/src/.svn/text-base/perl_module.inc.svn-base +249 -0
  234. data/ext/vcftools/website/src/.svn/text-base/specs.inc.svn-base +18 -0
  235. data/ext/vcftools/website/src/docs.inc +202 -0
  236. data/ext/vcftools/website/src/index.inc +52 -0
  237. data/ext/vcftools/website/src/index.php +80 -0
  238. data/ext/vcftools/website/src/license.inc +27 -0
  239. data/ext/vcftools/website/src/links.inc +13 -0
  240. data/ext/vcftools/website/src/options.inc +654 -0
  241. data/ext/vcftools/website/src/perl_module.inc +249 -0
  242. data/ext/vcftools/website/src/specs.inc +18 -0
  243. data/lib/config.ru +9 -0
  244. data/lib/ngs_server/add.rb +9 -0
  245. data/lib/ngs_server/version.rb +1 -1
  246. data/lib/ngs_server.rb +55 -3
  247. data/ngs_server.gemspec +5 -2
  248. metadata +296 -6
@@ -0,0 +1,1215 @@
1
+ /*
2
+ * vcf_file_filters.cpp
3
+ *
4
+ * Created on: Aug 28, 2009
5
+ * Author: Adam Auton
6
+ * ($Revision: 148 $)
7
+ */
8
+
9
+ #include "vcf_file.h"
10
+
11
+ void vcf_file::apply_filters(const parameters &params)
12
+ {
13
+ printLOG("Applying Required Filters.\n");
14
+ // Apply all filters in turn.
15
+ filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_file, params.indv_exclude_file);
16
+ filter_sites(params.snps_to_keep, params.snps_to_keep_file, params.snps_to_exclude_file);
17
+ filter_sites_by_filter_status(params.site_filter_flags_to_exclude, params.site_filter_flags_to_keep, params.remove_all_filtered_sites);
18
+ filter_sites_by_position(params.chr_to_keep, params.start_pos, params.end_pos);
19
+ filter_sites_by_positions(params.positions_file);
20
+ filter_sites_by_BED_file(params.BED_file, params.BED_exclude);
21
+ filter_sites_by_number_of_alleles(params.min_alleles, params.max_alleles);
22
+ filter_sites_by_INFO_flags(params.site_INFO_flags_to_remove, params.site_INFO_flags_to_keep);
23
+ filter_sites_by_quality(params.min_quality);
24
+ filter_sites_by_mean_depth(params.min_mean_depth, params.max_mean_depth);
25
+ filter_sites_by_mask(params.mask_file, params.invert_mask, params.min_kept_mask_value);
26
+ filter_individuals_by_mean_depth(params.min_indv_mean_depth, params.max_indv_mean_depth);
27
+ if (params.phased_only == true)
28
+ {
29
+ filter_individuals_by_phase();
30
+ filter_sites_by_phase();
31
+ }
32
+ filter_genotypes_by_quality(params.min_genotype_quality);
33
+ filter_genotypes_by_depth(params.min_genotype_depth, params.max_genotype_depth);
34
+ filter_genotypes_by_filter_flag(params.geno_filter_flags_to_exclude, params.remove_all_filtered_genotypes);
35
+ filter_individuals_by_call_rate(params.min_indv_call_rate);
36
+ filter_individuals_randomly(params.max_N_indv);
37
+ filter_sites_by_frequency_and_call_rate(params.min_maf, params.max_maf, params.min_non_ref_af, params.max_non_ref_af, params.min_site_call_rate);
38
+ filter_sites_by_allele_count(params.min_mac, params.max_mac, params.min_non_ref_ac, params.max_non_ref_ac, params.max_missing_call_count);
39
+ filter_sites_by_HWE_pvalue(params.min_HWE_pvalue);
40
+ filter_sites_by_thinning(params.min_interSNP_distance);
41
+ }
42
+
43
+ void vcf_file::filter_genotypes_by_quality(double min_genotype_quality)
44
+ {
45
+ // Filter genotypes by quality
46
+ if ((min_genotype_quality <= 0) || (has_genotypes == false))
47
+ return;
48
+
49
+ if (has_genotypes == false)
50
+ error("Require Genotypes in VCF file in order to filter genotypes by Quality.");
51
+
52
+ printLOG("Filtering out Genotypes with Quality less than " + dbl2str(min_genotype_quality,0) + "\n");
53
+ string vcf_line;
54
+ vcf_entry e(N_indv);
55
+ for (unsigned int s=0; s<N_entries; s++)
56
+ {
57
+ if (include_entry[s] == false)
58
+ continue;
59
+
60
+ get_vcf_entry(s, vcf_line);
61
+ e.reset(vcf_line);
62
+ e.parse_genotype_entries(false, true);
63
+ e.filter_genotypes_by_quality(include_genotype[s], min_genotype_quality);
64
+ }
65
+ }
66
+
67
+ void vcf_file::filter_genotypes_by_depth(int min_depth, int max_depth)
68
+ {
69
+ // Filter genotypes by depth
70
+ if ((min_depth <= 0) && (max_depth == numeric_limits<int>::max()))
71
+ return;
72
+ if (has_genotypes == false)
73
+ error("Require Genotypes in VCF file in order to filter genotypes by Depth.");
74
+
75
+ printLOG("Filtering out Genotypes with Depth less than " + dbl2str(min_depth,0) + " and greater than " + dbl2str(max_depth, 0) + "\n");
76
+ string vcf_line;
77
+ vcf_entry e(N_indv);
78
+ for (unsigned int s=0; s<N_entries; s++)
79
+ {
80
+ if (include_entry[s] == false)
81
+ continue;
82
+
83
+ get_vcf_entry(s, vcf_line);
84
+ e.reset(vcf_line);
85
+ e.parse_genotype_entries(false, false, true);
86
+ e.filter_genotypes_by_depth(include_genotype[s], min_depth, max_depth);
87
+ }
88
+ }
89
+
90
+ void vcf_file::filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all)
91
+ {
92
+ // Filter genotypes by Filter Flags
93
+ if ((remove_all == false) && (filter_flags_to_remove.size() == 0))
94
+ return;
95
+ if (remove_all == true)
96
+ printLOG("Filtering out all genotypes with FILTER flag.\n");
97
+ else
98
+ printLOG("Filtering out genotypes by Filter Status.\n");
99
+
100
+ if (has_genotypes == false)
101
+ error("Require Genotypes in VCF file in order to filter genotypes by Filter Flag.");
102
+
103
+ string vcf_line;
104
+ vcf_entry e(N_indv);
105
+ for (unsigned int s=0; s<N_entries; s++)
106
+ {
107
+ if (include_entry[s] == false)
108
+ continue;
109
+
110
+ get_vcf_entry(s, vcf_line);
111
+ e.reset(vcf_line);
112
+ e.parse_genotype_entries(false, false, false, true);
113
+ e.filter_genotypes_by_filter_status(include_genotype[s], filter_flags_to_remove, remove_all);
114
+ }
115
+ }
116
+
117
+
118
+ void vcf_file::filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude)
119
+ {
120
+ // Filter individuals by user provided lists
121
+ if (keep_then_exclude)
122
+ {
123
+ filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
124
+ filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
125
+ }
126
+ else
127
+ {
128
+ filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
129
+ filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
130
+ }
131
+ }
132
+
133
+ void vcf_file::filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename)
134
+ {
135
+ // Filter individuals by user provided list
136
+ if ((indv_to_keep_filename == "") && (indv_to_keep.size() == 0))
137
+ return;
138
+ printLOG("Keeping individuals in 'keep' list\n");
139
+ set<string> indv_to_keep_copy = indv_to_keep;
140
+ if (indv_to_keep_filename != "")
141
+ {
142
+ ifstream infile(indv_to_keep_filename.c_str());
143
+ if (!infile.is_open())
144
+ error("Could not open Individual file:" + indv_to_keep_filename, 1);
145
+ string line;
146
+ string tmp_indv;
147
+ stringstream ss;
148
+ while (!infile.eof())
149
+ {
150
+ getline(infile, line);
151
+ ss.str(line);
152
+ ss >> tmp_indv;
153
+ indv_to_keep_copy.insert(tmp_indv);
154
+ ss.clear();
155
+ }
156
+ infile.close();
157
+ }
158
+
159
+ for (unsigned int ui=0; ui<N_indv; ui++)
160
+ {
161
+ if (include_indv[ui] == false)
162
+ continue;
163
+ if (indv_to_keep_copy.find(indv[ui]) == indv_to_keep_copy.end())
164
+ include_indv[ui] = false;
165
+ }
166
+ }
167
+
168
+ void vcf_file::filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename)
169
+ {
170
+ // Filter individuals by user provided list
171
+ if ((indv_to_exclude_filename == "") && (indv_to_exclude.size() == 0))
172
+ return;
173
+ printLOG("Excluding individuals in 'exclude' list\n");
174
+ set<string> indv_to_exclude_copy = indv_to_exclude;
175
+ if (indv_to_exclude_filename != "")
176
+ {
177
+ ifstream infile(indv_to_exclude_filename.c_str());
178
+ if (!infile.is_open())
179
+ {
180
+ error("Could not open Individual file:" + indv_to_exclude_filename, 1);
181
+ }
182
+ string line;
183
+ string tmp_indv;
184
+ stringstream ss;
185
+ while (!infile.eof())
186
+ {
187
+ getline(infile, line);
188
+ ss.str(line);
189
+ ss >> tmp_indv;
190
+ indv_to_exclude_copy.insert(tmp_indv);
191
+ ss.clear();
192
+ }
193
+ infile.close();
194
+ }
195
+ for (unsigned int ui=0; ui<N_indv; ui++)
196
+ {
197
+ if (include_indv[ui] == false)
198
+ continue;
199
+ if (indv_to_exclude_copy.find(indv[ui]) != indv_to_exclude_copy.end())
200
+ include_indv[ui] = false;
201
+ }
202
+ }
203
+
204
+ void vcf_file::filter_individuals_by_call_rate(double min_call_rate)
205
+ {
206
+ // Filter individuals by call rate
207
+ if (min_call_rate <= 0.0)
208
+ return;
209
+
210
+ if (has_genotypes == false)
211
+ error("Require Genotypes in VCF file in order to filter individuals by call rate.");
212
+
213
+ printLOG("Filtering individuals by call rate\n");
214
+
215
+ unsigned int ui;
216
+ pair<int, int> genotype;
217
+ vector<int> N_sites_included(N_indv, 0);
218
+ vector<int> N_missing(N_indv, 0);
219
+ string vcf_line;
220
+ vcf_entry e(N_indv);
221
+ for (unsigned int s=0; s<N_entries; s++)
222
+ {
223
+ if (include_entry[s] == false)
224
+ continue;
225
+
226
+ get_vcf_entry(s, vcf_line);
227
+ e.reset(vcf_line);
228
+ for (ui=0; ui<N_indv; ui++)
229
+ {
230
+ if (include_indv[ui] == false)
231
+ continue;
232
+
233
+ if (include_genotype[s][ui] == true)
234
+ {
235
+ e.parse_genotype_entry(ui, true);
236
+ e.get_indv_GENOTYPE_ids(ui, genotype);
237
+ if (genotype.first != -1)
238
+ {
239
+ N_missing[ui]++;
240
+ }
241
+ N_sites_included[ui]++;
242
+ }
243
+ }
244
+ }
245
+
246
+ for (ui=0; ui<N_indv; ui++)
247
+ {
248
+ if (include_indv[ui] == false)
249
+ continue;
250
+
251
+ double call_rate = N_missing[ui] / (double)N_sites_included[ui];
252
+ if (call_rate < min_call_rate)
253
+ include_indv[ui] = false;
254
+ }
255
+ }
256
+
257
+ void vcf_file::filter_individuals_by_mean_depth(double min_mean_depth, double max_mean_depth)
258
+ {
259
+ // Filter individuals by mean depth across sites
260
+ if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits<double>::max()))
261
+ return;
262
+
263
+ if (has_genotypes == false)
264
+ error("Require Genotypes in VCF file in order to filter individuals by mean depth");
265
+
266
+ printLOG("Filtering individuals by mean depth\n");
267
+ unsigned int ui;
268
+
269
+ vector<int> N_sites_included(N_indv, 0);
270
+ vector<double> depth_sum(N_indv,0.0);
271
+ int depth;
272
+ string vcf_line;
273
+ vcf_entry e(N_indv);
274
+ for (unsigned int s=0; s<N_entries; s++)
275
+ {
276
+ if (include_entry[s] == false)
277
+ continue;
278
+
279
+ get_vcf_entry(s, vcf_line);
280
+ e.reset(vcf_line);
281
+
282
+ for (ui=0; ui<N_indv; ui++)
283
+ {
284
+ if (include_indv[ui] == false)
285
+ continue;
286
+ if (include_genotype[s][ui] == true)
287
+ {
288
+ e.parse_genotype_entry(ui, false, false, true);
289
+ depth = e.get_indv_DEPTH(ui);
290
+ if (depth >= 0)
291
+ {
292
+ depth_sum[ui] += depth;
293
+ N_sites_included[ui]++;
294
+ }
295
+ }
296
+ }
297
+ }
298
+
299
+ for (ui=0; ui<N_indv; ui++)
300
+ {
301
+ if (include_indv[ui] == false)
302
+ continue;
303
+ double mean_depth = depth_sum[ui] / N_sites_included[ui];
304
+ if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth))
305
+ include_indv[ui] = false;
306
+ }
307
+ }
308
+
309
+ void vcf_file::filter_individuals_by_phase()
310
+ {
311
+ // Filter individuals that are completely unphased.
312
+ // TODO: Alter this to allow for a max/min level of unphased-ness.
313
+ printLOG("Filtering Unphased Individuals\n");
314
+
315
+ if (has_genotypes == false)
316
+ error("Require Genotypes in VCF file to filter by Phase.");
317
+
318
+ unsigned int ui, s;
319
+ vector<unsigned int> indv_count(N_indv, 0);
320
+ vector<unsigned int> indv_count_unphased(N_indv, 0);
321
+ string vcf_line;
322
+ vcf_entry e(N_indv);
323
+ for (s=0; s<N_entries; s++)
324
+ {
325
+ if (include_entry[s] == false)
326
+ continue;
327
+
328
+ get_vcf_entry(s, vcf_line);
329
+ e.reset(vcf_line);
330
+
331
+ for (ui=0; ui<N_indv; ui++)
332
+ {
333
+ if (include_indv[ui] == false)
334
+ continue;
335
+
336
+ e.parse_genotype_entry(ui, true);
337
+
338
+ indv_count[ui]++;
339
+ if (e.get_indv_PHASE(ui) != '|')
340
+ indv_count_unphased[ui]++;
341
+ }
342
+ }
343
+
344
+ for (ui=0; ui<N_indv; ui++)
345
+ {
346
+ if (include_indv[ui] == false)
347
+ continue;
348
+
349
+ if (indv_count_unphased[ui] == indv_count[ui])
350
+ {
351
+ include_indv[ui] = false;
352
+ }
353
+ }
354
+ }
355
+
356
+ void vcf_file::filter_individuals_randomly(int max_N_indv)
357
+ {
358
+ // Filter individuals randomly until have a random subset
359
+ if (max_N_indv < 0)
360
+ return;
361
+ printLOG("Filtering Individuals Randomly\n");
362
+
363
+ if (has_genotypes == false)
364
+ error("Require Genotypes in VCF file filter individuals.");
365
+
366
+ unsigned int N_kept_indv = N_kept_individuals();
367
+
368
+ srand ( time(NULL) );
369
+ vector<unsigned int> keep_index(N_kept_indv);
370
+ int count = 0;
371
+ for (unsigned int ui=0; ui<N_indv; ui++)
372
+ {
373
+ if (include_indv[ui] == true)
374
+ {
375
+ keep_index[count] = ui;
376
+ count++;
377
+ }
378
+ }
379
+
380
+ random_shuffle(keep_index.begin(), keep_index.end()); // Get a random order
381
+ keep_index.resize(min(max_N_indv, (signed)keep_index.size())); // Only keep a subset
382
+
383
+ for (unsigned int ui=0; ui<N_indv; ui++)
384
+ {
385
+ if (include_indv[ui] == false)
386
+ continue;
387
+ bool found = false;
388
+ for (unsigned int uj=0; uj<keep_index.size(); uj++)
389
+ {
390
+ if (keep_index[uj] == ui)
391
+ {
392
+ found = true;
393
+ }
394
+ }
395
+ if (found == false)
396
+ include_indv[ui] = false;
397
+ }
398
+ }
399
+
400
+
401
+ void vcf_file::filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude)
402
+ {
403
+ // Filter sites by user provided lists
404
+ if (keep_then_exclude)
405
+ {
406
+ filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
407
+ filter_sites_to_exclude(snps_to_exclude_file);
408
+ }
409
+ else
410
+ {
411
+ filter_sites_to_exclude(snps_to_exclude_file);
412
+ filter_sites_to_keep(snps_to_keep, snps_to_keep_file);
413
+ }
414
+ }
415
+
416
+ void vcf_file::filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file)
417
+ {
418
+ // Filter sites by user provided list
419
+ if ((snps_to_keep.size() == 0) && (snps_to_keep_file == ""))
420
+ return;
421
+
422
+ set<string> local_snps_to_keep = snps_to_keep;
423
+
424
+ printLOG("Keeping sites by user-supplied list\n");
425
+
426
+ if (snps_to_keep_file != "")
427
+ {
428
+ ifstream in(snps_to_keep_file.c_str());
429
+ string tmp;
430
+ if (!in.is_open())
431
+ {
432
+ error("Could not open SNPs to Keep file" + snps_to_keep_file, 0);
433
+ }
434
+ while (!in.eof())
435
+ {
436
+ in >> tmp;
437
+ local_snps_to_keep.insert(tmp);
438
+ in.ignore(numeric_limits<streamsize>::max(), '\n');
439
+ }
440
+
441
+ in.close();
442
+ }
443
+
444
+ string vcf_line;
445
+ for (unsigned int s=0; s<N_entries; s++)
446
+ {
447
+ if (include_entry[s] == false)
448
+ continue;
449
+
450
+ get_vcf_entry(s, vcf_line);
451
+ vcf_entry e(N_indv, vcf_line);
452
+ e.parse_basic_entry();
453
+ if (local_snps_to_keep.find(e.get_ID()) == local_snps_to_keep.end())
454
+ include_entry[s] = false;
455
+ }
456
+ }
457
+
458
+ void vcf_file::filter_sites_to_exclude(const string &snps_to_exclude_file)
459
+ {
460
+ // Filter sites by user provided list
461
+ if (snps_to_exclude_file == "")
462
+ return;
463
+
464
+ printLOG("Excluding sites by user-supplied list\n");
465
+
466
+ set<string> snps_to_exclude;
467
+ if (snps_to_exclude_file != "")
468
+ {
469
+ ifstream in(snps_to_exclude_file.c_str());
470
+ string tmp;
471
+ if (!in.is_open())
472
+ {
473
+ error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0);
474
+ }
475
+ while (!in.eof())
476
+ {
477
+ in >> tmp;
478
+ snps_to_exclude.insert(tmp);
479
+ in.ignore(numeric_limits<streamsize>::max(), '\n');
480
+ }
481
+ in.close();
482
+ }
483
+
484
+ string vcf_line;
485
+ for (unsigned int s=0; s<N_entries; s++)
486
+ {
487
+ if (include_entry[s] == false)
488
+ continue;
489
+
490
+ get_vcf_entry(s, vcf_line);
491
+ vcf_entry e(N_indv, vcf_line);
492
+ e.parse_basic_entry();
493
+ if (snps_to_exclude.find(e.get_ID()) != snps_to_exclude.end())
494
+ include_entry[s] = false;
495
+ }
496
+ }
497
+
498
+ void vcf_file::filter_sites_by_quality(double min_quality)
499
+ {
500
+ // Filter sites by quality
501
+ if (min_quality < 0)
502
+ return;
503
+
504
+ printLOG("Filtering sites with Quality less than " + dbl2str(min_quality,0) + "\n");
505
+
506
+ unsigned int s;
507
+ string vcf_line;
508
+ for (s=0; s<N_entries; s++)
509
+ {
510
+ if (include_entry[s] == false)
511
+ continue;
512
+ get_vcf_entry(s, vcf_line);
513
+ vcf_entry e(N_indv, vcf_line);
514
+ e.parse_basic_entry(true);
515
+ string alt_allele = e.get_ALT_allele(0);
516
+ // The QUAL field has different definitions depending on the state of the
517
+ // alternative allele. Here I treat them separately, although in this case
518
+ // it is unnecessary.
519
+ if ((alt_allele == ".") || (alt_allele == ""))
520
+ { // The case that the alternative allele is unknown
521
+ // QUAL is -10log_10 p(variant)
522
+ if (e.get_QUAL() < min_quality)
523
+ include_entry[s] = false;
524
+ }
525
+ else
526
+ { // The normal case
527
+ // QUAL is -10log_10 p(no variant)
528
+ if (e.get_QUAL() < min_quality)
529
+ include_entry[s] = false;
530
+ }
531
+ }
532
+ }
533
+
534
+ void vcf_file::filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth)
535
+ {
536
+ // Filter sites by mean depth
537
+ if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits<double>::max()))
538
+ return;
539
+
540
+ if (has_genotypes == false)
541
+ error("Require Genotypes in VCF file in order to filter sites by mean depth");
542
+
543
+ printLOG("Filtering sites by mean depth\n");
544
+ int depth;
545
+
546
+ string vcf_line;
547
+ for (unsigned int s=0; s<N_entries; s++)
548
+ {
549
+ if (include_entry[s] == false)
550
+ continue;
551
+
552
+ get_vcf_entry(s, vcf_line);
553
+ vcf_entry e(N_indv, vcf_line);
554
+
555
+ unsigned int N_indv_included = 0;
556
+ double depth_sum = 0.0;
557
+ for (unsigned int ui=0; ui<N_indv; ui++)
558
+ {
559
+ if (include_indv[ui] == false)
560
+ continue;
561
+
562
+ if (include_genotype[s][ui] == true)
563
+ {
564
+ e.parse_genotype_entry(ui, false, false, true);
565
+ depth = e.get_indv_DEPTH(ui);
566
+ if (depth >= 0)
567
+ {
568
+ depth_sum += depth;
569
+ }
570
+ N_indv_included++;
571
+ }
572
+ }
573
+ double mean_depth = depth_sum / N_indv_included;
574
+
575
+ if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth))
576
+ include_entry[s] = false;
577
+ }
578
+ }
579
+
580
+ void vcf_file::filter_sites_by_position(const string &chr, int start_pos, int end_pos)
581
+ {
582
+ // Filter sites by user provided position range
583
+ if ((chr == "") || ((start_pos == -1) && (end_pos==numeric_limits<int>::max())))
584
+ return;
585
+ printLOG("Filtering sites by chromosome and/or position\n");
586
+ string vcf_line;
587
+ string chrom; int pos1;
588
+ for (unsigned int s=0; s<N_entries; s++)
589
+ {
590
+ if (include_entry[s] == false)
591
+ continue;
592
+ //get_vcf_entry(s, vcf_line);
593
+ //vcf_entry e(N_indv, vcf_line);
594
+ //e.parse_basic_entry();
595
+ set_filepos(entry_file_locations[s]);
596
+ read_CHROM_and_POS_only(chrom, pos1);
597
+ if (chrom == chr)
598
+ {
599
+ if ((pos1 < start_pos) || (pos1 > end_pos))
600
+ include_entry[s] = false;
601
+ }
602
+ else
603
+ include_entry[s] = false;
604
+ }
605
+ }
606
+
607
+ void vcf_file::filter_sites_by_positions(const string &positions_file)
608
+ {
609
+ // Filter sites by a user defined file containing a list of positions
610
+ if (positions_file == "")
611
+ return;
612
+ printLOG("Filtering sites by Positions file\n");
613
+ ifstream BED(positions_file.c_str());
614
+ if (!BED.is_open())
615
+ error("Could not open Positions file: " + positions_file);
616
+
617
+ string chr;
618
+ int pos1;
619
+ int idx;
620
+ unsigned int N_chr=0;
621
+ map<string,int> chr_to_idx;
622
+ vector< set<int > > lims;
623
+ stringstream ss;
624
+ string line;
625
+ // Skip header
626
+ BED.ignore(numeric_limits<streamsize>::max(), '\n');
627
+ while (!BED.eof())
628
+ {
629
+ getline(BED, line);
630
+ if (line[0] == '#')
631
+ continue;
632
+
633
+ ss.clear();
634
+ ss.str(line);
635
+ ss >> chr >> pos1;
636
+
637
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
638
+ {
639
+ N_chr++;
640
+ chr_to_idx[chr] = (N_chr-1);
641
+ lims.resize(N_chr);
642
+ }
643
+
644
+ idx = chr_to_idx[chr];
645
+ lims[idx].insert(pos1);
646
+ }
647
+ BED.close();
648
+
649
+ string vcf_line;
650
+ for (unsigned int s=0; s<N_entries; s++)
651
+ {
652
+ if (include_entry[s] == false)
653
+ continue;
654
+ //get_vcf_entry(s, vcf_line);
655
+ //vcf_entry e(N_indv, vcf_line);
656
+ //e.parse_basic_entry();
657
+ //e.get_CHROM(chr);
658
+ set_filepos(entry_file_locations[s]);
659
+ read_CHROM_and_POS_only(chr, pos1);
660
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
661
+ include_entry[s] = false;
662
+ else
663
+ {
664
+ //pos1 = e.get_POS();
665
+ idx = chr_to_idx[chr];
666
+ bool found=false;
667
+
668
+ if (lims[idx].find(pos1) != lims[idx].end())
669
+ found = true;
670
+
671
+ if (found == false)
672
+ include_entry[s] = false;
673
+ }
674
+ }
675
+ }
676
+
677
+ void vcf_file::filter_sites_by_BED_file(const string &bed_file, bool BED_exclude)
678
+ {
679
+ // Filter sites depending on positions in a BED file.
680
+ if (bed_file == "")
681
+ return;
682
+ printLOG("Filtering sites by BED file\n");
683
+ ifstream BED(bed_file.c_str());
684
+ if (!BED.is_open())
685
+ error("Could not open BED file: " + bed_file);
686
+
687
+ string chr;
688
+ int pos1, pos2;
689
+ int idx;
690
+ unsigned int N_chr=0;
691
+ map<string,int> chr_to_idx;
692
+ vector< deque<pair<int,int> > > lims;
693
+ BED.ignore(numeric_limits<streamsize>::max(), '\n'); // Ignore header
694
+ while (!BED.eof())
695
+ {
696
+ BED >> chr >> pos1 >> pos2;
697
+ BED.ignore(numeric_limits<streamsize>::max(), '\n');
698
+
699
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
700
+ {
701
+ N_chr++;
702
+ chr_to_idx[chr] = (N_chr-1);
703
+ lims.resize(N_chr);
704
+ }
705
+
706
+ idx = chr_to_idx[chr];
707
+ lims[idx].push_back(make_pair(pos1,pos2));
708
+ }
709
+ BED.close();
710
+
711
+ for (unsigned int ui=0; ui<lims.size(); ui++)
712
+ sort(lims[ui].begin(), lims[ui].end());
713
+
714
+ pair<int,int> range;
715
+ string vcf_line;
716
+ vector<unsigned int> min_ui(lims.size(), 0);
717
+ for (unsigned int s=0; s<N_entries; s++)
718
+ {
719
+ if (include_entry[s] == false)
720
+ continue;
721
+ //get_vcf_entry(s, vcf_line);
722
+ //vcf_entry e(N_indv, vcf_line);
723
+ //e.parse_basic_entry();
724
+ //e.get_CHROM(chr);
725
+ set_filepos(entry_file_locations[s]);
726
+ read_CHROM_and_POS_only(chr, pos1);
727
+ if (BED_exclude == false)
728
+ { // Exclude sites not in BED file
729
+ if (chr_to_idx.find(chr) == chr_to_idx.end())
730
+ include_entry[s] = false;
731
+ else
732
+ {
733
+ idx = chr_to_idx[chr];
734
+ bool found=false;
735
+ unsigned int max_ui = lims[idx].size();
736
+ for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
737
+ { // No need to start this loop at zero every time...
738
+ if ((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second))
739
+ {
740
+ found=true;
741
+ break;
742
+ }
743
+ else if (pos1 > lims[idx][ui].second)
744
+ min_ui[idx] = ui+1;
745
+ }
746
+ if (found == false)
747
+ include_entry[s] = false;
748
+ }
749
+ }
750
+ else
751
+ { // Exclude sites in BED file
752
+ if (chr_to_idx.find(chr) != chr_to_idx.end())
753
+ {
754
+ idx = chr_to_idx[chr];
755
+ bool found=false;
756
+ unsigned int max_ui = lims[idx].size();
757
+ for (unsigned int ui=min_ui[idx]; ui<max_ui; ui++)
758
+ { // No need to start this loop at zero every time...
759
+ if ((pos1 > lims[idx][ui].first) && (pos1 <= lims[idx][ui].second))
760
+ {
761
+ found=true;
762
+ break;
763
+ }
764
+ else if (pos1 > lims[idx][ui].second)
765
+ min_ui[idx] = ui+1;
766
+ }
767
+ if (found == true)
768
+ include_entry[s] = false;
769
+ }
770
+ }
771
+ }
772
+ }
773
+
774
+ void vcf_file::filter_sites_by_mask(const string &mask_file, bool invert_mask, int min_kept_mask_value)
775
+ {
776
+ // Filter sites on the basis of a fasta-like mask file.
777
+ if (mask_file == "")
778
+ return;
779
+ if (invert_mask == false)
780
+ printLOG("Filtering sites by mask file\n");
781
+ else
782
+ printLOG("Filtering sites by inverted mask file\n");
783
+ ifstream mask(mask_file.c_str());
784
+ if (!mask.is_open())
785
+ error("Could not open mask file: " + mask_file);
786
+
787
+ string line;
788
+ string next_chr="", vcf_line;
789
+ unsigned int next_pos = 0;
790
+ unsigned int next_s = 0;
791
+
792
+ unsigned int current_pos = 1;
793
+ string current_header = "";
794
+ bool keep;
795
+ while (!mask.eof())
796
+ {
797
+ getline(mask, line);
798
+ line.erase( line.find_last_not_of(" \t") + 1);
799
+
800
+ if (line[0] == '>')
801
+ { // Header
802
+ current_header = line.substr(1, line.find_first_of(" \t")-1);
803
+ current_pos = 1;
804
+ for (unsigned int s=0; s<N_entries; s++)
805
+ {
806
+ if (include_entry[s] == true)
807
+ {
808
+ get_vcf_entry(s, vcf_line);
809
+ vcf_entry e(N_indv, vcf_line);
810
+ e.parse_basic_entry();
811
+ e.get_CHROM(next_chr);
812
+ if (next_chr == current_header)
813
+ {
814
+ next_pos = (unsigned)e.get_POS();
815
+ next_s = s;
816
+ break;
817
+ }
818
+ else
819
+ {
820
+ include_entry[s] = false;
821
+ }
822
+ }
823
+ }
824
+ }
825
+ else
826
+ {
827
+ if ((current_pos + line.size() >= next_pos) && (next_chr == current_header))
828
+ {
829
+ for (unsigned int ui=0; ui<line.size(); ui++)
830
+ {
831
+ if (current_pos + ui == next_pos)
832
+ {
833
+ char mask_base = line[ui]-48;
834
+ keep = (mask_base <= min_kept_mask_value);
835
+ if (invert_mask == true)
836
+ keep = !keep;
837
+
838
+ if (keep == false)
839
+ {
840
+ include_entry[next_s] = false;
841
+ }
842
+
843
+ next_s += 1;
844
+ for (unsigned int s=next_s; s<N_entries; s++)
845
+ {
846
+ if (include_entry[s] == true)
847
+ {
848
+ get_vcf_entry(s, vcf_line);
849
+ vcf_entry e(N_indv, vcf_line);
850
+ e.parse_basic_entry();
851
+ e.get_CHROM(next_chr);
852
+ next_pos = (unsigned)e.get_POS();
853
+ next_s = s;
854
+ break;
855
+ }
856
+ }
857
+ }
858
+ }
859
+ }
860
+ current_pos += line.size();
861
+ }
862
+ }
863
+ mask.close();
864
+
865
+ // Remaining sites aren't covered by mask, so exclude
866
+ for (unsigned int s=next_s; s<N_entries; s++)
867
+ {
868
+ include_entry[s] = false;
869
+ }
870
+ }
871
+
872
+
873
+ void vcf_file::filter_sites_by_number_of_alleles(int min_alleles, int max_alleles)
874
+ {
875
+ // Filter sites by the number of alleles (e.g. 2 for bi-allelic)
876
+ if ((min_alleles <= 0) && (max_alleles == numeric_limits<int>::max()))
877
+ return;
878
+ printLOG("Filtering sites by number of alleles\n");
879
+
880
+ int N_alleles;
881
+ string vcf_line;
882
+ for (unsigned int s=0; s<N_entries; s++)
883
+ {
884
+ if (include_entry[s] == false)
885
+ continue;
886
+
887
+ get_vcf_entry(s, vcf_line);
888
+ vcf_entry e(N_indv, vcf_line);
889
+ e.parse_basic_entry(true);
890
+ N_alleles = e.get_N_alleles();
891
+ if ((N_alleles < min_alleles) || (N_alleles > max_alleles))
892
+ {
893
+ include_entry[s] = false;
894
+ }
895
+ }
896
+ }
897
+
898
+ void vcf_file::filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate)
899
+ {
900
+ // Filter sites so that all allele frequencies are between limits
901
+ if ((min_maf <= 0.0) && (max_maf >= 1.0) && (min_site_call_rate <= 0) && (min_non_ref_af <= 0.0) && (max_non_ref_af >= 1.0))
902
+ return;
903
+
904
+ if (has_genotypes == false)
905
+ error("Require Genotypes in VCF file to filter by frequency and/or call rate");
906
+
907
+ printLOG("Filtering sites by allele frequency and call rate\n");
908
+
909
+ unsigned int N_alleles;
910
+ unsigned int N_non_missing_chr;
911
+
912
+ string vcf_line;
913
+ vcf_entry e(N_indv);
914
+ for (unsigned int s=0; s<N_entries; s++)
915
+ {
916
+ if (include_entry[s] == false)
917
+ continue;
918
+
919
+ get_vcf_entry(s, vcf_line);
920
+ e.reset(vcf_line);
921
+ e.parse_basic_entry(true);
922
+ e.parse_genotype_entries(true);
923
+ N_alleles = e.get_N_alleles();
924
+
925
+ vector<int> allele_counts;
926
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
927
+
928
+ double freq;
929
+ double maf=numeric_limits<double>::max();
930
+ for (unsigned int ui=0; ui<N_alleles; ui++)
931
+ {
932
+ freq = allele_counts[ui] / (double)N_non_missing_chr;
933
+ freq = min(freq, 1.0 - freq);
934
+
935
+ maf = min(maf, freq);
936
+ if ((ui > 0) && ((freq < min_non_ref_af) || (freq > max_non_ref_af)))
937
+ include_entry[s] = false;
938
+ }
939
+
940
+
941
+ if ((maf < min_maf) || (maf > max_maf))
942
+ include_entry[s] = false;
943
+
944
+ //unsigned int N_geno_included = e.get_N_chr();
945
+ double call_rate = N_non_missing_chr / double(e.get_N_chr(include_indv, include_genotype[s]));
946
+
947
+ if (call_rate < min_site_call_rate)
948
+ include_entry[s] = false;
949
+ }
950
+ }
951
+
952
+
953
+
954
+ void vcf_file::filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double max_missing_call_count)
955
+ {
956
+ if ((min_mac <= 0) && (max_mac == numeric_limits<int>::max()) &&
957
+ (min_non_ref_ac <= 0) && (max_non_ref_ac == numeric_limits<int>::max()) &&
958
+ (max_missing_call_count == numeric_limits<int>::max()))
959
+ return;
960
+
961
+ // Filter sites so that all allele counts are between limits
962
+ if (has_genotypes == false)
963
+ error("Require Genotypes in VCF file to filter by allele counts and/or missing data");
964
+
965
+ printLOG("Filtering sites by allele count and missing data\n");
966
+
967
+ unsigned int N_alleles, N_chr, N_non_missing_chr;
968
+
969
+ string vcf_line;
970
+ vcf_entry e(N_indv);
971
+ for (unsigned int s=0; s<N_entries; s++)
972
+ {
973
+ if (include_entry[s] == false)
974
+ continue;
975
+
976
+ get_vcf_entry(s, vcf_line);
977
+ e.reset(vcf_line);
978
+ e.parse_basic_entry(true);
979
+ e.parse_genotype_entries(true);
980
+ N_alleles = e.get_N_alleles();
981
+
982
+ vector<int> allele_counts;
983
+ e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]);
984
+ N_chr = e.get_N_chr(include_indv, include_genotype[s]);
985
+
986
+ int mac = numeric_limits<int>::max();
987
+ for (unsigned int ui=0; ui<N_alleles; ui++)
988
+ {
989
+ mac = min(allele_counts[ui], mac);
990
+ if ((ui > 0) && ((allele_counts[ui] < min_non_ref_ac) || (allele_counts[ui] > max_non_ref_ac)))
991
+ include_entry[s] = false;
992
+ }
993
+
994
+ if ((mac < min_mac) || (mac > max_mac))
995
+ include_entry[s] = false;
996
+
997
+ if ((N_chr-N_non_missing_chr) > max_missing_call_count)
998
+ include_entry[s] = false;
999
+ }
1000
+ }
1001
+
1002
+
1003
+ void vcf_file::filter_sites_by_HWE_pvalue(double min_HWE_pvalue)
1004
+ {
1005
+ // Filter sites by HWE p-value
1006
+ if (min_HWE_pvalue <= 0)
1007
+ return;
1008
+
1009
+ if (has_genotypes == false)
1010
+ error("Require Genotypes in VCF file to filter sites by HWE.");
1011
+
1012
+ // Note this assumes Biallelic SNPs.
1013
+ printLOG("Filtering sites by HWE p-value (only including bi-allelic sites)\n");
1014
+
1015
+ unsigned int b11, b12, b22;
1016
+ double p;
1017
+ string vcf_line;
1018
+ for (unsigned int s=0; s<N_entries; s++)
1019
+ {
1020
+ if (include_entry[s] == false)
1021
+ continue;
1022
+
1023
+ get_vcf_entry(s, vcf_line);
1024
+ vcf_entry e(N_indv, vcf_line);
1025
+
1026
+ e.parse_basic_entry(true);
1027
+ e.parse_genotype_entries(true);
1028
+
1029
+ e.get_genotype_counts(include_indv, include_genotype[s], b11, b12, b22);
1030
+ p = vcf_entry::SNPHWE(b12, b11, b22);
1031
+
1032
+ if (p < min_HWE_pvalue)
1033
+ include_entry[s] = false;
1034
+ }
1035
+ }
1036
+
1037
+ void vcf_file::filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all)
1038
+ {
1039
+ // Filter sites by entries in the FILTER field.
1040
+ if ((remove_all == false) && (filter_flags_to_remove.size() == 0) && (filter_flags_to_keep.size() == 0))
1041
+ return;
1042
+
1043
+ printLOG("Filtering sites by FILTER Status.\n");
1044
+
1045
+ vector<string> FILTERs;
1046
+ string vcf_line;
1047
+ unsigned int N_to_remove = filter_flags_to_remove.size();
1048
+ unsigned int N_to_keep = filter_flags_to_keep.size();
1049
+ for (unsigned int s=0; s<N_entries; s++)
1050
+ {
1051
+ if (include_entry[s] == false)
1052
+ continue;
1053
+
1054
+ get_vcf_entry(s, vcf_line);
1055
+ vcf_entry e(N_indv, vcf_line);
1056
+
1057
+ e.parse_basic_entry(false, true);
1058
+
1059
+ e.get_FILTER_vector(FILTERs);
1060
+
1061
+ if (N_to_keep > 0)
1062
+ {
1063
+ bool keep = false;
1064
+ for (unsigned int ui=0; ui<FILTERs.size(); ui++)
1065
+ if (filter_flags_to_keep.find(FILTERs[ui]) != filter_flags_to_keep.end())
1066
+ {
1067
+ keep = true; break;
1068
+ }
1069
+
1070
+ include_entry[s] = keep;
1071
+ }
1072
+
1073
+ if (include_entry[s]==false)
1074
+ continue;
1075
+
1076
+ if ((remove_all == true) && (FILTERs.size() > 0))
1077
+ include_entry[s] = false;
1078
+ else if (N_to_remove > 0)
1079
+ {
1080
+ for (unsigned int ui=0; ui<FILTERs.size(); ui++)
1081
+ if (filter_flags_to_remove.find(FILTERs[ui]) != filter_flags_to_remove.end())
1082
+ include_entry[s] = false;
1083
+ }
1084
+ }
1085
+ }
1086
+
1087
+ void vcf_file::filter_sites_by_phase()
1088
+ {
1089
+ // Filter out sites with unphased entries
1090
+ // TODO: Alter this to allow for a max/min level of unphased-ness.
1091
+ printLOG("Filtering Sites with Unphased Genotypes\n");
1092
+ string vcf_line;
1093
+ vcf_entry e(N_indv);
1094
+
1095
+ for (unsigned int s=0; s<N_entries; s++)
1096
+ {
1097
+ if (include_entry[s] == false)
1098
+ continue;
1099
+
1100
+ unsigned int count = 0;
1101
+ unsigned int count_unphased = 0;
1102
+ get_vcf_entry(s, vcf_line);
1103
+ e.reset(vcf_line);
1104
+
1105
+ for (unsigned int ui=0; ui<N_indv; ui++)
1106
+ {
1107
+ if (include_indv[ui] == false)
1108
+ continue;
1109
+
1110
+ e.parse_genotype_entry(ui, true);
1111
+
1112
+ count++;
1113
+ if (e.get_indv_PHASE(ui) != '|')
1114
+ count_unphased++;
1115
+ }
1116
+
1117
+ if (count_unphased > 0)
1118
+ include_entry[s] = false;
1119
+ }
1120
+ }
1121
+
1122
+ void vcf_file::filter_sites_by_thinning(int min_SNP_distance)
1123
+ {
1124
+ // Filter sites so that no two SNPs are within some minimum distance
1125
+ if (min_SNP_distance < 1)
1126
+ return;
1127
+ printLOG("Filtering sites so that no two sites are within " + int2str(min_SNP_distance) + "bp\n");
1128
+
1129
+ string vcf_line;
1130
+ vcf_entry e(N_indv);
1131
+ map<string, int> CHROM_to_idx;
1132
+ string CHROM, last_CHROM="";
1133
+ int POS, last_POS = -1;
1134
+ int distance_from_last_SNP;
1135
+
1136
+ for (unsigned int s=0; s<N_entries; s++)
1137
+ {
1138
+ if (include_entry[s] == false)
1139
+ continue;
1140
+
1141
+ //get_vcf_entry(s, vcf_line);
1142
+ //e.reset(vcf_line);
1143
+ //e.parse_basic_entry();
1144
+
1145
+ //CHROM = e.get_CHROM();
1146
+ //POS = e.get_POS();
1147
+ set_filepos(entry_file_locations[s]);
1148
+ read_CHROM_and_POS_only(CHROM, POS);
1149
+ if (CHROM == last_CHROM)
1150
+ {
1151
+ distance_from_last_SNP = POS - last_POS;
1152
+ if (distance_from_last_SNP < min_SNP_distance)
1153
+ include_entry[s] = false;
1154
+ }
1155
+ if (include_entry[s] == true)
1156
+ last_POS = POS;
1157
+ last_CHROM = CHROM;
1158
+ }
1159
+ }
1160
+
1161
+
1162
+ void vcf_file::filter_sites_by_INFO_flags(const set<string> &flags_to_remove, const set<string> &flags_to_keep)
1163
+ {
1164
+ // Filter sites by entries in the INFO field.
1165
+ if ((flags_to_remove.size() == 0) && (flags_to_keep.size() == 0))
1166
+ return;
1167
+
1168
+ printLOG("Filtering sites by INFO flags.\n");
1169
+
1170
+ vector<string> INFOs;
1171
+ string vcf_line;
1172
+ string value;
1173
+ unsigned int N_to_remove = flags_to_remove.size();
1174
+ unsigned int N_to_keep = flags_to_keep.size();
1175
+ for (unsigned int s=0; s<N_entries; s++)
1176
+ {
1177
+ if (include_entry[s] == false)
1178
+ continue;
1179
+
1180
+ get_vcf_entry(s, vcf_line);
1181
+ vcf_entry e(N_indv, vcf_line);
1182
+
1183
+ e.parse_basic_entry(false, false, true);
1184
+
1185
+ if (N_to_keep > 0)
1186
+ {
1187
+ bool keep = false;
1188
+ for (set<string>::iterator it=flags_to_keep.begin(); it != flags_to_keep.end(); ++it)
1189
+ {
1190
+ value = e.get_INFO_value(*it);
1191
+ if (value == "1")
1192
+ keep = true;
1193
+ }
1194
+
1195
+ include_entry[s] = keep;
1196
+ }
1197
+
1198
+ if (include_entry[s]==false)
1199
+ continue;
1200
+
1201
+ if (N_to_remove > 0)
1202
+ {
1203
+ for (set<string>::iterator it=flags_to_remove.begin(); it != flags_to_remove.end(); ++it)
1204
+ {
1205
+ value = e.get_INFO_value(*it);
1206
+ if (value == "1")
1207
+ {
1208
+ include_entry[s] = false;
1209
+ continue;
1210
+ }
1211
+ }
1212
+ }
1213
+ }
1214
+ }
1215
+