bio-vcf 0.8.1 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +2 -8
- data/LICENSE.txt +1 -1
- data/README.md +467 -129
- data/RELEASE_NOTES.md +27 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +9 -42
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +156 -108
- data/bio-vcf.gemspec +13 -75
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +12 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +1 -11
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
- data/lib/bio-vcf/vcfheader.rb +137 -5
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +26 -2
- data/lib/regressiontest.rb +11 -0
- data/lib/regressiontest/cli_exec.rb +101 -0
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +16 -16
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +63 -64
- data/Gemfile.lock +0 -81
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 814f6cb6c8bc237fd08ab4f22f2bfea514525fc6c2fd1b9081cd314bfa3c2fd2
|
4
|
+
data.tar.gz: 194aa006ac5c46c157360e37e98e42dbf88d121f9c64983792e8a7cfe43d0142
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cad19fa108652d42aaaff95296176a21781813848a1dacae879cd44fcb324b9f38f763fc1e516ce41cb492433457c17d27e4aca0f64308758594fb68c0abadf
|
7
|
+
data.tar.gz: 5290a23fe85fe063b6fb8606c77cca3abd3df3af624ac9da2ac3348336bbfafa7e8b3d70d2c5c3ba57a55da75a052f1fa2c1d20212ba7b4e656ac7263f5f1af0
|
data/.travis.yml
CHANGED
@@ -1,13 +1,3 @@
|
|
1
1
|
language: ruby
|
2
|
-
rvm:
|
3
|
-
# - 1.9.3 <- No longer working
|
4
|
-
- 2.0.0
|
5
|
-
- 2.1.0
|
6
|
-
# - jruby-head
|
7
|
-
# - jruby-19mode # JRuby in 1.9 mode
|
8
|
-
# - 1.8.7
|
9
|
-
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
-
# - rbx-18mode
|
11
2
|
|
12
|
-
|
13
|
-
# script: bundle exec rspec spec
|
3
|
+
arch: arm64
|
data/Gemfile
CHANGED
@@ -1,15 +1,9 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
-
# Add dependencies required to use your gem here.
|
3
|
-
# Example:
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
5
2
|
|
6
|
-
# Add dependencies to develop your gem here.
|
7
|
-
# Include everything needed to run rake, tests, features, etc.
|
8
3
|
group :development do
|
9
|
-
|
4
|
+
gem "rake"
|
10
5
|
gem "rspec"
|
11
6
|
gem "cucumber"
|
12
|
-
gem "jeweler", "~> 2.0.1" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
-
gem "regressiontest", "~> 0.0.3"
|
14
7
|
end
|
15
8
|
|
9
|
+
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,49 +1,146 @@
|
|
1
1
|
# bio-vcf
|
2
2
|
|
3
|
-
[![Build Status](https://secure.travis-ci.org/
|
3
|
+
[![Build Status](https://secure.travis-ci.org/vcflib/bio-vcf.png)](http://travis-ci.org/vcflib/bio-vcf) [![rubygem](https://img.shields.io/gem/v/bio-vcf.svg?style=flat)](http://rubygems.org/gems/bio-vcf "Install with Rubygems") [![AnacondaBadge](https://anaconda.org/bioconda/bio-vcf/badges/installer/conda.svg)](https://anaconda.org/bioconda/bio-vcf) [![DL](https://anaconda.org/bioconda/bio-vcf/badges/downloads.svg)](https://anaconda.org/bioconda/bio-vcf)
|
4
|
+
[![DebianBadge](https://badges.debian.net/badges/debian/testing/bio-vcf/version.svg)](https://packages.debian.org/testing/bio-vcf)
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
Quick index:
|
7
|
+
|
8
|
+
- [INSTALL](#Install)
|
9
|
+
- [Command line interface (CLI)](#command-line-interface-cli)
|
10
|
+
+ [Set analysis](#set-analysis)
|
11
|
+
+ [Genotype processing](#genotype-processing)
|
12
|
+
+ [Sample counting](#sample-counting)
|
13
|
+
+ [Filter with lambda](#reorder-filter-with-lambda)
|
14
|
+
+ [Modify VCF files](#modify-vcf-files)
|
15
|
+
+ [RDF output](#rdf-output)
|
16
|
+
- [Templates](#templates)
|
17
|
+
- [Metadata](#metadata)
|
18
|
+
- [Statistics](#statistics)
|
19
|
+
- [API](#api)
|
20
|
+
- [Cite](#cite)
|
21
|
+
|
22
|
+
|
23
|
+
## Bio-vcf
|
24
|
+
|
25
|
+
Bio-vcf provides a domain specific language (DSL) for processing the
|
26
|
+
VCF format. Record named fields can be queried with regular
|
27
|
+
expressions, e.g.
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
sample.dp>20 and rec.filter !~ /LowQD/ and rec.tumor.bcount[rec.alt]>4
|
31
|
+
```
|
32
|
+
|
33
|
+
Bio-vcf is a new generation VCF parser, filter and converter. Bio-vcf
|
34
|
+
is not only very fast for genome-wide (WGS) data, it also comes with a
|
35
|
+
really nice filtering, evaluation and rewrite language and it can
|
36
|
+
output any type of textual data, including VCF header and contents in
|
37
|
+
RDF and JSON.
|
38
|
+
|
39
|
+
So, why would you use bio-vcf over other parsers? Because
|
9
40
|
|
10
41
|
1. Bio-vcf is fast and scales on multi-core computers
|
11
42
|
2. Bio-vcf has an expressive filtering and evaluation language
|
12
43
|
3. Bio-vcf has great multi-sample support
|
13
44
|
4. Bio-vcf has multiple global filters and sample filters
|
14
45
|
5. Bio-vcf can access any VCF format
|
15
|
-
6. Bio-vcf can
|
16
|
-
7. Bio-vcf
|
17
|
-
8. Bio-vcf
|
18
|
-
9. Bio-vcf has
|
19
|
-
10. Bio-vcf
|
46
|
+
6. Bio-vcf can parse and query the VCF header (META data)
|
47
|
+
7. Bio-vcf can do calculations on fields
|
48
|
+
8. Bio-vcf allows for genotype processing
|
49
|
+
9. Bio-vcf has support for set analysis
|
50
|
+
10. Bio-vcf has sane error handling
|
51
|
+
11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
|
52
|
+
12. Bio-vcf has soft filters
|
53
|
+
|
54
|
+
Some examples are documented for [reducing GTEx](doc/GTEx_reduce.md),
|
55
|
+
[comparing GATK](doc/GATK_comparison.md), [comparing
|
56
|
+
VCFs](doc/Compare_VCFs.md), JSON [loading Mongo
|
57
|
+
database](doc/Using_Mongo.md), and [generating RDF](doc/Using_RDF.md).
|
58
|
+
|
59
|
+
## Options
|
60
|
+
|
61
|
+
In true Unix fashion files can be piped in or passed on the command
|
62
|
+
line:
|
63
|
+
|
64
|
+
bio-vcf --help
|
65
|
+
|
66
|
+
```
|
67
|
+
bio-vcf (biogem with pcows) by Pjotr Prins 2015-2020
|
68
|
+
|
69
|
+
Usage: bio-vcf [options] filename
|
70
|
+
e.g. bio-vcf < test/data/input/somaticsniper.vcf
|
71
|
+
-i, --ignore-missing Ignore missing data
|
72
|
+
--filter cmd Evaluate filter on each record
|
73
|
+
--sfilter cmd Evaluate filter on each sample
|
74
|
+
--sfilter-samples list Filter on selected samples (e.g., 0,1
|
75
|
+
--ifilter, --if cmd Include filter
|
76
|
+
--ifilter-samples list Include set - implicitly defines exclude set
|
77
|
+
--efilter, --ef cmd Exclude filter
|
78
|
+
--efilter-samples list Exclude set - overrides exclude set
|
79
|
+
--add-filter name Set/add filter field to name
|
80
|
+
--bed bedfile Filter on BED elements
|
81
|
+
-e, --eval cmd Evaluate command on each record
|
82
|
+
--eval-once cmd Evaluate command once (usually for header info)
|
83
|
+
--seval cmd Evaluate command on each sample
|
84
|
+
--rewrite eval Rewrite INFO
|
85
|
+
--samples list Output selected samples
|
86
|
+
--rdf Generate Turtle RDF (also check out --template!)
|
87
|
+
--num-threads [num] Multi-core version (default ALL)
|
88
|
+
--thread-lines num Fork thread on num lines (default 40000)
|
89
|
+
--skip-header Do not output VCF header info
|
90
|
+
--set-header list Set a special tab delimited output header (#samples expands to sample names)
|
91
|
+
-t, --template erb Use ERB template for output
|
92
|
+
--add-header-tag Add bio-vcf status tag to header output
|
93
|
+
--timeout [num] Timeout waiting for thread to complete (default 180)
|
94
|
+
--names Output sample names
|
95
|
+
--statistics Output statistics
|
96
|
+
-q, --quiet Run quietly
|
97
|
+
-v, --verbose Run verbosely
|
98
|
+
--debug Show debug messages and keep intermediate output
|
99
|
+
|
100
|
+
--id name Identifier
|
101
|
+
--tags list Add tags
|
102
|
+
-h, --help display this help and exit
|
103
|
+
```
|
104
|
+
|
105
|
+
## Performance
|
106
|
+
|
107
|
+
Bio-vcf has better performance than other tools because of lazy
|
108
|
+
parsing, multi-threading, and useful combinations of (fancy) command
|
109
|
+
line filtering. Adding cores, bio-vcf just
|
110
|
+
does better. The more complicated the filters, the larger the
|
111
|
+
gain. First a base line test to show IO performance
|
20
112
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
113
|
+
```sh
|
114
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
|
115
|
+
1987143 15897724 1003214613
|
116
|
+
real 0m7.823s
|
117
|
+
user 0m7.002s
|
118
|
+
sys 0m2.972s
|
119
|
+
```
|
120
|
+
|
121
|
+
Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
|
27
122
|
|
28
123
|
```sh
|
29
|
-
time
|
30
|
-
real
|
31
|
-
user
|
32
|
-
sys
|
124
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
|
125
|
+
real 0m32.491s
|
126
|
+
user 2m34.767s
|
127
|
+
sys 0m12.733s
|
33
128
|
```
|
34
129
|
|
35
|
-
|
130
|
+
The same with SnpSift v4.0 takes
|
36
131
|
|
37
132
|
```sh
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
133
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > /dev/null
|
134
|
+
real 12m36.121s
|
135
|
+
user 12m53.273s
|
136
|
+
sys 0m9.913s
|
42
137
|
```
|
43
138
|
|
44
|
-
|
45
|
-
|
46
|
-
|
139
|
+
This means that on this machine bio-vcf is 24x faster than SnpSift
|
140
|
+
even for a simple filter. In fact, bio-vcf is perfect for complex
|
141
|
+
filters and parsing large data files on powerful machines. Parsing a
|
142
|
+
650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
|
143
|
+
BED format on a 16 core machine takes
|
47
144
|
|
48
145
|
```sh
|
49
146
|
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
@@ -52,29 +149,38 @@ a 16 core machine takes
|
|
52
149
|
sys 0m5.039s
|
53
150
|
```
|
54
151
|
|
55
|
-
which shows decent core utilisation (10x).
|
56
|
-
gzip compressed VCF files of 30+ Gb
|
152
|
+
which shows decent core utilisation (10x). Running
|
153
|
+
gzip compressed VCF files of 30+ Gb has similar performance gains.
|
57
154
|
|
58
|
-
|
59
|
-
|
155
|
+
To view some complex filters on an 80Gb SNP file check out a
|
156
|
+
[GTEx exercise](https://github.com/vcflib/bio-vcf/blob/master/doc/GTEx_reduce.md).
|
157
|
+
|
158
|
+
Use zcat (or even better pigz which is multi-core itself) to pipe such
|
159
|
+
gzipped (vcf.gz) files into bio-vcf, e.g.
|
60
160
|
|
61
161
|
```sh
|
62
162
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
63
|
-
--sfilter '!s.empty? and s.dp>20'
|
163
|
+
--sfilter '!s.empty? and s.dp>20'
|
64
164
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
65
165
|
```
|
66
166
|
|
67
|
-
bio-vcf comes with a sensible parser definition language
|
68
|
-
|
167
|
+
bio-vcf comes with a sensible parser definition language, an embedded Ragel parser for INFO and
|
168
|
+
FORMAT header definitions, as well as primitives for set analysis. Few
|
69
169
|
assumptions are made about the actual contents of the VCF file (field
|
70
|
-
names are resolved on the fly), so bio-vcf should
|
71
|
-
|
170
|
+
names are resolved on the fly), so bio-vcf should work with all VCF
|
171
|
+
files.
|
72
172
|
|
73
|
-
To fetch all entries where all samples have depth larger than 20
|
74
|
-
a sample filter
|
173
|
+
To fetch all entries where all samples have depth larger than 20 and
|
174
|
+
filter set to PASS use a sample filter
|
75
175
|
|
76
176
|
```ruby
|
77
|
-
bio-vcf --sfilter 'sample.dp>20' < file.vcf
|
177
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter=="PASS"' < file.vcf
|
178
|
+
```
|
179
|
+
|
180
|
+
or with a regex
|
181
|
+
|
182
|
+
```ruby
|
183
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter !~ /LowQD/' < file.vcf
|
78
184
|
```
|
79
185
|
|
80
186
|
To only filter on some samples number 0 and 3:
|
@@ -87,7 +193,7 @@ Where 's.dp' is the shorter name for 'sample.dp'.
|
|
87
193
|
|
88
194
|
It is also possible to specify sample names, or info fields:
|
89
195
|
|
90
|
-
For example, to filter somatic data
|
196
|
+
For example, to filter somatic data
|
91
197
|
|
92
198
|
```ruby
|
93
199
|
bio-vcf --filter 'rec.info.dp>5 and rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
|
@@ -140,7 +246,8 @@ bio-vcf -i --seval 's.ad[1]'
|
|
140
246
|
1 10303 25 31 28 32 17 23 22
|
141
247
|
```
|
142
248
|
|
143
|
-
To calculate alt frequencies from s.ad
|
249
|
+
To calculate percentage non-reference (PNR) alt frequencies from s.ad
|
250
|
+
which is sample (alt dp)/(ref dp + alt dp)
|
144
251
|
|
145
252
|
```ruby
|
146
253
|
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
@@ -214,30 +321,64 @@ The VCF format is commonly used for variant calling between NGS
|
|
214
321
|
samples. The fast parser needs to carry some state, recorded for each
|
215
322
|
file in VcfHeader, which contains the VCF file header. Individual
|
216
323
|
lines (variant calls) first go through a raw parser returning an array
|
217
|
-
of fields. Further (lazy) parsing is handled through VcfRecord.
|
324
|
+
of fields. Further (lazy) parsing is handled through VcfRecord.
|
218
325
|
|
219
326
|
At this point the filter is pretty generic with multi-sample support.
|
220
327
|
If something is not working, check out the feature descriptions and
|
221
328
|
the source code. It is not hard to add features. Otherwise, send a short
|
222
329
|
example of a VCF statement you need to work on.
|
223
330
|
|
224
|
-
##
|
331
|
+
## Install
|
332
|
+
|
333
|
+
Requirements:
|
225
334
|
|
226
|
-
|
227
|
-
a performance improvement. Bio-vcf will show the Ruby version when
|
228
|
-
typing the command 'bio-vcf -h'.
|
335
|
+
* ruby
|
229
336
|
|
230
|
-
To
|
337
|
+
To install bio-vcf with Ruby gems, install Ruby first, e.g. on Debian
|
338
|
+
(as root)
|
339
|
+
|
340
|
+
```sh
|
341
|
+
apt-get install ruby
|
342
|
+
```
|
343
|
+
|
344
|
+
Installing ruby includes the `gem` command to install bio-vcf:
|
231
345
|
|
232
346
|
```sh
|
233
347
|
gem install bio-vcf
|
348
|
+
export PATH=/usr/local/bin:$PATH
|
234
349
|
bio-vcf -h
|
235
350
|
```
|
236
351
|
|
237
|
-
|
352
|
+
displays the help
|
353
|
+
|
354
|
+
```
|
355
|
+
bio-vcf x.x (biogem Ruby with pcows) by Pjotr Prins 2015-2020
|
356
|
+
Usage: bio-vcf [options] filename
|
357
|
+
e.g. bio-vcf < test/data/input/somaticsniper.vcf
|
358
|
+
-i, --ignore-missing Ignore missing data
|
359
|
+
--filter cmd Evaluate filter on each record
|
360
|
+
(etc.)
|
361
|
+
```
|
362
|
+
|
363
|
+
To install without root you may install a gem locally with
|
238
364
|
|
239
365
|
```sh
|
240
|
-
gem install
|
366
|
+
gem install --install-dir ~/bio-vcf bio-vcf
|
367
|
+
```
|
368
|
+
|
369
|
+
and run it with something like
|
370
|
+
|
371
|
+
```sh
|
372
|
+
~/bio-vcf/gems/bio-vcf-0.9.5/bin/bio-vcf -h
|
373
|
+
```
|
374
|
+
|
375
|
+
Finally, it is possible to check out the git repository and simply
|
376
|
+
run the tool with
|
377
|
+
|
378
|
+
```sh
|
379
|
+
git clone https://github.com/vcflib/bio-vcf.git
|
380
|
+
cd bio-vcf
|
381
|
+
ruby ./bin/bio-vcf -h
|
241
382
|
```
|
242
383
|
|
243
384
|
## Command line interface (CLI)
|
@@ -263,56 +404,72 @@ Get the sample names
|
|
263
404
|
NORMAL,TUMOR
|
264
405
|
```
|
265
406
|
|
407
|
+
Alternatively use the command line switch for --names, e.g.
|
408
|
+
|
409
|
+
```ruby
|
410
|
+
bio-vcf --names < file.vcf
|
411
|
+
NORMAL,TUMOR
|
412
|
+
```
|
413
|
+
|
414
|
+
Get information from the header (META)
|
415
|
+
|
416
|
+
```ruby
|
417
|
+
bio-vcf -q --skip-header --eval-once 'header.meta["GATKCommandLine"]' < gatk_exome.vcf
|
418
|
+
```
|
419
|
+
|
266
420
|
The 'fields' array contains unprocessed data (strings). Print first
|
267
421
|
five raw fields
|
268
422
|
|
269
423
|
```ruby
|
270
|
-
bio-vcf --eval 'fields[0..4]' < file.vcf
|
424
|
+
bio-vcf --eval 'fields[0..4]' < file.vcf
|
271
425
|
```
|
272
426
|
|
273
427
|
Add a filter to display the fields on chromosome 12
|
274
428
|
|
275
429
|
```ruby
|
276
|
-
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
430
|
+
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
277
431
|
```
|
278
432
|
|
279
433
|
It gets better when we start using processed data, represented by an
|
280
434
|
object named 'rec'. Position is a value, so we can filter a range
|
281
435
|
|
282
436
|
```ruby
|
283
|
-
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
437
|
+
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
284
438
|
```
|
285
439
|
|
286
440
|
The shorter name for 'rec.chrom' is 'r.chrom', so you may write
|
287
441
|
|
288
442
|
```ruby
|
289
|
-
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
443
|
+
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
290
444
|
```
|
291
445
|
|
292
446
|
To ignore and continue parsing on missing data use the
|
293
447
|
--ignore-missing (-i) and or --quiet (-q) switches
|
294
448
|
|
295
449
|
```ruby
|
296
|
-
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
450
|
+
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
297
451
|
```
|
298
452
|
|
299
453
|
Info fields are referenced by
|
300
454
|
|
301
455
|
```ruby
|
302
|
-
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
456
|
+
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
303
457
|
```
|
304
458
|
|
305
|
-
|
459
|
+
(alternatively you can use the indexed rec.info['DP'] and list INFO fields with
|
460
|
+
rec.info.fields).
|
461
|
+
|
462
|
+
Subfields defined by rec.format:
|
306
463
|
|
307
464
|
```ruby
|
308
|
-
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
465
|
+
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
309
466
|
```
|
310
467
|
|
311
468
|
Output
|
312
469
|
|
313
470
|
```ruby
|
314
|
-
bio-vcf --filter 'rec.tumor.gq>30'
|
315
|
-
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
471
|
+
bio-vcf --filter 'rec.tumor.gq>30'
|
472
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
316
473
|
< file.vcf
|
317
474
|
```
|
318
475
|
|
@@ -326,26 +483,26 @@ Show the count of the bases that were scored as somatic
|
|
326
483
|
Actually, we have a convenience implementation for bcount, so this is the same
|
327
484
|
|
328
485
|
```ruby
|
329
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
486
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
330
487
|
< file.vcf
|
331
488
|
```
|
332
489
|
|
333
490
|
Filter on the somatic results that were scored at least 4 times
|
334
|
-
|
491
|
+
|
335
492
|
```ruby
|
336
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
493
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
337
494
|
```
|
338
495
|
|
339
496
|
Similar for base quality scores
|
340
497
|
|
341
498
|
```ruby
|
342
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
499
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
343
500
|
```
|
344
501
|
|
345
502
|
Filter out on sample values
|
346
503
|
|
347
504
|
```ruby
|
348
|
-
bio-vcf --sfilter 's.dp>20' < test.vcf
|
505
|
+
bio-vcf --sfilter 's.dp>20' < test.vcf
|
349
506
|
```
|
350
507
|
|
351
508
|
To filter missing on samples:
|
@@ -360,6 +517,23 @@ or for all
|
|
360
517
|
bio-vcf --filter "rec.missing_samples?" < file.vcf
|
361
518
|
```
|
362
519
|
|
520
|
+
To set a soft filter, i.e. the filter column is updated
|
521
|
+
|
522
|
+
```sh
|
523
|
+
bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
|
524
|
+
```
|
525
|
+
|
526
|
+
may render something like
|
527
|
+
|
528
|
+
```
|
529
|
+
1 46527674 4 LowQD
|
530
|
+
1 108417572 4 LowQD
|
531
|
+
1 155449089 4 LowQD
|
532
|
+
1 169847826 4 LowQD
|
533
|
+
1 203098164 3 LowQD
|
534
|
+
2 39213209 4 LowQD
|
535
|
+
```
|
536
|
+
|
363
537
|
Likewise you can check for record validity
|
364
538
|
|
365
539
|
```sh
|
@@ -410,17 +584,17 @@ Even shorter r is an alias for rec
|
|
410
584
|
Note: special functions are not yet implemented! Look below
|
411
585
|
for genotype processing which has indexing in 'gti'.
|
412
586
|
|
413
|
-
Sometime you want to use a special function in a filter. For
|
414
|
-
example percentage variant reads can be defined as [a,c,g,t]
|
415
|
-
with frequencies against sample read depth (dp) as
|
416
|
-
[0,0.03,0.47,0.50]. Filtering would with a special function,
|
587
|
+
Sometimes you want to use a special function in a filter. For
|
588
|
+
example percentage variant reads can be defined as [a,c,g,t]
|
589
|
+
with frequencies against sample read depth (dp) as
|
590
|
+
[0,0.03,0.47,0.50]. Filtering would then use a special function,
|
417
591
|
which we named freq
|
418
592
|
|
419
593
|
```sh
|
420
594
|
bio-vcf --sfilter "s.freq(2)>0.30" < file.vcf
|
421
595
|
```
|
422
596
|
|
423
|
-
which is equal to
|
597
|
+
which is equal to
|
424
598
|
|
425
599
|
```sh
|
426
600
|
bio-vcf --sfilter "s.freq.g>0.30" < file.vcf
|
@@ -440,7 +614,7 @@ ref should always be identical across samples.
|
|
440
614
|
|
441
615
|
## DbSNP
|
442
616
|
|
443
|
-
One clinical variant DbSNP example
|
617
|
+
One clinical variant DbSNP example
|
444
618
|
|
445
619
|
```sh
|
446
620
|
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]' < clinvar_20140303.vcf
|
@@ -465,16 +639,16 @@ renders
|
|
465
639
|
|
466
640
|
bio-vcf allows for set analysis. With the complement filter, for
|
467
641
|
example, samples are selected that evaluate to true, all others should
|
468
|
-
evaluate to false. For this we create three filters, one for all
|
642
|
+
evaluate to false. For this we create three filters, one for all
|
469
643
|
samples that are included (the --ifilter or -if), for all samples that
|
470
644
|
are excluded (the --efilter or -ef) and for any sample (the --sfilter
|
471
645
|
or -sf). So i=include (OR filter), e=exclude and s=any sample (AND
|
472
|
-
filter).
|
646
|
+
filter).
|
473
647
|
|
474
648
|
The equivalent of the union filter is by using the --sfilter, so
|
475
649
|
|
476
650
|
```sh
|
477
|
-
bio-vcf --sfilter 's.dp>20'
|
651
|
+
bio-vcf --sfilter 's.dp>20'
|
478
652
|
```
|
479
653
|
|
480
654
|
Filters DP on all samples and is true if all samples match the
|
@@ -482,7 +656,7 @@ criterium (AND). To filter on a subset you can add a
|
|
482
656
|
selector
|
483
657
|
|
484
658
|
```sh
|
485
|
-
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
659
|
+
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
486
660
|
```
|
487
661
|
|
488
662
|
For set analysis there are the additional ifilter (include) and
|
@@ -502,7 +676,7 @@ values
|
|
502
676
|
|
503
677
|
The equivalent of the complement filter is by specifying what samples
|
504
678
|
to include, here with a regex and define filters on the included
|
505
|
-
and excluded samples (the ones not in ifilter-samples) and the
|
679
|
+
and excluded samples (the ones not in ifilter-samples) and the
|
506
680
|
|
507
681
|
```sh
|
508
682
|
./bin/bio-vcf -i --sfilter 's.dp>20' --ifilter-samples 2,4 --ifilter 's.gt==r.s1t1.gt'
|
@@ -523,7 +697,7 @@ To print out the GT's add --seval
|
|
523
697
|
To set an additional filter on the excluded samples:
|
524
698
|
|
525
699
|
```sh
|
526
|
-
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
700
|
+
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
527
701
|
```
|
528
702
|
|
529
703
|
Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
|
@@ -536,15 +710,15 @@ In the near future it is also possible to select samples on a regex (here
|
|
536
710
|
select all samples where the name starts with s3)
|
537
711
|
|
538
712
|
```sh
|
539
|
-
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
713
|
+
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
540
714
|
```
|
541
715
|
|
542
716
|
```sh
|
543
|
-
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
717
|
+
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
544
718
|
--set-intersect include=true
|
545
|
-
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
719
|
+
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
546
720
|
--set-catesian one in include=true, rest=false
|
547
|
-
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
721
|
+
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
548
722
|
```
|
549
723
|
|
550
724
|
With the filter commands you can use --ignore-missing to skip errors.
|
@@ -567,7 +741,7 @@ results in a string value
|
|
567
741
|
to access components of the genotype field we can use standard Ruby
|
568
742
|
|
569
743
|
```ruby
|
570
|
-
bio-vcf --seval 's.gt.split(/\//)[0]'
|
744
|
+
bio-vcf --seval 's.gt.split(/\//)[0]'
|
571
745
|
1 10665 . . 0 0 . 0 0
|
572
746
|
1 10694 . . 1 1 . . .
|
573
747
|
1 12783 0 0 0 0 0 0 0
|
@@ -578,7 +752,7 @@ or special functions, such as 'gti' which gives the genotype as an
|
|
578
752
|
indexed value array
|
579
753
|
|
580
754
|
```ruby
|
581
|
-
bio-vcf --seval 's.gti[0]'
|
755
|
+
bio-vcf --seval 's.gti[0]'
|
582
756
|
1 10665 0 0 0 0
|
583
757
|
1 10694 1 1
|
584
758
|
1 12783 0 0 0 0 0 0 0
|
@@ -588,7 +762,7 @@ indexed value array
|
|
588
762
|
and 'gts' as a nucleotide string array
|
589
763
|
|
590
764
|
```ruby
|
591
|
-
bio-vcf --seval 's.gts
|
765
|
+
bio-vcf --seval 's.gts'
|
592
766
|
1 10665 C C C C
|
593
767
|
1 10694 G G
|
594
768
|
1 12783 G G G G G G G
|
@@ -597,6 +771,28 @@ and 'gts' as a nucleotide string array
|
|
597
771
|
|
598
772
|
where gts represents the indexed genotype on [ref] + [alt].
|
599
773
|
|
774
|
+
To convert combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
|
775
|
+
1/1 -> 2, is useful for indexed fields giving information on, for
|
776
|
+
example, significance. To do so, use
|
777
|
+
|
778
|
+
```ruby
|
779
|
+
bio-vcf --seval '!s.empty? and s.gtindex'
|
780
|
+
11 58949455 0 1
|
781
|
+
11 65481082 0 1
|
782
|
+
11 94180424 0 1
|
783
|
+
11 121036021 0 1
|
784
|
+
```
|
785
|
+
|
786
|
+
Now you can index other fields, e.g. GL
|
787
|
+
|
788
|
+
```ruby
|
789
|
+
./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
|
790
|
+
1 900057 1.0 1.0 0.994 1.0 1.0 -1 0.999 1.0 0.997 -1 0.994 0.989 -1 0.991 -1 0.972 0.992 1.0
|
791
|
+
```
|
792
|
+
|
793
|
+
shows a number of SNPs have been scored with high significance and a
|
794
|
+
number are missing, here marked as -1.
|
795
|
+
|
600
796
|
These values can also be used in filters and output allele depth, for
|
601
797
|
example
|
602
798
|
|
@@ -618,6 +814,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
|
|
618
814
|
1 13757 47 47 4 47 47 4 47
|
619
815
|
```
|
620
816
|
|
817
|
+
## Sample counting
|
818
|
+
|
819
|
+
Note, the use of lambda allows for sophisticated queries. You may need
|
820
|
+
some expert advice here.
|
821
|
+
|
822
|
+
To count valid genotype fields in samples you can do something like
|
823
|
+
|
824
|
+
```ruby
|
825
|
+
bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
|
826
|
+
```
|
827
|
+
|
828
|
+
A similar complex count would be
|
829
|
+
|
830
|
+
```ruby
|
831
|
+
bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
|
832
|
+
```
|
833
|
+
|
834
|
+
which tests for perfect SNPs scored (for example).
|
835
|
+
|
836
|
+
## Reorder filter with lambda
|
837
|
+
|
838
|
+
Sometimes it pays to reorder the filter using a lambda. This is one
|
839
|
+
example where the greedy sample counts are done only for those
|
840
|
+
samples that match the other criteria:
|
841
|
+
|
842
|
+
```ruby
|
843
|
+
./bin/bio-vcf --num-threads=1 --filter '(r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false'
|
844
|
+
```
|
621
845
|
|
622
846
|
## Modify VCF files
|
623
847
|
|
@@ -633,6 +857,17 @@ To remove/select 3 samples:
|
|
633
857
|
bio-vcf --samples 0,1,3 < mytest.vcf
|
634
858
|
```
|
635
859
|
|
860
|
+
You can also select samples by name (as long as they do not contain
|
861
|
+
spaces)
|
862
|
+
|
863
|
+
|
864
|
+
```sh
|
865
|
+
bio-vcf --names < mytest.vcf
|
866
|
+
Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
867
|
+
bio-vcf --samples "Original,s1t1,s3t1" < mytest.vcf
|
868
|
+
```
|
869
|
+
|
870
|
+
|
636
871
|
Filter on a BED file and annotate the gene name in the resulting VCF
|
637
872
|
|
638
873
|
```sh
|
@@ -679,11 +914,11 @@ Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert t
|
|
679
914
|
|
680
915
|
## Templates
|
681
916
|
|
682
|
-
To have more output options
|
917
|
+
To have more output options bio-vcf can use an [ERB
|
683
918
|
template](http://www.stuartellis.eu/articles/erb/) for every match. This is a
|
684
919
|
very flexible option that can output textual formats such as JSON, YAML, HTML
|
685
920
|
and RDF. Examples are provided in
|
686
|
-
[./templates](https://github.com/
|
921
|
+
[./template](https://github.com/vcflib/bio-vcf/tree/master/template/). A JSON
|
687
922
|
template could be
|
688
923
|
|
689
924
|
```Javascript
|
@@ -693,11 +928,11 @@ template could be
|
|
693
928
|
"seq:ref": "<%= rec.ref %>" ,
|
694
929
|
"seq:alt": "<%= rec.alt[0] %>" ,
|
695
930
|
"seq:maf": <%= rec.info.maf[0] %> ,
|
696
|
-
"dp": <%= rec.info.dp %>
|
931
|
+
"dp": <%= rec.info.dp %>
|
697
932
|
};
|
698
933
|
```
|
699
934
|
|
700
|
-
To get JSON, run with something like (combining
|
935
|
+
To get JSON, run with something like (combining
|
701
936
|
with a filter)
|
702
937
|
|
703
938
|
```sh
|
@@ -713,7 +948,7 @@ which renders
|
|
713
948
|
"seq:ref": "C" ,
|
714
949
|
"seq:alt": "T" ,
|
715
950
|
"seq:maf": 0.0151 ,
|
716
|
-
"dp": 86
|
951
|
+
"dp": 86
|
717
952
|
};
|
718
953
|
```
|
719
954
|
|
@@ -723,11 +958,11 @@ Likewise for RDF output:
|
|
723
958
|
bio-vcf --template template/vcf2rdf.erb --filter 'r.info.sao==1' < dbsnp.vcf
|
724
959
|
```
|
725
960
|
|
726
|
-
renders the ERB template
|
961
|
+
renders the ERB template
|
727
962
|
|
728
963
|
```ruby
|
729
964
|
<%
|
730
|
-
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
965
|
+
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
731
966
|
%>
|
732
967
|
:<%= id %>
|
733
968
|
:query_id "<%= id %>",
|
@@ -740,7 +975,7 @@ renders the ERB template
|
|
740
975
|
db:vcf true .
|
741
976
|
```
|
742
977
|
|
743
|
-
into
|
978
|
+
into
|
744
979
|
|
745
980
|
```
|
746
981
|
:ch13_33703698_A
|
@@ -765,50 +1000,56 @@ can be
|
|
765
1000
|
```Javascript
|
766
1001
|
=HEADER
|
767
1002
|
<% require 'json' %>
|
768
|
-
|
769
|
-
{ "HEADER": {
|
1003
|
+
{ "HEADER": {
|
770
1004
|
"options": <%= options.to_h.to_json %>,
|
771
1005
|
"files": <%= ARGV %>,
|
772
1006
|
"version": "<%= BIOVCF_VERSION %>"
|
773
1007
|
},
|
774
|
-
|
1008
|
+
"BODY":[
|
775
1009
|
=BODY
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
},
|
1010
|
+
{
|
1011
|
+
"seq:chr": "<%= rec.chrom %>" ,
|
1012
|
+
"seq:pos": <%= rec.pos %> ,
|
1013
|
+
"seq:ref": "<%= rec.ref %>" ,
|
1014
|
+
"seq:alt": "<%= rec.alt[0] %>" ,
|
1015
|
+
"dp": <%= rec.info.dp %>
|
1016
|
+
},
|
784
1017
|
=FOOTER
|
785
|
-
]
|
1018
|
+
]
|
1019
|
+
}
|
1020
|
+
```
|
1021
|
+
|
1022
|
+
with
|
1023
|
+
|
1024
|
+
```sh
|
1025
|
+
bio-vcf --template template/vcf2json.erb < dbsnp.vcf
|
786
1026
|
```
|
787
1027
|
|
788
1028
|
may generate something like
|
789
1029
|
|
790
1030
|
```Javascript
|
791
|
-
|
792
|
-
{ "HEADER": {
|
1031
|
+
{ "HEADER": {
|
793
1032
|
"options": {"show_help":false,"source":"https://github.com/CuppenResearch/bioruby-vcf","version":"0.8.1-pre3 (Pjotr Prins)","date":"2014-11-26 12:51:36 +0000","thread_lines":40000,"template":"template/vcf2json.erb","skip_header":true},
|
794
1033
|
"files": [],
|
795
1034
|
"version": "0.8.1-pre3"
|
796
1035
|
},
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
1036
|
+
"BODY":[
|
1037
|
+
{
|
1038
|
+
"seq:chr": "1" ,
|
1039
|
+
"seq:pos": 883516 ,
|
1040
|
+
"seq:ref": "G" ,
|
1041
|
+
"seq:alt": "A" ,
|
1042
|
+
"dp":
|
1043
|
+
},
|
1044
|
+
{
|
1045
|
+
"seq:chr": "1" ,
|
1046
|
+
"seq:pos": 891344 ,
|
1047
|
+
"seq:ref": "G" ,
|
1048
|
+
"seq:alt": "A" ,
|
1049
|
+
"dp": ,
|
1050
|
+
},
|
1051
|
+
]
|
1052
|
+
}
|
812
1053
|
```
|
813
1054
|
|
814
1055
|
Note that the template is not smart enough to remove the final comma
|
@@ -816,6 +1057,19 @@ from the last BODY element. To make it valid JSON that needs to be
|
|
816
1057
|
removed. A future version may add a parameter to the BODY element or a
|
817
1058
|
global rewrite function for this purpose. YAML and RDF have no such issue.
|
818
1059
|
|
1060
|
+
### Using full VCF header (meta) info
|
1061
|
+
|
1062
|
+
To get and put the full information from the header, simply use
|
1063
|
+
vcf.meta.to_json. See ./template/vcf2json_full_header.erb for an
|
1064
|
+
example. This meta information can also be used to output info fields
|
1065
|
+
and sample values on the fly! For an example, see the template at
|
1066
|
+
[./template/vcf2json_use_meta.erb](https://github.com/vcflib/bio-vcf/tree/master/template/vcf2json_use_meta.erb)
|
1067
|
+
and the generated output at
|
1068
|
+
[./test/data/regression/vcf2json_use_meta.ref](https://github.com/vcflib/bio-vcf/tree/master/test/data/regression/vcf2json_use_meta.ref).
|
1069
|
+
|
1070
|
+
This way, it is possible to write templates that can convert the content of
|
1071
|
+
*any* VCF file without prior knowledge to JSON, RDF, etc.
|
1072
|
+
|
819
1073
|
## Statistics
|
820
1074
|
|
821
1075
|
Simple statistics are available for REF>ALT changes:
|
@@ -828,7 +1082,7 @@ Simple statistics are available for REF>ALT changes:
|
|
828
1082
|
G>A 59 45%
|
829
1083
|
C>T 30 23%
|
830
1084
|
A>G 5 4%
|
831
|
-
C>G 5 4%
|
1085
|
+
C>G 5 4%
|
832
1086
|
C>A 5 4%
|
833
1087
|
G>T 4 3%
|
834
1088
|
T>C 4 3%
|
@@ -848,7 +1102,10 @@ Simple statistics are available for REF>ALT changes:
|
|
848
1102
|
|
849
1103
|
## Other examples
|
850
1104
|
|
851
|
-
For more examples see
|
1105
|
+
For more exercises and examples see the
|
1106
|
+
[doc](https://github.com/vcflib/bio-vcf/tree/master/doc) directory
|
1107
|
+
and the feature
|
1108
|
+
[section](https://github.com/vcflib/bio-vcf/tree/master/features).
|
852
1109
|
|
853
1110
|
## API
|
854
1111
|
|
@@ -879,27 +1136,109 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
879
1136
|
end
|
880
1137
|
```
|
881
1138
|
|
1139
|
+
### VCFFile
|
1140
|
+
|
1141
|
+
The class ```BioVcf::VCFfile``` wraps a file and provides an ```enum``` with the
|
1142
|
+
method each, that can be used as an iterator.
|
1143
|
+
|
1144
|
+
```ruby
|
1145
|
+
vcf_file = "dbsnp.vcf"
|
1146
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: false )
|
1147
|
+
it = vcf.each
|
1148
|
+
puts it.peek
|
1149
|
+
|
1150
|
+
vcf_file = "dbsnp.vcf.gz"
|
1151
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: true )
|
1152
|
+
it = vcf.each
|
1153
|
+
puts it.peek
|
1154
|
+
```
|
1155
|
+
|
882
1156
|
## Trouble shooting
|
883
1157
|
|
1158
|
+
### MRI supports threading
|
1159
|
+
|
884
1160
|
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
885
1161
|
in single threaded mode (for now).
|
886
1162
|
|
1163
|
+
### Set TMPDIR when running out of space
|
1164
|
+
|
887
1165
|
The multi-threading creates temporary files using the system TMPDIR.
|
888
1166
|
This behaviour can be overridden by setting the environment variable.
|
889
|
-
|
890
|
-
|
1167
|
+
|
1168
|
+
### Reorder filter on time out
|
1169
|
+
|
1170
|
+
Make sure to minimize expensive calculations by moving them
|
1171
|
+
backward. An 'and' statement is evaluated from left to right. With
|
1172
|
+
|
1173
|
+
```ruby
|
1174
|
+
fast_check and slow_check
|
1175
|
+
```
|
1176
|
+
|
1177
|
+
slow_check only gets executed if fast_check is true.
|
1178
|
+
|
1179
|
+
For more complex filters use lambda inside a conditional
|
1180
|
+
|
1181
|
+
```ruby
|
1182
|
+
( fast_check ? lambda { slow_check }.call : false )
|
1183
|
+
```
|
1184
|
+
|
1185
|
+
where slow_check is the slow section of your query, as shown
|
1186
|
+
earlier in this document. Don't forget the .call!
|
1187
|
+
|
1188
|
+
### Reduce thread lines on timeout
|
1189
|
+
|
1190
|
+
Depending on your input data and the speed of the filters it may be useful to
|
1191
|
+
tweak the number of thread lines and/or to increase the timeout.
|
1192
|
+
|
1193
|
+
On really fast file systems for genome-wide sequencing try increasing
|
1194
|
+
--thread-lines to a value larger than 100_000. On the other hand if
|
1195
|
+
the computations are intensive (per line) reduce the number of
|
1196
|
+
thread-lines (try 10_000 and 1_000). If processes get killed that is
|
1197
|
+
the one to try.
|
1198
|
+
|
1199
|
+
For larger files set the timeout to 600, or so. --timeout 600.
|
1200
|
+
|
1201
|
+
Different values may show different core use on a machine.
|
1202
|
+
|
1203
|
+
### Development
|
1204
|
+
|
1205
|
+
To run the tests from source
|
1206
|
+
|
1207
|
+
```sh
|
1208
|
+
bundle install --path vendor/bundle
|
1209
|
+
bundle exec rake
|
1210
|
+
```
|
1211
|
+
|
1212
|
+
Note: we develop in a GNU Guix environment, see the header of
|
1213
|
+
[guix.scm](guix.scm) which does not use bundler.
|
1214
|
+
|
1215
|
+
### Debugging
|
1216
|
+
|
1217
|
+
To debug output use '-v --num-threads=1' for generating useful
|
1218
|
+
output. Also do not use the -i switch (ignore errors) when there
|
1219
|
+
are problems.
|
1220
|
+
|
1221
|
+
### Could not find rake-10.4.2 in any of the sources
|
1222
|
+
|
1223
|
+
Remove Gemfile.lock before running other tools.
|
1224
|
+
|
1225
|
+
### Tmpdir contains (old) bio-vcf directories
|
1226
|
+
|
1227
|
+
Multi-threaded bio-vcf writes into a temporary directory during
|
1228
|
+
processing. When a process gets interrupted for some reason the
|
1229
|
+
temporary directory may remain.
|
891
1230
|
|
892
1231
|
## Project home page
|
893
1232
|
|
894
1233
|
For information on the source tree, documentation, examples, issues and
|
895
1234
|
how to contribute, see
|
896
1235
|
|
897
|
-
http://github.com/
|
1236
|
+
http://github.com/vcflib/bio-vcf
|
898
1237
|
|
899
1238
|
## Cite
|
900
1239
|
|
901
1240
|
If you use this software, please cite one of
|
902
|
-
|
1241
|
+
|
903
1242
|
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
904
1243
|
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
905
1244
|
|
@@ -909,5 +1248,4 @@ This Biogem is published at (http://biogems.info/index.html#bio-vcf)
|
|
909
1248
|
|
910
1249
|
## Copyright
|
911
1250
|
|
912
|
-
Copyright (c) 2014 Pjotr Prins. See LICENSE.txt for further details.
|
913
|
-
|
1251
|
+
Copyright (c) 2014-2020 Pjotr Prins. See LICENSE.txt for further details.
|