bio-vcf 0.8.1 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +2 -8
- data/LICENSE.txt +1 -1
- data/README.md +467 -129
- data/RELEASE_NOTES.md +27 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +9 -42
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +156 -108
- data/bio-vcf.gemspec +13 -75
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +12 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +1 -11
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
- data/lib/bio-vcf/vcfheader.rb +137 -5
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +26 -2
- data/lib/regressiontest.rb +11 -0
- data/lib/regressiontest/cli_exec.rb +101 -0
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +16 -16
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +63 -64
- data/Gemfile.lock +0 -81
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 814f6cb6c8bc237fd08ab4f22f2bfea514525fc6c2fd1b9081cd314bfa3c2fd2
|
4
|
+
data.tar.gz: 194aa006ac5c46c157360e37e98e42dbf88d121f9c64983792e8a7cfe43d0142
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cad19fa108652d42aaaff95296176a21781813848a1dacae879cd44fcb324b9f38f763fc1e516ce41cb492433457c17d27e4aca0f64308758594fb68c0abadf
|
7
|
+
data.tar.gz: 5290a23fe85fe063b6fb8606c77cca3abd3df3af624ac9da2ac3348336bbfafa7e8b3d70d2c5c3ba57a55da75a052f1fa2c1d20212ba7b4e656ac7263f5f1af0
|
data/.travis.yml
CHANGED
@@ -1,13 +1,3 @@
|
|
1
1
|
language: ruby
|
2
|
-
rvm:
|
3
|
-
# - 1.9.3 <- No longer working
|
4
|
-
- 2.0.0
|
5
|
-
- 2.1.0
|
6
|
-
# - jruby-head
|
7
|
-
# - jruby-19mode # JRuby in 1.9 mode
|
8
|
-
# - 1.8.7
|
9
|
-
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
-
# - rbx-18mode
|
11
2
|
|
12
|
-
|
13
|
-
# script: bundle exec rspec spec
|
3
|
+
arch: arm64
|
data/Gemfile
CHANGED
@@ -1,15 +1,9 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
-
# Add dependencies required to use your gem here.
|
3
|
-
# Example:
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
5
2
|
|
6
|
-
# Add dependencies to develop your gem here.
|
7
|
-
# Include everything needed to run rake, tests, features, etc.
|
8
3
|
group :development do
|
9
|
-
|
4
|
+
gem "rake"
|
10
5
|
gem "rspec"
|
11
6
|
gem "cucumber"
|
12
|
-
gem "jeweler", "~> 2.0.1" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
-
gem "regressiontest", "~> 0.0.3"
|
14
7
|
end
|
15
8
|
|
9
|
+
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,49 +1,146 @@
|
|
1
1
|
# bio-vcf
|
2
2
|
|
3
|
-
[](http://travis-ci.org/vcflib/bio-vcf) [](http://rubygems.org/gems/bio-vcf "Install with Rubygems") [](https://anaconda.org/bioconda/bio-vcf) [](https://anaconda.org/bioconda/bio-vcf)
|
4
|
+
[](https://packages.debian.org/testing/bio-vcf)
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
Quick index:
|
7
|
+
|
8
|
+
- [Install](#install)
|
9
|
+
- [Command line interface (CLI)](#command-line-interface-cli)
|
10
|
+
+ [Set analysis](#set-analysis)
|
11
|
+
+ [Genotype processing](#genotype-processing)
|
12
|
+
+ [Sample counting](#sample-counting)
|
13
|
+
+ [Filter with lambda](#reorder-filter-with-lambda)
|
14
|
+
+ [Modify VCF files](#modify-vcf-files)
|
15
|
+
+ [RDF output](#rdf-output)
|
16
|
+
- [Templates](#templates)
|
17
|
+
- [Metadata](#metadata)
|
18
|
+
- [Statistics](#statistics)
|
19
|
+
- [API](#api)
|
20
|
+
- [Cite](#cite)
|
21
|
+
|
22
|
+
|
23
|
+
## Bio-vcf
|
24
|
+
|
25
|
+
Bio-vcf provides a domain specific language (DSL) for processing the
|
26
|
+
VCF format. Record named fields can be queried with regular
|
27
|
+
expressions, e.g.
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
sample.dp>20 and rec.filter !~ /LowQD/ and rec.tumor.bcount[rec.alt]>4
|
31
|
+
```
|
32
|
+
|
33
|
+
Bio-vcf is a new generation VCF parser, filter and converter. Bio-vcf
|
34
|
+
is not only very fast for genome-wide (WGS) data, it also comes with a
|
35
|
+
really nice filtering, evaluation and rewrite language and it can
|
36
|
+
output any type of textual data, including VCF header and contents in
|
37
|
+
RDF and JSON.
|
38
|
+
|
39
|
+
So, why would you use bio-vcf over other parsers? Because
|
9
40
|
|
10
41
|
1. Bio-vcf is fast and scales on multi-core computers
|
11
42
|
2. Bio-vcf has an expressive filtering and evaluation language
|
12
43
|
3. Bio-vcf has great multi-sample support
|
13
44
|
4. Bio-vcf has multiple global filters and sample filters
|
14
45
|
5. Bio-vcf can access any VCF format
|
15
|
-
6. Bio-vcf can
|
16
|
-
7. Bio-vcf
|
17
|
-
8. Bio-vcf
|
18
|
-
9. Bio-vcf has
|
19
|
-
10. Bio-vcf
|
46
|
+
6. Bio-vcf can parse and query the VCF header (META data)
|
47
|
+
7. Bio-vcf can do calculations on fields
|
48
|
+
8. Bio-vcf allows for genotype processing
|
49
|
+
9. Bio-vcf has support for set analysis
|
50
|
+
10. Bio-vcf has sane error handling
|
51
|
+
11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
|
52
|
+
12. Bio-vcf has soft filters
|
53
|
+
|
54
|
+
Some examples are documented for [reducing GTEx](doc/GTEx_reduce.md),
|
55
|
+
[comparing GATK](doc/GATK_comparison.md), [comparing
|
56
|
+
VCFs](doc/Compare_VCFs.md), JSON [loading Mongo
|
57
|
+
database](doc/Using_Mongo.md), and [generating RDF](doc/Using_RDF.md).
|
58
|
+
|
59
|
+
## Options
|
60
|
+
|
61
|
+
In true Unix fashion files can be piped in or passed on the command
|
62
|
+
line:
|
63
|
+
|
64
|
+
bio-vcf --help
|
65
|
+
|
66
|
+
```
|
67
|
+
bio-vcf (biogem with pcows) by Pjotr Prins 2015-2020
|
68
|
+
|
69
|
+
Usage: bio-vcf [options] filename
|
70
|
+
e.g. bio-vcf < test/data/input/somaticsniper.vcf
|
71
|
+
-i, --ignore-missing Ignore missing data
|
72
|
+
--filter cmd Evaluate filter on each record
|
73
|
+
--sfilter cmd Evaluate filter on each sample
|
74
|
+
--sfilter-samples list Filter on selected samples (e.g., 0,1
|
75
|
+
--ifilter, --if cmd Include filter
|
76
|
+
--ifilter-samples list Include set - implicitely defines exclude set
|
77
|
+
--efilter, --ef cmd Exclude filter
|
78
|
+
--efilter-samples list Exclude set - overrides exclude set
|
79
|
+
--add-filter name Set/add filter field to name
|
80
|
+
--bed bedfile Filter on BED elements
|
81
|
+
-e, --eval cmd Evaluate command on each record
|
82
|
+
--eval-once cmd Evaluate command once (usually for header info)
|
83
|
+
--seval cmd Evaluate command on each sample
|
84
|
+
--rewrite eval Rewrite INFO
|
85
|
+
--samples list Output selected samples
|
86
|
+
--rdf Generate Turtle RDF (also check out --template!)
|
87
|
+
--num-threads [num] Multi-core version (default ALL)
|
88
|
+
--thread-lines num Fork thread on num lines (default 40000)
|
89
|
+
--skip-header Do not output VCF header info
|
90
|
+
--set-header list Set a special tab delimited output header (#samples expands to sample names)
|
91
|
+
-t, --template erb Use ERB template for output
|
92
|
+
--add-header-tag Add bio-vcf status tag to header output
|
93
|
+
--timeout [num] Timeout waiting for thread to complete (default 180)
|
94
|
+
--names Output sample names
|
95
|
+
--statistics Output statistics
|
96
|
+
-q, --quiet Run quietly
|
97
|
+
-v, --verbose Run verbosely
|
98
|
+
--debug Show debug messages and keep intermediate output
|
99
|
+
|
100
|
+
--id name Identifier
|
101
|
+
--tags list Add tags
|
102
|
+
-h, --help display this help and exit
|
103
|
+
```
|
104
|
+
|
105
|
+
## Performance
|
106
|
+
|
107
|
+
Bio-vcf has better performance than other tools because of lazy
|
108
|
+
parsing, multi-threading, and useful combinations of (fancy) command
|
109
|
+
line filtering. Adding cores, bio-vcf just
|
110
|
+
does better. The more complicated the filters, the larger the
|
111
|
+
gain. First a baseline test to show IO performance
|
20
112
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
113
|
+
```sh
|
114
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
|
115
|
+
1987143 15897724 1003214613
|
116
|
+
real 0m7.823s
|
117
|
+
user 0m7.002s
|
118
|
+
sys 0m2.972s
|
119
|
+
```
|
120
|
+
|
121
|
+
Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
|
27
122
|
|
28
123
|
```sh
|
29
|
-
time
|
30
|
-
real
|
31
|
-
user
|
32
|
-
sys
|
124
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
|
125
|
+
real 0m32.491s
|
126
|
+
user 2m34.767s
|
127
|
+
sys 0m12.733s
|
33
128
|
```
|
34
129
|
|
35
|
-
|
130
|
+
The same with SnpSift v4.0 takes
|
36
131
|
|
37
132
|
```sh
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
133
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > /dev/null
|
134
|
+
real 12m36.121s
|
135
|
+
user 12m53.273s
|
136
|
+
sys 0m9.913s
|
42
137
|
```
|
43
138
|
|
44
|
-
|
45
|
-
|
46
|
-
|
139
|
+
This means that on this machine bio-vcf is 24x faster than SnpSift
|
140
|
+
even for a simple filter. In fact, bio-vcf is perfect for complex
|
141
|
+
filters and parsing large data files on powerful machines. Parsing a
|
142
|
+
650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
|
143
|
+
BED format on a 16 core machine takes
|
47
144
|
|
48
145
|
```sh
|
49
146
|
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
@@ -52,29 +149,38 @@ a 16 core machine takes
|
|
52
149
|
sys 0m5.039s
|
53
150
|
```
|
54
151
|
|
55
|
-
which shows decent core utilisation (10x).
|
56
|
-
gzip compressed VCF files of 30+ Gb
|
152
|
+
which shows decent core utilisation (10x). Running
|
153
|
+
gzip compressed VCF files of 30+ Gb has similar performance gains.
|
57
154
|
|
58
|
-
|
59
|
-
|
155
|
+
To view some complex filters on an 80Gb SNP file check out a
|
156
|
+
[GTEx exercise](https://github.com/vcflib/bio-vcf/blob/master/doc/GTEx_reduce.md).
|
157
|
+
|
158
|
+
Use zcat (or even better pigz which is multi-core itself) to pipe such
|
159
|
+
gzipped (vcf.gz) files into bio-vcf, e.g.
|
60
160
|
|
61
161
|
```sh
|
62
162
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
63
|
-
--sfilter '!s.empty? and s.dp>20'
|
163
|
+
--sfilter '!s.empty? and s.dp>20'
|
64
164
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
65
165
|
```
|
66
166
|
|
67
|
-
bio-vcf comes with a sensible parser definition language
|
68
|
-
|
167
|
+
bio-vcf comes with a sensible parser definition language, an embedded Ragel parser for INFO and
|
168
|
+
FORMAT header definitions, as well as primitives for set analysis. Few
|
69
169
|
assumptions are made about the actual contents of the VCF file (field
|
70
|
-
names are resolved on the fly), so bio-vcf should
|
71
|
-
|
170
|
+
names are resolved on the fly), so bio-vcf should work with all VCF
|
171
|
+
files.
|
72
172
|
|
73
|
-
To fetch all entries where all samples have depth larger than 20
|
74
|
-
a sample filter
|
173
|
+
To fetch all entries where all samples have depth larger than 20 and
|
174
|
+
filter set to PASS use a sample filter
|
75
175
|
|
76
176
|
```ruby
|
77
|
-
bio-vcf --sfilter 'sample.dp>20' < file.vcf
|
177
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter=="PASS"' < file.vcf
|
178
|
+
```
|
179
|
+
|
180
|
+
or with a regex
|
181
|
+
|
182
|
+
```ruby
|
183
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter !~ /LowQD/' < file.vcf
|
78
184
|
```
|
79
185
|
|
80
186
|
To only filter on some samples number 0 and 3:
|
@@ -87,7 +193,7 @@ Where 's.dp' is the shorter name for 'sample.dp'.
|
|
87
193
|
|
88
194
|
It is also possible to specify sample names, or info fields:
|
89
195
|
|
90
|
-
For example, to filter somatic data
|
196
|
+
For example, to filter somatic data
|
91
197
|
|
92
198
|
```ruby
|
93
199
|
bio-vcf --filter 'rec.info.dp>5 and rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
|
@@ -140,7 +246,8 @@ bio-vcf -i --seval 's.ad[1]'
|
|
140
246
|
1 10303 25 31 28 32 17 23 22
|
141
247
|
```
|
142
248
|
|
143
|
-
To calculate alt frequencies from s.ad
|
249
|
+
To calculate percentage non-reference (PNR) alt frequencies from s.ad
|
250
|
+
which is sample (alt dp)/(ref dp + alt dp)
|
144
251
|
|
145
252
|
```ruby
|
146
253
|
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
@@ -214,30 +321,64 @@ The VCF format is commonly used for variant calling between NGS
|
|
214
321
|
samples. The fast parser needs to carry some state, recorded for each
|
215
322
|
file in VcfHeader, which contains the VCF file header. Individual
|
216
323
|
lines (variant calls) first go through a raw parser returning an array
|
217
|
-
of fields. Further (lazy) parsing is handled through VcfRecord.
|
324
|
+
of fields. Further (lazy) parsing is handled through VcfRecord.
|
218
325
|
|
219
326
|
At this point the filter is pretty generic with multi-sample support.
|
220
327
|
If something is not working, check out the feature descriptions and
|
221
328
|
the source code. It is not hard to add features. Otherwise, send a short
|
222
329
|
example of a VCF statement you need to work on.
|
223
330
|
|
224
|
-
##
|
331
|
+
## Install
|
332
|
+
|
333
|
+
Requirements:
|
225
334
|
|
226
|
-
|
227
|
-
a performance improvement. Bio-vcf will show the Ruby version when
|
228
|
-
typing the command 'bio-vcf -h'.
|
335
|
+
* ruby
|
229
336
|
|
230
|
-
To
|
337
|
+
To install bio-vcf with Ruby gems, install Ruby first, e.g. on Debian
|
338
|
+
(as root)
|
339
|
+
|
340
|
+
```sh
|
341
|
+
apt-get install ruby
|
342
|
+
```
|
343
|
+
|
344
|
+
Installing ruby includes the `gem` command to install bio-vcf:
|
231
345
|
|
232
346
|
```sh
|
233
347
|
gem install bio-vcf
|
348
|
+
export PATH=/usr/local/bin:$PATH
|
234
349
|
bio-vcf -h
|
235
350
|
```
|
236
351
|
|
237
|
-
|
352
|
+
displays the help
|
353
|
+
|
354
|
+
```
|
355
|
+
bio-vcf x.x (biogem Ruby with pcows) by Pjotr Prins 2015-2020
|
356
|
+
Usage: bio-vcf [options] filename
|
357
|
+
e.g. bio-vcf < test/data/input/somaticsniper.vcf
|
358
|
+
-i, --ignore-missing Ignore missing data
|
359
|
+
--filter cmd Evaluate filter on each record
|
360
|
+
(etc.)
|
361
|
+
```
|
362
|
+
|
363
|
+
To install without root you may install a gem locally with
|
238
364
|
|
239
365
|
```sh
|
240
|
-
gem install
|
366
|
+
gem install --install-dir ~/bio-vcf bio-vcf
|
367
|
+
```
|
368
|
+
|
369
|
+
and run it with something like
|
370
|
+
|
371
|
+
```sh
|
372
|
+
~/bio-vcf/gems/bio-vcf-0.9.4/bin/bio-vcf -h
|
373
|
+
```
|
374
|
+
|
375
|
+
Finally, it is possible to check out the git repository and simply
|
376
|
+
run the tool with
|
377
|
+
|
378
|
+
```sh
|
379
|
+
git clone https://github.com/vcflib/bio-vcf.git
|
380
|
+
cd bio-vcf
|
381
|
+
ruby ./bin/bio-vcf -h
|
241
382
|
```
|
242
383
|
|
243
384
|
## Command line interface (CLI)
|
@@ -263,56 +404,72 @@ Get the sample names
|
|
263
404
|
NORMAL,TUMOR
|
264
405
|
```
|
265
406
|
|
407
|
+
Alternatively use the command line switch for --names, e.g.
|
408
|
+
|
409
|
+
```ruby
|
410
|
+
bio-vcf --names < file.vcf
|
411
|
+
NORMAL,TUMOR
|
412
|
+
```
|
413
|
+
|
414
|
+
Get information from the header (META)
|
415
|
+
|
416
|
+
```ruby
|
417
|
+
bio-vcf -q --skip-header --eval-once 'header.meta["GATKCommandLine"]' < gatk_exome.vcf
|
418
|
+
```
|
419
|
+
|
266
420
|
The 'fields' array contains unprocessed data (strings). Print first
|
267
421
|
five raw fields
|
268
422
|
|
269
423
|
```ruby
|
270
|
-
bio-vcf --eval 'fields[0..4]' < file.vcf
|
424
|
+
bio-vcf --eval 'fields[0..4]' < file.vcf
|
271
425
|
```
|
272
426
|
|
273
427
|
Add a filter to display the fields on chromosome 12
|
274
428
|
|
275
429
|
```ruby
|
276
|
-
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
430
|
+
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
277
431
|
```
|
278
432
|
|
279
433
|
It gets better when we start using processed data, represented by an
|
280
434
|
object named 'rec'. Position is a value, so we can filter a range
|
281
435
|
|
282
436
|
```ruby
|
283
|
-
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
437
|
+
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
284
438
|
```
|
285
439
|
|
286
440
|
The shorter name for 'rec.chrom' is 'r.chrom', so you may write
|
287
441
|
|
288
442
|
```ruby
|
289
|
-
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
443
|
+
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
290
444
|
```
|
291
445
|
|
292
446
|
To ignore and continue parsing on missing data use the
|
293
447
|
--ignore-missing (-i) and/or --quiet (-q) switches
|
294
448
|
|
295
449
|
```ruby
|
296
|
-
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
450
|
+
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
297
451
|
```
|
298
452
|
|
299
453
|
Info fields are referenced by
|
300
454
|
|
301
455
|
```ruby
|
302
|
-
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
456
|
+
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
303
457
|
```
|
304
458
|
|
305
|
-
|
459
|
+
(alternatively you can use the indexed rec.info['DP'] and list INFO fields with
|
460
|
+
rec.info.fields).
|
461
|
+
|
462
|
+
Subfields defined by rec.format:
|
306
463
|
|
307
464
|
```ruby
|
308
|
-
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
465
|
+
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
309
466
|
```
|
310
467
|
|
311
468
|
Output
|
312
469
|
|
313
470
|
```ruby
|
314
|
-
bio-vcf --filter 'rec.tumor.gq>30'
|
315
|
-
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
471
|
+
bio-vcf --filter 'rec.tumor.gq>30'
|
472
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
316
473
|
< file.vcf
|
317
474
|
```
|
318
475
|
|
@@ -326,26 +483,26 @@ Show the count of the bases that were scored as somatic
|
|
326
483
|
Actually, we have a convenience implementation for bcount, so this is the same
|
327
484
|
|
328
485
|
```ruby
|
329
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
486
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
330
487
|
< file.vcf
|
331
488
|
```
|
332
489
|
|
333
490
|
Filter on the somatic results that were scored at least 4 times
|
334
|
-
|
491
|
+
|
335
492
|
```ruby
|
336
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
493
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
337
494
|
```
|
338
495
|
|
339
496
|
Similar for base quality scores
|
340
497
|
|
341
498
|
```ruby
|
342
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
499
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
343
500
|
```
|
344
501
|
|
345
502
|
Filter out on sample values
|
346
503
|
|
347
504
|
```ruby
|
348
|
-
bio-vcf --sfilter 's.dp>20' < test.vcf
|
505
|
+
bio-vcf --sfilter 's.dp>20' < test.vcf
|
349
506
|
```
|
350
507
|
|
351
508
|
To filter missing on samples:
|
@@ -360,6 +517,23 @@ or for all
|
|
360
517
|
bio-vcf --filter "rec.missing_samples?" < file.vcf
|
361
518
|
```
|
362
519
|
|
520
|
+
To set a soft filter, i.e. the filter column is updated
|
521
|
+
|
522
|
+
```sh
|
523
|
+
bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
|
524
|
+
```
|
525
|
+
|
526
|
+
may render something like
|
527
|
+
|
528
|
+
```
|
529
|
+
1 46527674 4 LowQD
|
530
|
+
1 108417572 4 LowQD
|
531
|
+
1 155449089 4 LowQD
|
532
|
+
1 169847826 4 LowQD
|
533
|
+
1 203098164 3 LowQD
|
534
|
+
2 39213209 4 LowQD
|
535
|
+
```
|
536
|
+
|
363
537
|
Likewise you can check for record validity
|
364
538
|
|
365
539
|
```sh
|
@@ -410,17 +584,17 @@ Even shorter r is an alias for rec
|
|
410
584
|
Note: special functions are not yet implemented! Look below
|
411
585
|
for genotype processing which has indexing in 'gti'.
|
412
586
|
|
413
|
-
Sometime you want to use a special function in a filter. For
|
414
|
-
example percentage variant reads can be defined as [a,c,g,t]
|
415
|
-
with frequencies against sample read depth (dp) as
|
416
|
-
[0,0.03,0.47,0.50]. Filtering would with a special function,
|
587
|
+
Sometimes you want to use a special function in a filter. For
|
588
|
+
example percentage variant reads can be defined as [a,c,g,t]
|
589
|
+
with frequencies against sample read depth (dp) as
|
590
|
+
[0,0.03,0.47,0.50]. Filtering would be done with a special function,
|
417
591
|
which we named freq
|
418
592
|
|
419
593
|
```sh
|
420
594
|
bio-vcf --sfilter "s.freq(2)>0.30" < file.vcf
|
421
595
|
```
|
422
596
|
|
423
|
-
which is equal to
|
597
|
+
which is equal to
|
424
598
|
|
425
599
|
```sh
|
426
600
|
bio-vcf --sfilter "s.freq.g>0.30" < file.vcf
|
@@ -440,7 +614,7 @@ ref should always be identical across samples.
|
|
440
614
|
|
441
615
|
## DbSNP
|
442
616
|
|
443
|
-
One clinical variant DbSNP example
|
617
|
+
One clinical variant DbSNP example
|
444
618
|
|
445
619
|
```sh
|
446
620
|
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]' < clinvar_20140303.vcf
|
@@ -465,16 +639,16 @@ renders
|
|
465
639
|
|
466
640
|
bio-vcf allows for set analysis. With the complement filter, for
|
467
641
|
example, samples are selected that evaluate to true, all others should
|
468
|
-
evaluate to false. For this we create three filters, one for all
|
642
|
+
evaluate to false. For this we create three filters, one for all
|
469
643
|
samples that are included (the --ifilter or -if), for all samples that
|
470
644
|
are excluded (the --efilter or -ef) and for any sample (the --sfilter
|
471
645
|
or -sf). So i=include (OR filter), e=exclude and s=any sample (AND
|
472
|
-
filter).
|
646
|
+
filter).
|
473
647
|
|
474
648
|
The equivalent of the union filter is by using the --sfilter, so
|
475
649
|
|
476
650
|
```sh
|
477
|
-
bio-vcf --sfilter 's.dp>20'
|
651
|
+
bio-vcf --sfilter 's.dp>20'
|
478
652
|
```
|
479
653
|
|
480
654
|
Filters DP on all samples and is true if all samples match the
|
@@ -482,7 +656,7 @@ criterium (AND). To filter on a subset you can add a
|
|
482
656
|
selector
|
483
657
|
|
484
658
|
```sh
|
485
|
-
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
659
|
+
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
486
660
|
```
|
487
661
|
|
488
662
|
For set analysis there are the additional ifilter (include) and
|
@@ -502,7 +676,7 @@ values
|
|
502
676
|
|
503
677
|
The equivalent of the complement filter is by specifying what samples
|
504
678
|
to include, here with a regex and define filters on the included
|
505
|
-
and excluded samples (the ones not in ifilter-samples) and the
|
679
|
+
and excluded samples (the ones not in ifilter-samples) and the
|
506
680
|
|
507
681
|
```sh
|
508
682
|
./bin/bio-vcf -i --sfilter 's.dp>20' --ifilter-samples 2,4 --ifilter 's.gt==r.s1t1.gt'
|
@@ -523,7 +697,7 @@ To print out the GT's add --seval
|
|
523
697
|
To set an additional filter on the excluded samples:
|
524
698
|
|
525
699
|
```sh
|
526
|
-
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
700
|
+
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
527
701
|
```
|
528
702
|
|
529
703
|
Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
|
@@ -536,15 +710,15 @@ In the near future it is also possible to select samples on a regex (here
|
|
536
710
|
select all samples where the name starts with s3)
|
537
711
|
|
538
712
|
```sh
|
539
|
-
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
713
|
+
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
540
714
|
```
|
541
715
|
|
542
716
|
```sh
|
543
|
-
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
717
|
+
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
544
718
|
--set-intersect include=true
|
545
|
-
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
719
|
+
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
546
720
|
--set-catesian one in include=true, rest=false
|
547
|
-
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
721
|
+
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
548
722
|
```
|
549
723
|
|
550
724
|
With the filter commands you can use --ignore-missing to skip errors.
|
@@ -567,7 +741,7 @@ results in a string value
|
|
567
741
|
to access components of the genotype field we can use standard Ruby
|
568
742
|
|
569
743
|
```ruby
|
570
|
-
bio-vcf --seval 's.gt.split(/\//)[0]'
|
744
|
+
bio-vcf --seval 's.gt.split(/\//)[0]'
|
571
745
|
1 10665 . . 0 0 . 0 0
|
572
746
|
1 10694 . . 1 1 . . .
|
573
747
|
1 12783 0 0 0 0 0 0 0
|
@@ -578,7 +752,7 @@ or special functions, such as 'gti' which gives the genotype as an
|
|
578
752
|
indexed value array
|
579
753
|
|
580
754
|
```ruby
|
581
|
-
bio-vcf --seval 's.gti[0]'
|
755
|
+
bio-vcf --seval 's.gti[0]'
|
582
756
|
1 10665 0 0 0 0
|
583
757
|
1 10694 1 1
|
584
758
|
1 12783 0 0 0 0 0 0 0
|
@@ -588,7 +762,7 @@ indexed value array
|
|
588
762
|
and 'gts' as a nucleotide string array
|
589
763
|
|
590
764
|
```ruby
|
591
|
-
bio-vcf --seval 's.gts
|
765
|
+
bio-vcf --seval 's.gts'
|
592
766
|
1 10665 C C C C
|
593
767
|
1 10694 G G
|
594
768
|
1 12783 G G G G G G G
|
@@ -597,6 +771,28 @@ and 'gts' as a nucleotide string array
|
|
597
771
|
|
598
772
|
where gts represents the indexed genotype on [ref] + [alt].
|
599
773
|
|
774
|
+
To convert combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
|
775
|
+
1/1 -> 2, which is useful for indexed fields giving information on, for
|
776
|
+
example, significance, use
|
777
|
+
|
778
|
+
```ruby
|
779
|
+
bio-vcf --seval '!s.empty? and s.gtindex'
|
780
|
+
11 58949455 0 1
|
781
|
+
11 65481082 0 1
|
782
|
+
11 94180424 0 1
|
783
|
+
11 121036021 0 1
|
784
|
+
```
|
785
|
+
|
786
|
+
Now you can index other fields, e.g. GL
|
787
|
+
|
788
|
+
```ruby
|
789
|
+
./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
|
790
|
+
1 900057 1.0 1.0 0.994 1.0 1.0 -1 0.999 1.0 0.997 -1 0.994 0.989 -1 0.991 -1 0.972 0.992 1.0
|
791
|
+
```
|
792
|
+
|
793
|
+
shows a number of SNPs have been scored with high significance and a
|
794
|
+
number are missing, here marked as -1.
|
795
|
+
|
600
796
|
These values can also be used in filters and output allele depth, for
|
601
797
|
example
|
602
798
|
|
@@ -618,6 +814,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
|
|
618
814
|
1 13757 47 47 4 47 47 4 47
|
619
815
|
```
|
620
816
|
|
817
|
+
## Sample counting
|
818
|
+
|
819
|
+
Note, the use of lambda allows for sophisticated queries. You may need
|
820
|
+
some expert advice here.
|
821
|
+
|
822
|
+
To count valid genotype fields in samples you can do something like
|
823
|
+
|
824
|
+
```ruby
|
825
|
+
bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
|
826
|
+
```
|
827
|
+
|
828
|
+
A similar complex count would be
|
829
|
+
|
830
|
+
```ruby
|
831
|
+
bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
|
832
|
+
```
|
833
|
+
|
834
|
+
which tests for perfect SNPs scored (for example).
|
835
|
+
|
836
|
+
## Reorder filter with lambda
|
837
|
+
|
838
|
+
Sometimes it pays to reorder the filter using a lambda. This is one
|
839
|
+
example where the greedy sample counts are done only for those
|
840
|
+
samples that match the other criteria:
|
841
|
+
|
842
|
+
```ruby
|
843
|
+
./bin/bio-vcf --num-threads=1 --filter '(r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false'
|
844
|
+
```
|
621
845
|
|
622
846
|
## Modify VCF files
|
623
847
|
|
@@ -633,6 +857,17 @@ To remove/select 3 samples:
|
|
633
857
|
bio-vcf --samples 0,1,3 < mytest.vcf
|
634
858
|
```
|
635
859
|
|
860
|
+
You can also select samples by name (as long as they do not contain
|
861
|
+
spaces)
|
862
|
+
|
863
|
+
|
864
|
+
```sh
|
865
|
+
bio-vcf --names < mytest.vcf
|
866
|
+
Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
867
|
+
bio-vcf --samples "Original,s1t1,s3t1" < mytest.vcf
|
868
|
+
```
|
869
|
+
|
870
|
+
|
636
871
|
Filter on a BED file and annotate the gene name in the resulting VCF
|
637
872
|
|
638
873
|
```sh
|
@@ -679,11 +914,11 @@ Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert t
|
|
679
914
|
|
680
915
|
## Templates
|
681
916
|
|
682
|
-
To have more output options
|
917
|
+
To have more output options bio-vcf can use an [ERB
|
683
918
|
template](http://www.stuartellis.eu/articles/erb/) for every match. This is a
|
684
919
|
very flexible option that can output textual formats such as JSON, YAML, HTML
|
685
920
|
and RDF. Examples are provided in
|
686
|
-
[./templates](https://github.com/
|
921
|
+
[./template](https://github.com/vcflib/bio-vcf/tree/master/template). A JSON
|
687
922
|
template could be
|
688
923
|
|
689
924
|
```Javascript
|
@@ -693,11 +928,11 @@ template could be
|
|
693
928
|
"seq:ref": "<%= rec.ref %>" ,
|
694
929
|
"seq:alt": "<%= rec.alt[0] %>" ,
|
695
930
|
"seq:maf": <%= rec.info.maf[0] %> ,
|
696
|
-
"dp": <%= rec.info.dp %>
|
931
|
+
"dp": <%= rec.info.dp %>
|
697
932
|
};
|
698
933
|
```
|
699
934
|
|
700
|
-
To get JSON, run with something like (combining
|
935
|
+
To get JSON, run with something like (combining
|
701
936
|
with a filter)
|
702
937
|
|
703
938
|
```sh
|
@@ -713,7 +948,7 @@ which renders
|
|
713
948
|
"seq:ref": "C" ,
|
714
949
|
"seq:alt": "T" ,
|
715
950
|
"seq:maf": 0.0151 ,
|
716
|
-
"dp": 86
|
951
|
+
"dp": 86
|
717
952
|
};
|
718
953
|
```
|
719
954
|
|
@@ -723,11 +958,11 @@ Likewise for RDF output:
|
|
723
958
|
bio-vcf --template template/vcf2rdf.erb --filter 'r.info.sao==1' < dbsnp.vcf
|
724
959
|
```
|
725
960
|
|
726
|
-
renders the ERB template
|
961
|
+
renders the ERB template
|
727
962
|
|
728
963
|
```ruby
|
729
964
|
<%
|
730
|
-
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
965
|
+
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
731
966
|
%>
|
732
967
|
:<%= id %>
|
733
968
|
:query_id "<%= id %>",
|
@@ -740,7 +975,7 @@ renders the ERB template
|
|
740
975
|
db:vcf true .
|
741
976
|
```
|
742
977
|
|
743
|
-
into
|
978
|
+
into
|
744
979
|
|
745
980
|
```
|
746
981
|
:ch13_33703698_A
|
@@ -765,50 +1000,56 @@ can be
|
|
765
1000
|
```Javascript
|
766
1001
|
=HEADER
|
767
1002
|
<% require 'json' %>
|
768
|
-
|
769
|
-
{ "HEADER": {
|
1003
|
+
{ "HEADER": {
|
770
1004
|
"options": <%= options.to_h.to_json %>,
|
771
1005
|
"files": <%= ARGV %>,
|
772
1006
|
"version": "<%= BIOVCF_VERSION %>"
|
773
1007
|
},
|
774
|
-
|
1008
|
+
"BODY":[
|
775
1009
|
=BODY
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
},
|
1010
|
+
{
|
1011
|
+
"seq:chr": "<%= rec.chrom %>" ,
|
1012
|
+
"seq:pos": <%= rec.pos %> ,
|
1013
|
+
"seq:ref": "<%= rec.ref %>" ,
|
1014
|
+
"seq:alt": "<%= rec.alt[0] %>" ,
|
1015
|
+
"dp": <%= rec.info.dp %>
|
1016
|
+
},
|
784
1017
|
=FOOTER
|
785
|
-
]
|
1018
|
+
]
|
1019
|
+
}
|
1020
|
+
```
|
1021
|
+
|
1022
|
+
with
|
1023
|
+
|
1024
|
+
```sh
|
1025
|
+
bio-vcf --template template/vcf2json.erb < dbsnp.vcf
|
786
1026
|
```
|
787
1027
|
|
788
1028
|
may generate something like
|
789
1029
|
|
790
1030
|
```Javascript
|
791
|
-
|
792
|
-
{ "HEADER": {
|
1031
|
+
{ "HEADER": {
|
793
1032
|
"options": {"show_help":false,"source":"https://github.com/CuppenResearch/bioruby-vcf","version":"0.8.1-pre3 (Pjotr Prins)","date":"2014-11-26 12:51:36 +0000","thread_lines":40000,"template":"template/vcf2json.erb","skip_header":true},
|
794
1033
|
"files": [],
|
795
1034
|
"version": "0.8.1-pre3"
|
796
1035
|
},
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
1036
|
+
"BODY":[
|
1037
|
+
{
|
1038
|
+
"seq:chr": "1" ,
|
1039
|
+
"seq:pos": 883516 ,
|
1040
|
+
"seq:ref": "G" ,
|
1041
|
+
"seq:alt": "A" ,
|
1042
|
+
"dp":
|
1043
|
+
},
|
1044
|
+
{
|
1045
|
+
"seq:chr": "1" ,
|
1046
|
+
"seq:pos": 891344 ,
|
1047
|
+
"seq:ref": "G" ,
|
1048
|
+
"seq:alt": "A" ,
|
1049
|
+
"dp": ,
|
1050
|
+
},
|
1051
|
+
]
|
1052
|
+
}
|
812
1053
|
```
|
813
1054
|
|
814
1055
|
Note that the template is not smart enough to remove the final comma
|
@@ -816,6 +1057,19 @@ from the last BODY element. To make it valid JSON that needs to be
|
|
816
1057
|
removed. A future version may add a parameter to the BODY element or a
|
817
1058
|
global rewrite function for this purpose. YAML and RDF have no such issue.
|
818
1059
|
|
1060
|
+
### Using full VCF header (meta) info
|
1061
|
+
|
1062
|
+
To get and put the full information from the header, simply use
|
1063
|
+
vcf.meta.to_json. See ./template/vcf2json_full_header.erb for an
|
1064
|
+
example. This meta information can also be used to output info fields
|
1065
|
+
and sample values on the fly! For an example, see the template at
|
1066
|
+
[./template/vcf2json_use_meta.erb](https://github.com/vcflib/bio-vcf/tree/master/template/vcf2json_use_meta.erb)
|
1067
|
+
and the generated output at
|
1068
|
+
[./test/data/regression/vcf2json_use_meta.ref](https://github.com/vcflib/bio-vcf/tree/master/test/data/regression/vcf2json_use_meta.ref).
|
1069
|
+
|
1070
|
+
This way, it is possible to write templates that can convert the content of
|
1071
|
+
*any* VCF file without prior knowledge to JSON, RDF, etc.
|
1072
|
+
|
819
1073
|
## Statistics
|
820
1074
|
|
821
1075
|
Simple statistics are available for REF>ALT changes:
|
@@ -828,7 +1082,7 @@ Simple statistics are available for REF>ALT changes:
|
|
828
1082
|
G>A 59 45%
|
829
1083
|
C>T 30 23%
|
830
1084
|
A>G 5 4%
|
831
|
-
C>G 5 4%
|
1085
|
+
C>G 5 4%
|
832
1086
|
C>A 5 4%
|
833
1087
|
G>T 4 3%
|
834
1088
|
T>C 4 3%
|
@@ -848,7 +1102,10 @@ Simple statistics are available for REF>ALT changes:
|
|
848
1102
|
|
849
1103
|
## Other examples
|
850
1104
|
|
851
|
-
For more examples see
|
1105
|
+
For more exercises and examples see
|
1106
|
+
[doc](https://github.com/vcflib/bio-vcf/tree/master/doc) directory
|
1107
|
+
and the feature
|
1108
|
+
[section](https://github.com/vcflib/bio-vcf/tree/master/features).
|
852
1109
|
|
853
1110
|
## API
|
854
1111
|
|
@@ -879,27 +1136,109 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
879
1136
|
end
|
880
1137
|
```
|
881
1138
|
|
1139
|
+
### VCFfile
|
1140
|
+
|
1141
|
+
The class ```BioVcf::VCFfile``` wraps a file and provides an ```enum``` with the
|
1142
|
+
method each, that can be used as an iterator.
|
1143
|
+
|
1144
|
+
```ruby
|
1145
|
+
vcf_file = "dbsnp.vcf"
|
1146
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: false )
|
1147
|
+
it = vcf.each
|
1148
|
+
puts it.peek
|
1149
|
+
|
1150
|
+
vcf_file = "dbsnp.vcf.gz"
|
1151
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: true )
|
1152
|
+
it = vcf.each
|
1153
|
+
puts it.peek
|
1154
|
+
```
|
1155
|
+
|
882
1156
|
## Trouble shooting
|
883
1157
|
|
1158
|
+
### MRI supports threading
|
1159
|
+
|
884
1160
|
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
885
1161
|
in single threaded mode (for now).
|
886
1162
|
|
1163
|
+
### Set TMPDIR when running out of space
|
1164
|
+
|
887
1165
|
The multi-threading creates temporary files using the system TMPDIR.
|
888
1166
|
This behaviour can be overridden by setting the environment variable.
|
889
|
-
|
890
|
-
|
1167
|
+
|
1168
|
+
### Reorder filter on time out
|
1169
|
+
|
1170
|
+
Make sure to minimize expensive calculations by moving them
|
1171
|
+
backward. An 'and' statement is evaluated from left to right. With
|
1172
|
+
|
1173
|
+
```ruby
|
1174
|
+
fast_check and slow_check
|
1175
|
+
```
|
1176
|
+
|
1177
|
+
slow_check only gets executed if fast_check is true.
|
1178
|
+
|
1179
|
+
For more complex filters use lambda inside a conditional
|
1180
|
+
|
1181
|
+
```ruby
|
1182
|
+
( fast_check ? lambda { slow_check }.call : false )
|
1183
|
+
```
|
1184
|
+
|
1185
|
+
where slow_check is the slow section of your query, as is shown
|
1186
|
+
earlier in this document. Don't forget the .call!
|
1187
|
+
|
1188
|
+
### Reduce thread lines on timeout
|
1189
|
+
|
1190
|
+
Depending on your input data and the speed of the filters it may be useful to
|
1191
|
+
tweak the number of thread lines and/or to increase the timeout.
|
1192
|
+
|
1193
|
+
On really fast file systems for genome-wide sequencing try increasing
|
1194
|
+
--thread-lines to a value larger than 100_000. On the other hand if
|
1195
|
+
the computations are intensive (per line) reduce the number of
|
1196
|
+
thread-lines (try 10_000 and 1_000). If processes get killed that is
|
1197
|
+
the one to try.
|
1198
|
+
|
1199
|
+
For larger files set the timeout to 600 or so, i.e. --timeout 600.
|
1200
|
+
|
1201
|
+
Different values may show different core use on a machine.
|
1202
|
+
|
1203
|
+
### Development
|
1204
|
+
|
1205
|
+
To run the tests from source
|
1206
|
+
|
1207
|
+
```sh
|
1208
|
+
bundle install --path vendor/bundle
|
1209
|
+
bundle exec rake
|
1210
|
+
```
|
1211
|
+
|
1212
|
+
Note: we develop in a GNU Guix environment, see the header of
|
1213
|
+
[guix.scm](guix.scm) which does not use bundler.
|
1214
|
+
|
1215
|
+
### Debugging
|
1216
|
+
|
1217
|
+
To debug output use '-v --num-threads=1' for generating useful
|
1218
|
+
output. Also do not use the -i switch (ignore errors) when there
|
1219
|
+
are problems.
|
1220
|
+
|
1221
|
+
### Could not find rake-10.4.2 in any of the sources
|
1222
|
+
|
1223
|
+
Remove Gemfile.lock before running other tools.
|
1224
|
+
|
1225
|
+
### Tmpdir contains (old) bio-vcf directories
|
1226
|
+
|
1227
|
+
Multi-threaded bio-vcf writes into a temporary directory during
|
1228
|
+
processing. When a process gets interrupted for some reason the
|
1229
|
+
temporary directory may remain.
|
891
1230
|
|
892
1231
|
## Project home page
|
893
1232
|
|
894
1233
|
For information on the source tree, documentation, examples, issues and
|
895
1234
|
how to contribute, see
|
896
1235
|
|
897
|
-
http://github.com/
|
1236
|
+
http://github.com/vcflib/bio-vcf
|
898
1237
|
|
899
1238
|
## Cite
|
900
1239
|
|
901
1240
|
If you use this software, please cite one of
|
902
|
-
|
1241
|
+
|
903
1242
|
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
904
1243
|
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
905
1244
|
|
@@ -909,5 +1248,4 @@ This Biogem is published at (http://biogems.info/index.html#bio-vcf)
|
|
909
1248
|
|
910
1249
|
## Copyright
|
911
1250
|
|
912
|
-
Copyright (c) 2014 Pjotr Prins. See LICENSE.txt for further details.
|
913
|
-
|
1251
|
+
Copyright (c) 2014-2020 Pjotr Prins. See LICENSE.txt for further details.
|