bio-vcf 0.8.0 → 0.9.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +4 -5
- data/Gemfile.lock +28 -65
- data/LICENSE.txt +1 -1
- data/README.md +387 -107
- data/RELEASE_NOTES.md +20 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +3 -40
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +176 -109
- data/bio-vcf.gemspec +14 -70
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +25 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +0 -9
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/bedfilter.rb +43 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/template.rb +75 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
- data/lib/bio-vcf/vcfheader.rb +146 -6
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +27 -3
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +19 -7
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/template/vcf2rdf_header.erb +24 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +65 -64
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f5d7a81871906abfffc93455b4d664d5755fe8d79312134eae94e84659506198
|
4
|
+
data.tar.gz: 8029269859aedd53c613ea9bbb17f951972b062060b5a40c22bdbe65c6c3dfa7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed231c3a918e5f9ab9cd8a618f3f25f0c39613ac934b496af334d77dabe64831ff08cfc722a467fc51ab8c583358ca21be769ba1d9654437d54e7d21b811ee2c
|
7
|
+
data.tar.gz: df49786c4f4aa5e3a3659c678fb66aeb4b7dd4bb575aacf34cc468663c18fa893502699d238b5034c308ff51a4dc05e0fadf929b923c1d646f61c3f07fef26c7
|
data/.travis.yml
CHANGED
@@ -1,13 +1,3 @@
|
|
1
1
|
language: ruby
|
2
|
-
rvm:
|
3
|
-
# - 1.9.3 <- No longer working
|
4
|
-
- 2.0.0
|
5
|
-
- 2.1.0
|
6
|
-
# - jruby-head
|
7
|
-
# - jruby-19mode # JRuby in 1.9 mode
|
8
|
-
# - 1.8.7
|
9
|
-
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
-
# - rbx-18mode
|
11
2
|
|
12
|
-
|
13
|
-
# script: bundle exec rspec spec
|
3
|
+
arch: arm64
|
data/Gemfile
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
-
# Add dependencies required to use your gem here.
|
3
|
-
# Example:
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
5
2
|
|
6
3
|
# Add dependencies to develop your gem here.
|
7
4
|
# Include everything needed to run rake, tests, features, etc.
|
8
5
|
group :development do
|
9
6
|
# gem "minitest"
|
7
|
+
gem "rake"
|
10
8
|
gem "rspec"
|
11
9
|
gem "cucumber"
|
12
|
-
gem "
|
13
|
-
gem "regressiontest", "~> 0.0.3"
|
10
|
+
gem "regressiontest", ">= 0.0.3"
|
14
11
|
end
|
12
|
+
|
13
|
+
|
data/Gemfile.lock
CHANGED
@@ -1,81 +1,44 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
addressable (2.3.5)
|
5
4
|
builder (3.2.2)
|
6
|
-
cucumber (1.
|
5
|
+
cucumber (2.1.0)
|
7
6
|
builder (>= 2.1.2)
|
7
|
+
cucumber-core (~> 1.3.0)
|
8
8
|
diff-lcs (>= 1.1.3)
|
9
|
-
|
9
|
+
gherkin3 (~> 3.1.0)
|
10
10
|
multi_json (>= 1.7.5, < 2.0)
|
11
|
-
multi_test (>= 0.
|
12
|
-
|
11
|
+
multi_test (>= 0.1.2)
|
12
|
+
cucumber-core (1.3.0)
|
13
|
+
gherkin3 (~> 3.1.0)
|
13
14
|
diff-lcs (1.2.5)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
gherkin (2.12.2-java)
|
19
|
-
multi_json (~> 1.3)
|
20
|
-
git (1.2.6)
|
21
|
-
github_api (0.11.3)
|
22
|
-
addressable (~> 2.3)
|
23
|
-
descendants_tracker (~> 0.0.1)
|
24
|
-
faraday (~> 0.8, < 0.10)
|
25
|
-
hashie (>= 1.2)
|
26
|
-
multi_json (>= 1.7.5, < 2.0)
|
27
|
-
nokogiri (~> 1.6.0)
|
28
|
-
oauth2
|
29
|
-
hashie (2.0.5)
|
30
|
-
highline (1.6.21)
|
31
|
-
jeweler (2.0.1)
|
32
|
-
builder
|
33
|
-
bundler (>= 1.0)
|
34
|
-
git (>= 1.2.5)
|
35
|
-
github_api
|
36
|
-
highline (>= 1.6.15)
|
37
|
-
nokogiri (>= 1.5.10)
|
38
|
-
rake
|
39
|
-
rdoc
|
40
|
-
json (1.8.1)
|
41
|
-
json (1.8.1-java)
|
42
|
-
jwt (0.1.11)
|
43
|
-
multi_json (>= 1.5)
|
44
|
-
mini_portile (0.5.2)
|
45
|
-
multi_json (1.9.0)
|
46
|
-
multi_test (0.0.3)
|
47
|
-
multi_xml (0.5.5)
|
48
|
-
multipart-post (2.0.0)
|
49
|
-
nokogiri (1.6.1)
|
50
|
-
mini_portile (~> 0.5.0)
|
51
|
-
nokogiri (1.6.1-java)
|
52
|
-
mini_portile (~> 0.5.0)
|
53
|
-
oauth2 (0.9.3)
|
54
|
-
faraday (>= 0.8, < 0.10)
|
55
|
-
jwt (~> 0.1.8)
|
56
|
-
multi_json (~> 1.3)
|
57
|
-
multi_xml (~> 0.5)
|
58
|
-
rack (~> 1.2)
|
59
|
-
rack (1.5.2)
|
60
|
-
rake (10.1.1)
|
61
|
-
rdoc (4.1.1)
|
62
|
-
json (~> 1.4)
|
15
|
+
gherkin3 (3.1.1)
|
16
|
+
multi_json (1.11.2)
|
17
|
+
multi_test (0.1.2)
|
18
|
+
rake (10.4.2)
|
63
19
|
regressiontest (0.0.3)
|
64
|
-
rspec (
|
65
|
-
rspec-core (~>
|
66
|
-
rspec-expectations (~>
|
67
|
-
rspec-mocks (~>
|
68
|
-
rspec-core (
|
69
|
-
|
70
|
-
|
71
|
-
|
20
|
+
rspec (3.3.0)
|
21
|
+
rspec-core (~> 3.3.0)
|
22
|
+
rspec-expectations (~> 3.3.0)
|
23
|
+
rspec-mocks (~> 3.3.0)
|
24
|
+
rspec-core (3.3.2)
|
25
|
+
rspec-support (~> 3.3.0)
|
26
|
+
rspec-expectations (3.3.1)
|
27
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
28
|
+
rspec-support (~> 3.3.0)
|
29
|
+
rspec-mocks (3.3.2)
|
30
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
31
|
+
rspec-support (~> 3.3.0)
|
32
|
+
rspec-support (3.3.0)
|
72
33
|
|
73
34
|
PLATFORMS
|
74
|
-
java
|
75
35
|
ruby
|
76
36
|
|
77
37
|
DEPENDENCIES
|
78
38
|
cucumber
|
79
|
-
|
80
|
-
regressiontest (
|
39
|
+
rake
|
40
|
+
regressiontest (>= 0.0.3)
|
81
41
|
rspec
|
42
|
+
|
43
|
+
BUNDLED WITH
|
44
|
+
1.10.6
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,49 +1,68 @@
|
|
1
1
|
# bio-vcf
|
2
2
|
|
3
|
-
[![Build Status](https://secure.travis-ci.org/
|
3
|
+
[![Build Status](https://secure.travis-ci.org/vcflib/bio-vcf.png)](http://travis-ci.org/vcflib/bio-vcf)
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
|
6
|
+
## Bio-vcf
|
7
|
+
|
8
|
+
Bio-vcf is a new generation VCF parser, filter and converter. Bio-vcf
|
9
|
+
is not only very fast for genome-wide (WGS) data, it also comes with a
|
10
|
+
really nice filtering, evaluation and rewrite language and it can
|
11
|
+
output any type of textual data, including VCF header and contents in
|
12
|
+
RDF and JSON.
|
13
|
+
|
14
|
+
So, why would you use bio-vcf over other parsers? Because
|
9
15
|
|
10
16
|
1. Bio-vcf is fast and scales on multi-core computers
|
11
17
|
2. Bio-vcf has an expressive filtering and evaluation language
|
12
18
|
3. Bio-vcf has great multi-sample support
|
13
19
|
4. Bio-vcf has multiple global filters and sample filters
|
14
20
|
5. Bio-vcf can access any VCF format
|
15
|
-
6. Bio-vcf can
|
16
|
-
7. Bio-vcf
|
17
|
-
8. Bio-vcf
|
18
|
-
9. Bio-vcf has
|
19
|
-
10. Bio-vcf
|
21
|
+
6. Bio-vcf can parse and query the VCF header (META data)
|
22
|
+
7. Bio-vcf can do calculations on fields
|
23
|
+
8. Bio-vcf allows for genotype processing
|
24
|
+
9. Bio-vcf has support for set analysis
|
25
|
+
10. Bio-vcf has sane error handling
|
26
|
+
11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
|
27
|
+
12. Bio-vcf has soft filters
|
28
|
+
|
29
|
+
Bio-vcf has better performance than other tools because of lazy
|
30
|
+
parsing, multi-threading, and useful combinations of (fancy) command
|
31
|
+
line filtering (who says Ruby is slow?). Adding cores, bio-vcf just
|
32
|
+
does better. The more complicated the filters, the larger the
|
33
|
+
gain. First a base line test to show IO performance
|
20
34
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
35
|
+
```sh
|
36
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
|
37
|
+
1987143 15897724 1003214613
|
38
|
+
real 0m7.823s
|
39
|
+
user 0m7.002s
|
40
|
+
sys 0m2.972s
|
41
|
+
```
|
42
|
+
|
43
|
+
Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
|
27
44
|
|
28
45
|
```sh
|
29
|
-
time
|
30
|
-
real
|
31
|
-
user
|
32
|
-
sys
|
46
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
|
47
|
+
real 0m32.491s
|
48
|
+
user 2m34.767s
|
49
|
+
sys 0m12.733s
|
33
50
|
```
|
34
51
|
|
35
|
-
|
52
|
+
The same with SnpSift v4.0 takes
|
36
53
|
|
37
54
|
```sh
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
55
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > /dev/null
|
56
|
+
real 12m36.121s
|
57
|
+
user 12m53.273s
|
58
|
+
sys 0m9.913s
|
42
59
|
```
|
43
60
|
|
44
|
-
|
45
|
-
|
46
|
-
|
61
|
+
This means that on this machine bio-vcf is 24x faster than SnpSift
|
62
|
+
even for a simple filter. In fact, bio-vcf is perfect for complex
|
63
|
+
filters and parsing large data files on powerful machines. Parsing a
|
64
|
+
650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
|
65
|
+
BED format on a 16 core machine takes
|
47
66
|
|
48
67
|
```sh
|
49
68
|
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
@@ -52,29 +71,39 @@ a 16 core machine takes
|
|
52
71
|
sys 0m5.039s
|
53
72
|
```
|
54
73
|
|
55
|
-
which shows
|
56
|
-
gzip compressed VCF files of 30+ Gb
|
74
|
+
which shows decent core utilisation (10x). Running
|
75
|
+
gzip compressed VCF files of 30+ Gb has similar performance gains.
|
76
|
+
|
77
|
+
To view some complex filters on an 80Gb SNP file check out a
|
78
|
+
[GTEx exercise](https://github.com/vcflib/bio-vcf/blob/master/doc/GTEx_reduce.md).
|
57
79
|
|
58
|
-
Use zcat to
|
59
|
-
|
80
|
+
Use zcat (or even better pigz which is multi-core itself) to pipe such
|
81
|
+
gzipped (vcf.gz) files into bio-vcf, e.g.
|
60
82
|
|
61
83
|
```sh
|
62
84
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
63
|
-
--sfilter '!s.empty? and s.dp>20'
|
85
|
+
--sfilter '!s.empty? and s.dp>20'
|
64
86
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
65
87
|
```
|
66
88
|
|
67
|
-
bio-vcf comes with a sensible parser definition language
|
68
|
-
Ruby),
|
89
|
+
bio-vcf comes with a sensible parser definition language
|
90
|
+
(interestingly it is 100% Ruby), an embedded Ragel parser for INFO and
|
91
|
+
FORMAT header definitions, as well as primitives for set analysis. Few
|
69
92
|
assumptions are made about the actual contents of the VCF file (field
|
70
|
-
names are resolved on the fly), so bio-vcf should
|
71
|
-
|
93
|
+
names are resolved on the fly), so bio-vcf should work with all VCF
|
94
|
+
files.
|
72
95
|
|
73
|
-
To fetch all entries where all samples have depth larger than 20
|
74
|
-
a sample filter
|
96
|
+
To fetch all entries where all samples have depth larger than 20 and
|
97
|
+
filter set to PASS use a sample filter
|
75
98
|
|
76
99
|
```ruby
|
77
|
-
bio-vcf --sfilter 'sample.dp>20' < file.vcf
|
100
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter=="PASS"' < file.vcf
|
101
|
+
```
|
102
|
+
|
103
|
+
or with a regex
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter !~ /LowQD/' < file.vcf
|
78
107
|
```
|
79
108
|
|
80
109
|
To only filter on some samples number 0 and 3:
|
@@ -87,7 +116,7 @@ Where 's.dp' is the shorter name for 'sample.dp'.
|
|
87
116
|
|
88
117
|
It is also possible to specify sample names, or info fields:
|
89
118
|
|
90
|
-
For example, to filter somatic data
|
119
|
+
For example, to filter somatic data
|
91
120
|
|
92
121
|
```ruby
|
93
122
|
bio-vcf --filter 'rec.info.dp>5 and rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
|
@@ -140,7 +169,8 @@ bio-vcf -i --seval 's.ad[1]'
|
|
140
169
|
1 10303 25 31 28 32 17 23 22
|
141
170
|
```
|
142
171
|
|
143
|
-
To calculate alt frequencies from s.ad
|
172
|
+
To calculate percentage non-reference (PNR) alt frequencies from s.ad
|
173
|
+
which is sample (alt dp)/(ref dp + alt dp)
|
144
174
|
|
145
175
|
```ruby
|
146
176
|
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
@@ -214,7 +244,7 @@ The VCF format is commonly used for variant calling between NGS
|
|
214
244
|
samples. The fast parser needs to carry some state, recorded for each
|
215
245
|
file in VcfHeader, which contains the VCF file header. Individual
|
216
246
|
lines (variant calls) first go through a raw parser returning an array
|
217
|
-
of fields. Further (lazy) parsing is handled through VcfRecord.
|
247
|
+
of fields. Further (lazy) parsing is handled through VcfRecord.
|
218
248
|
|
219
249
|
At this point the filter is pretty generic with multi-sample support.
|
220
250
|
If something is not working, check out the feature descriptions and
|
@@ -223,22 +253,15 @@ example of a VCF statement you need to work on.
|
|
223
253
|
|
224
254
|
## Installation
|
225
255
|
|
226
|
-
|
227
|
-
a performance improvement. Bio-vcf will show the Ruby version when
|
228
|
-
typing the command 'bio-vcf -h'.
|
256
|
+
Bio-vcf has no dependencies other than Ruby.
|
229
257
|
|
230
|
-
To
|
258
|
+
To install bio-vcf with Ruby gems:
|
231
259
|
|
232
260
|
```sh
|
233
261
|
gem install bio-vcf
|
234
262
|
bio-vcf -h
|
235
263
|
```
|
236
264
|
|
237
|
-
For multi-core also install the parallel gem
|
238
|
-
|
239
|
-
```sh
|
240
|
-
gem install parallel
|
241
|
-
```
|
242
265
|
|
243
266
|
## Command line interface (CLI)
|
244
267
|
|
@@ -263,56 +286,72 @@ Get the sample names
|
|
263
286
|
NORMAL,TUMOR
|
264
287
|
```
|
265
288
|
|
289
|
+
Alternatively use the command line switch for --names, e.g.
|
290
|
+
|
291
|
+
```ruby
|
292
|
+
bio-vcf --names < file.vcf
|
293
|
+
NORMAL,TUMOR
|
294
|
+
```
|
295
|
+
|
296
|
+
Get information from the header (META)
|
297
|
+
|
298
|
+
```ruby
|
299
|
+
bio-vcf -q --skip-header --eval-once 'header.meta["GATKCommandLine"]' < gatk_exome.vcf
|
300
|
+
```
|
301
|
+
|
266
302
|
The 'fields' array contains unprocessed data (strings). Print first
|
267
303
|
five raw fields
|
268
304
|
|
269
305
|
```ruby
|
270
|
-
bio-vcf --eval 'fields[0..4]' < file.vcf
|
306
|
+
bio-vcf --eval 'fields[0..4]' < file.vcf
|
271
307
|
```
|
272
308
|
|
273
309
|
Add a filter to display the fields on chromosome 12
|
274
310
|
|
275
311
|
```ruby
|
276
|
-
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
312
|
+
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
277
313
|
```
|
278
314
|
|
279
315
|
It gets better when we start using processed data, represented by an
|
280
316
|
object named 'rec'. Position is a value, so we can filter a range
|
281
317
|
|
282
318
|
```ruby
|
283
|
-
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
319
|
+
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
284
320
|
```
|
285
321
|
|
286
322
|
The shorter name for 'rec.chrom' is 'r.chrom', so you may write
|
287
323
|
|
288
324
|
```ruby
|
289
|
-
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
325
|
+
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
290
326
|
```
|
291
327
|
|
292
328
|
To ignore and continue parsing on missing data use the
|
293
329
|
--ignore-missing (-i) and/or --quiet (-q) switches
|
294
330
|
|
295
331
|
```ruby
|
296
|
-
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
332
|
+
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
297
333
|
```
|
298
334
|
|
299
335
|
Info fields are referenced by
|
300
336
|
|
301
337
|
```ruby
|
302
|
-
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
338
|
+
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
303
339
|
```
|
304
340
|
|
305
|
-
|
341
|
+
(alternatively you can use the indexed rec.info['DP'] and list INFO fields with
|
342
|
+
rec.info.fields).
|
343
|
+
|
344
|
+
Subfields defined by rec.format:
|
306
345
|
|
307
346
|
```ruby
|
308
|
-
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
347
|
+
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
309
348
|
```
|
310
349
|
|
311
350
|
Output
|
312
351
|
|
313
352
|
```ruby
|
314
|
-
bio-vcf --filter 'rec.tumor.gq>30'
|
315
|
-
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
353
|
+
bio-vcf --filter 'rec.tumor.gq>30'
|
354
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
316
355
|
< file.vcf
|
317
356
|
```
|
318
357
|
|
@@ -326,26 +365,26 @@ Show the count of the bases that were scored as somatic
|
|
326
365
|
Actually, we have a convenience implementation for bcount, so this is the same
|
327
366
|
|
328
367
|
```ruby
|
329
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
368
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
330
369
|
< file.vcf
|
331
370
|
```
|
332
371
|
|
333
372
|
Filter on the somatic results that were scored at least 4 times
|
334
|
-
|
373
|
+
|
335
374
|
```ruby
|
336
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
375
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
337
376
|
```
|
338
377
|
|
339
378
|
Similar for base quality scores
|
340
379
|
|
341
380
|
```ruby
|
342
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
381
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
343
382
|
```
|
344
383
|
|
345
384
|
Filter out on sample values
|
346
385
|
|
347
386
|
```ruby
|
348
|
-
bio-vcf --sfilter 's.dp>20' < test.vcf
|
387
|
+
bio-vcf --sfilter 's.dp>20' < test.vcf
|
349
388
|
```
|
350
389
|
|
351
390
|
To filter missing on samples:
|
@@ -360,6 +399,23 @@ or for all
|
|
360
399
|
bio-vcf --filter "rec.missing_samples?" < file.vcf
|
361
400
|
```
|
362
401
|
|
402
|
+
To set a soft filter, i.e. the filter column is updated
|
403
|
+
|
404
|
+
```sh
|
405
|
+
bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
|
406
|
+
```
|
407
|
+
|
408
|
+
may render something like
|
409
|
+
|
410
|
+
```
|
411
|
+
1 46527674 4 LowQD
|
412
|
+
1 108417572 4 LowQD
|
413
|
+
1 155449089 4 LowQD
|
414
|
+
1 169847826 4 LowQD
|
415
|
+
1 203098164 3 LowQD
|
416
|
+
2 39213209 4 LowQD
|
417
|
+
```
|
418
|
+
|
363
419
|
Likewise you can check for record validity
|
364
420
|
|
365
421
|
```sh
|
@@ -410,17 +466,17 @@ Even shorter r is an alias for rec
|
|
410
466
|
Note: special functions are not yet implemented! Look below
|
411
467
|
for genotype processing which has indexing in 'gti'.
|
412
468
|
|
413
|
-
Sometime you want to use a special function in a filter. For
|
414
|
-
example percentage variant reads can be defined as [a,c,g,t]
|
415
|
-
with frequencies against sample read depth (dp) as
|
416
|
-
[0,0.03,0.47,0.50]. Filtering would with a special function,
|
469
|
+
Sometimes you want to use a special function in a filter. For
|
470
|
+
example percentage variant reads can be defined as [a,c,g,t]
|
471
|
+
with frequencies against sample read depth (dp) as
|
472
|
+
[0,0.03,0.47,0.50]. Filtering would be done with a special function,
|
417
473
|
which we named freq
|
418
474
|
|
419
475
|
```sh
|
420
476
|
bio-vcf --sfilter "s.freq(2)>0.30" < file.vcf
|
421
477
|
```
|
422
478
|
|
423
|
-
which is equal to
|
479
|
+
which is equal to
|
424
480
|
|
425
481
|
```sh
|
426
482
|
bio-vcf --sfilter "s.freq.g>0.30" < file.vcf
|
@@ -440,7 +496,7 @@ ref should always be identical across samples.
|
|
440
496
|
|
441
497
|
## DbSNP
|
442
498
|
|
443
|
-
One clinical variant DbSNP example
|
499
|
+
One clinical variant DbSNP example
|
444
500
|
|
445
501
|
```sh
|
446
502
|
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]' < clinvar_20140303.vcf
|
@@ -465,16 +521,16 @@ renders
|
|
465
521
|
|
466
522
|
bio-vcf allows for set analysis. With the complement filter, for
|
467
523
|
example, samples are selected that evaluate to true, all others should
|
468
|
-
evaluate to false. For this we create three filters, one for all
|
524
|
+
evaluate to false. For this we create three filters, one for all
|
469
525
|
samples that are included (the --ifilter or -if), for all samples that
|
470
526
|
are excluded (the --efilter or -ef) and for any sample (the --sfilter
|
471
527
|
or -sf). So i=include (OR filter), e=exclude and s=any sample (AND
|
472
|
-
filter).
|
528
|
+
filter).
|
473
529
|
|
474
530
|
The equivalent of the union filter is by using the --sfilter, so
|
475
531
|
|
476
532
|
```sh
|
477
|
-
bio-vcf --sfilter 's.dp>20'
|
533
|
+
bio-vcf --sfilter 's.dp>20'
|
478
534
|
```
|
479
535
|
|
480
536
|
Filters DP on all samples and is true if all samples match the
|
@@ -482,7 +538,7 @@ criterium (AND). To filter on a subset you can add a
|
|
482
538
|
selector
|
483
539
|
|
484
540
|
```sh
|
485
|
-
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
541
|
+
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
486
542
|
```
|
487
543
|
|
488
544
|
For set analysis there are the additional ifilter (include) and
|
@@ -502,7 +558,7 @@ values
|
|
502
558
|
|
503
559
|
The equivalent of the complement filter is by specifying what samples
|
504
560
|
to include, here with a regex and define filters on the included
|
505
|
-
and excluded samples (the ones not in ifilter-samples) and the
|
561
|
+
and excluded samples (the ones not in ifilter-samples) and the
|
506
562
|
|
507
563
|
```sh
|
508
564
|
./bin/bio-vcf -i --sfilter 's.dp>20' --ifilter-samples 2,4 --ifilter 's.gt==r.s1t1.gt'
|
@@ -523,7 +579,7 @@ To print out the GT's add --seval
|
|
523
579
|
To set an additional filter on the excluded samples:
|
524
580
|
|
525
581
|
```sh
|
526
|
-
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
582
|
+
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
527
583
|
```
|
528
584
|
|
529
585
|
Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
|
@@ -536,15 +592,15 @@ In the near future it is also possible to select samples on a regex (here
|
|
536
592
|
select all samples where the name starts with s3)
|
537
593
|
|
538
594
|
```sh
|
539
|
-
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
595
|
+
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
540
596
|
```
|
541
597
|
|
542
598
|
```sh
|
543
|
-
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
599
|
+
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
544
600
|
--set-intersect include=true
|
545
|
-
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
601
|
+
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
546
602
|
--set-catesian one in include=true, rest=false
|
547
|
-
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
603
|
+
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
548
604
|
```
|
549
605
|
|
550
606
|
With the filter commands you can use --ignore-missing to skip errors.
|
@@ -567,7 +623,7 @@ results in a string value
|
|
567
623
|
to access components of the genotype field we can use standard Ruby
|
568
624
|
|
569
625
|
```ruby
|
570
|
-
bio-vcf --seval 's.gt.split(/\//)[0]'
|
626
|
+
bio-vcf --seval 's.gt.split(/\//)[0]'
|
571
627
|
1 10665 . . 0 0 . 0 0
|
572
628
|
1 10694 . . 1 1 . . .
|
573
629
|
1 12783 0 0 0 0 0 0 0
|
@@ -578,7 +634,7 @@ or special functions, such as 'gti' which gives the genotype as an
|
|
578
634
|
indexed value array
|
579
635
|
|
580
636
|
```ruby
|
581
|
-
bio-vcf --seval 's.gti[0]'
|
637
|
+
bio-vcf --seval 's.gti[0]'
|
582
638
|
1 10665 0 0 0 0
|
583
639
|
1 10694 1 1
|
584
640
|
1 12783 0 0 0 0 0 0 0
|
@@ -588,7 +644,7 @@ indexed value array
|
|
588
644
|
and 'gts' as a nucleotide string array
|
589
645
|
|
590
646
|
```ruby
|
591
|
-
bio-vcf --seval 's.gts
|
647
|
+
bio-vcf --seval 's.gts'
|
592
648
|
1 10665 C C C C
|
593
649
|
1 10694 G G
|
594
650
|
1 12783 G G G G G G G
|
@@ -597,6 +653,28 @@ and 'gts' as a nucleotide string array
|
|
597
653
|
|
598
654
|
where gts represents the indexed genotype on [ref] + [alt].
|
599
655
|
|
656
|
+
To convert combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
|
657
|
+
1/1 -> 2, is useful for indexed fields giving information on, for
|
658
|
+
example, significance, use
|
659
|
+
|
660
|
+
```ruby
|
661
|
+
bio-vcf --seval '!s.empty? and s.gtindex'
|
662
|
+
11 58949455 0 1
|
663
|
+
11 65481082 0 1
|
664
|
+
11 94180424 0 1
|
665
|
+
11 121036021 0 1
|
666
|
+
```
|
667
|
+
|
668
|
+
Now you can index other fields, e.g. GL
|
669
|
+
|
670
|
+
```ruby
|
671
|
+
./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
|
672
|
+
1 900057 1.0 1.0 0.994 1.0 1.0 -1 0.999 1.0 0.997 -1 0.994 0.989 -1 0.991 -1 0.972 0.992 1.0
|
673
|
+
```
|
674
|
+
|
675
|
+
shows a number of SNPs have been scored with high significance and a
|
676
|
+
number are missing, here marked as -1.
|
677
|
+
|
600
678
|
These values can also be used in filters and output allele depth, for
|
601
679
|
example
|
602
680
|
|
@@ -618,6 +696,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
|
|
618
696
|
1 13757 47 47 4 47 47 4 47
|
619
697
|
```
|
620
698
|
|
699
|
+
## Sample counting
|
700
|
+
|
701
|
+
Note, the use of lambda allows for sophisticated queries. You may need
|
702
|
+
some expert advice here.
|
703
|
+
|
704
|
+
To count valid genotype fields in samples you can do something like
|
705
|
+
|
706
|
+
```ruby
|
707
|
+
bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
|
708
|
+
```
|
709
|
+
|
710
|
+
A similar complex count would be
|
711
|
+
|
712
|
+
```ruby
|
713
|
+
bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
|
714
|
+
```
|
715
|
+
|
716
|
+
which tests for perfect SNPs scored (for example).
|
717
|
+
|
718
|
+
## Reorder filter with lambda
|
719
|
+
|
720
|
+
Sometimes it pays to reorder the filter using a lambda. This is one
|
721
|
+
example where the greedy sample counts are done only for those
|
722
|
+
samples that match the other criteria:
|
723
|
+
|
724
|
+
```ruby
|
725
|
+
./bin/bio-vcf --num-threads=1 --filter '(r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false'
|
726
|
+
```
|
621
727
|
|
622
728
|
## Modify VCF files
|
623
729
|
|
@@ -633,6 +739,23 @@ To remove/select 3 samples:
|
|
633
739
|
bio-vcf --samples 0,1,3 < mytest.vcf
|
634
740
|
```
|
635
741
|
|
742
|
+
You can also select samples by name (as long as they do not contain
|
743
|
+
spaces)
|
744
|
+
|
745
|
+
|
746
|
+
```sh
|
747
|
+
bio-vcf --names < mytest.vcf
|
748
|
+
Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
749
|
+
bio-vcf --samples "Original,s1t1,s3t1" < mytest.vcf
|
750
|
+
```
|
751
|
+
|
752
|
+
|
753
|
+
Filter on a BED file and annotate the gene name in the resulting VCF
|
754
|
+
|
755
|
+
```sh
|
756
|
+
bio-vcf -v --bed test/data/input/test.bed --rewrite 'rec.info["gene"]=bed[3]' < test/data/input/somaticsniper.vcf
|
757
|
+
```
|
758
|
+
|
636
759
|
## RDF output
|
637
760
|
|
638
761
|
You can use --rdf for turtle RDF output from simple one-liners, note the use of --id and
|
@@ -673,11 +796,11 @@ Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert t
|
|
673
796
|
|
674
797
|
## Templates
|
675
798
|
|
676
|
-
To have more output options
|
799
|
+
To have more output options bio-vcf can use an [ERB
|
677
800
|
template](http://www.stuartellis.eu/articles/erb/) for every match. This is a
|
678
801
|
very flexible option that can output textual formats such as JSON, YAML, HTML
|
679
802
|
and RDF. Examples are provided in
|
680
|
-
[./templates](https://github.com/
|
803
|
+
[./templates](https://github.com/vcflib/bio-vcf/templates/). A JSON
|
681
804
|
template could be
|
682
805
|
|
683
806
|
```Javascript
|
@@ -687,14 +810,15 @@ template could be
|
|
687
810
|
"seq:ref": "<%= rec.ref %>" ,
|
688
811
|
"seq:alt": "<%= rec.alt[0] %>" ,
|
689
812
|
"seq:maf": <%= rec.info.maf[0] %> ,
|
690
|
-
"dp": <%= rec.info.dp %>
|
813
|
+
"dp": <%= rec.info.dp %>
|
691
814
|
};
|
692
815
|
```
|
693
816
|
|
694
|
-
To get JSON, run with something like
|
817
|
+
To get JSON, run with something like (combining
|
818
|
+
with a filter)
|
695
819
|
|
696
820
|
```sh
|
697
|
-
bio-vcf --template template/vcf2json.erb --filter 'r.info.
|
821
|
+
bio-vcf --template template/vcf2json.erb --filter 'r.info.sao==1' < dbsnp.vcf
|
698
822
|
```
|
699
823
|
|
700
824
|
which renders
|
@@ -706,21 +830,21 @@ which renders
|
|
706
830
|
"seq:ref": "C" ,
|
707
831
|
"seq:alt": "T" ,
|
708
832
|
"seq:maf": 0.0151 ,
|
709
|
-
"dp": 86
|
833
|
+
"dp": 86
|
710
834
|
};
|
711
835
|
```
|
712
836
|
|
713
837
|
Likewise for RDF output:
|
714
838
|
|
715
839
|
```sh
|
716
|
-
bio-vcf --template template/vcf2rdf.erb --filter 'r.info.
|
840
|
+
bio-vcf --template template/vcf2rdf.erb --filter 'r.info.sao==1' < dbsnp.vcf
|
717
841
|
```
|
718
842
|
|
719
|
-
renders the ERB template
|
843
|
+
renders the ERB template
|
720
844
|
|
721
845
|
```ruby
|
722
846
|
<%
|
723
|
-
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
847
|
+
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
724
848
|
%>
|
725
849
|
:<%= id %>
|
726
850
|
:query_id "<%= id %>",
|
@@ -728,12 +852,12 @@ renders the ERB template
|
|
728
852
|
seq:pos <%= rec.pos %> ,
|
729
853
|
seq:ref "<%= rec.ref %>" ,
|
730
854
|
seq:alt "<%= rec.alt[0] %>" ,
|
731
|
-
seq:maf <%= rec.info.maf[0] %> ,
|
855
|
+
seq:maf <%= (rec.info.maf[0]*100).round %> ,
|
732
856
|
seq:dp <%= rec.info.dp %> ,
|
733
857
|
db:vcf true .
|
734
858
|
```
|
735
859
|
|
736
|
-
into
|
860
|
+
into
|
737
861
|
|
738
862
|
```
|
739
863
|
:ch13_33703698_A
|
@@ -742,12 +866,91 @@ into
|
|
742
866
|
seq:pos 33703698 ,
|
743
867
|
seq:ref "C" ,
|
744
868
|
seq:alt "A" ,
|
745
|
-
seq:maf
|
869
|
+
seq:maf 16 ,
|
746
870
|
seq:dp 92 ,
|
747
871
|
db:vcf true .
|
748
872
|
```
|
749
873
|
|
750
|
-
Be creative! You can write templates for csv, HTML, XML, LaTeX, RDF, JSON, YAML, JSON-LD, etc. etc.!
|
874
|
+
Note the calculated field value for maf. Be creative! You can write templates for csv, HTML, XML, LaTeX, RDF, JSON, YAML, JSON-LD, etc. etc.!
|
875
|
+
|
876
|
+
### Metadata
|
877
|
+
|
878
|
+
Templates can also print data as a header of the JSON/YAML/RDF output. For this
|
879
|
+
use the '=' prefix with HEADER, BODY, FOOTER keywords in the template. A small example
|
880
|
+
can be
|
881
|
+
|
882
|
+
```Javascript
|
883
|
+
=HEADER
|
884
|
+
<% require 'json' %>
|
885
|
+
{ "HEADER": {
|
886
|
+
"options": <%= options.to_h.to_json %>,
|
887
|
+
"files": <%= ARGV %>,
|
888
|
+
"version": "<%= BIOVCF_VERSION %>"
|
889
|
+
},
|
890
|
+
"BODY":[
|
891
|
+
=BODY
|
892
|
+
{
|
893
|
+
"seq:chr": "<%= rec.chrom %>" ,
|
894
|
+
"seq:pos": <%= rec.pos %> ,
|
895
|
+
"seq:ref": "<%= rec.ref %>" ,
|
896
|
+
"seq:alt": "<%= rec.alt[0] %>" ,
|
897
|
+
"dp": <%= rec.info.dp %>
|
898
|
+
},
|
899
|
+
=FOOTER
|
900
|
+
]
|
901
|
+
}
|
902
|
+
```
|
903
|
+
|
904
|
+
with
|
905
|
+
|
906
|
+
```sh
|
907
|
+
bio-vcf --template template/vcf2json.erb < dbsnp.vcf
|
908
|
+
```
|
909
|
+
|
910
|
+
may generate something like
|
911
|
+
|
912
|
+
```Javascript
|
913
|
+
{ "HEADER": {
|
914
|
+
"options": {"show_help":false,"source":"https://github.com/CuppenResearch/bioruby-vcf","version":"0.8.1-pre3 (Pjotr Prins)","date":"2014-11-26 12:51:36 +0000","thread_lines":40000,"template":"template/vcf2json.erb","skip_header":true},
|
915
|
+
"files": [],
|
916
|
+
"version": "0.8.1-pre3"
|
917
|
+
},
|
918
|
+
"BODY":[
|
919
|
+
{
|
920
|
+
"seq:chr": "1" ,
|
921
|
+
"seq:pos": 883516 ,
|
922
|
+
"seq:ref": "G" ,
|
923
|
+
"seq:alt": "A" ,
|
924
|
+
"dp":
|
925
|
+
},
|
926
|
+
{
|
927
|
+
"seq:chr": "1" ,
|
928
|
+
"seq:pos": 891344 ,
|
929
|
+
"seq:ref": "G" ,
|
930
|
+
"seq:alt": "A" ,
|
931
|
+
"dp": ,
|
932
|
+
},
|
933
|
+
]
|
934
|
+
}
|
935
|
+
```
|
936
|
+
|
937
|
+
Note that the template is not smart enough to remove the final comma
|
938
|
+
from the last BODY element. To make it valid JSON that needs to be
|
939
|
+
removed. A future version may add a parameter to the BODY element or a
|
940
|
+
global rewrite function for this purpose. YAML and RDF have no such issue.
|
941
|
+
|
942
|
+
### Using full VCF header (meta) info
|
943
|
+
|
944
|
+
To get and put the full information from the header, simply use
|
945
|
+
vcf.meta.to_json. See ./template/vcf2json_full_header.erb for an
|
946
|
+
example. This meta information can also be used to output info fields
|
947
|
+
and sample values on the fly! For an example, see the template at
|
948
|
+
[./template/vcf2json_use_meta.erb](https://github.com/vcflib/bio-vcf/tree/master/template/vcf2json_use_meta.erb)
|
949
|
+
and the generated output at
|
950
|
+
[./test/data/regression/vcf2json_use_meta.ref](https://github.com/vcflib/bio-vcf/tree/master/test/data/regression/vcf2json_use_meta.ref).
|
951
|
+
|
952
|
+
This way, it is possible to write templates that can convert the content of
|
953
|
+
*any* VCF file without prior knowledge to JSON, RDF, etc.
|
751
954
|
|
752
955
|
## Statistics
|
753
956
|
|
@@ -761,7 +964,7 @@ Simple statistics are available for REF>ALT changes:
|
|
761
964
|
G>A 59 45%
|
762
965
|
C>T 30 23%
|
763
966
|
A>G 5 4%
|
764
|
-
C>G 5 4%
|
967
|
+
C>G 5 4%
|
765
968
|
C>A 5 4%
|
766
969
|
G>T 4 3%
|
767
970
|
T>C 4 3%
|
@@ -781,7 +984,10 @@ Simple statistics are available for REF>ALT changes:
|
|
781
984
|
|
782
985
|
## Other examples
|
783
986
|
|
784
|
-
For more examples see
|
987
|
+
For more exercises and examples see
|
988
|
+
[doc](https://github.com/vcflib/bio-vcf/tree/master/doc) directory
|
989
|
+
and the feature
|
990
|
+
[section](https://github.com/vcflib/bio-vcf/tree/master/features).
|
785
991
|
|
786
992
|
## API
|
787
993
|
|
@@ -812,27 +1018,102 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
812
1018
|
end
|
813
1019
|
```
|
814
1020
|
|
1021
|
+
### VCFFile
|
1022
|
+
|
1023
|
+
The class ```BioVcf::VCFfile``` wraps a file and provides an ```enum``` with the
|
1024
|
+
method each, which can be used as an iterator.
|
1025
|
+
|
1026
|
+
```ruby
|
1027
|
+
vcf_file = "dbsnp.vcf"
|
1028
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: false)
|
1029
|
+
it = vcf.each
|
1030
|
+
puts it.peek
|
1031
|
+
|
1032
|
+
vcf_file = "dbsnp.vcf.gz"
|
1033
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: true)
|
1034
|
+
it = vcf.each
|
1035
|
+
puts it.peek
|
1036
|
+
```
|
1037
|
+
|
815
1038
|
## Trouble shooting
|
816
1039
|
|
1040
|
+
### MRI supports threading
|
1041
|
+
|
817
1042
|
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
818
1043
|
in single threaded mode (for now).
|
819
1044
|
|
1045
|
+
### Set TMPDIR when running out of space
|
1046
|
+
|
820
1047
|
The multi-threading creates temporary files using the system TMPDIR.
|
821
1048
|
This behaviour can be overridden by setting the environment variable.
|
822
|
-
|
823
|
-
|
1049
|
+
|
1050
|
+
### Reorder filter on time out
|
1051
|
+
|
1052
|
+
Make sure to minimize expensive calculations by moving them
|
1053
|
+
backward. An 'and' statement is evaluated from left to right. With
|
1054
|
+
|
1055
|
+
```ruby
|
1056
|
+
fast_check and slow_check
|
1057
|
+
```
|
1058
|
+
|
1059
|
+
slow_check only gets executed if fast_check is true.
|
1060
|
+
|
1061
|
+
For more complex filters use lambda inside a conditional
|
1062
|
+
|
1063
|
+
```ruby
|
1064
|
+
( fast_check ? lambda { slow_check }.call : false )
|
1065
|
+
```
|
1066
|
+
|
1067
|
+
where slow_check is the slow section of your query, as is shown
|
1068
|
+
earlier in this document. Don't forget the .call!
|
1069
|
+
|
1070
|
+
### Reduce thread lines on timeout
|
1071
|
+
|
1072
|
+
Depending on your input data and the speed of the filters it may be useful to
|
1073
|
+
tweak the number of thread lines and/or to increase the timeout.
|
1074
|
+
|
1075
|
+
On really fast file systems for genome-wide sequencing try increasing
|
1076
|
+
--thread-lines to a value larger than 100_000. On the other hand if
|
1077
|
+
the computations are intensive (per line) reduce the number of
|
1078
|
+
thread-lines (try 10_000 and 1_000). If processes get killed that is
|
1079
|
+
the one to try.
|
1080
|
+
|
1081
|
+
For larger files, set the timeout to 600 or so, i.e. --timeout 600.
|
1082
|
+
|
1083
|
+
Different values may show different core use on a machine.
|
1084
|
+
|
1085
|
+
### Development
|
1086
|
+
|
1087
|
+
To run the tests from source
|
1088
|
+
|
1089
|
+
```sh
|
1090
|
+
bundle install --path vendor/bundle
|
1091
|
+
bundle exec rake
|
1092
|
+
```
|
1093
|
+
|
1094
|
+
### Debugging
|
1095
|
+
|
1096
|
+
To debug output use '-v --num-threads=1' for generating useful
|
1097
|
+
output. Also do not use the -i switch (ignore errors) when there
|
1098
|
+
are problems.
|
1099
|
+
|
1100
|
+
### Tmpdir contains (old) bio-vcf directories
|
1101
|
+
|
1102
|
+
Multi-threaded bio-vcf writes into a temporary directory during
|
1103
|
+
processing. When a process gets interrupted for some reason the
|
1104
|
+
temporary directory may remain.
|
824
1105
|
|
825
1106
|
## Project home page
|
826
1107
|
|
827
1108
|
For information on the source tree, documentation, examples, issues and
|
828
1109
|
how to contribute, see
|
829
1110
|
|
830
|
-
http://github.com/
|
1111
|
+
http://github.com/vcflib/bio-vcf
|
831
1112
|
|
832
1113
|
## Cite
|
833
1114
|
|
834
1115
|
If you use this software, please cite one of
|
835
|
-
|
1116
|
+
|
836
1117
|
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
837
1118
|
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
838
1119
|
|
@@ -842,5 +1123,4 @@ This Biogem is published at (http://biogems.info/index.html#bio-vcf)
|
|
842
1123
|
|
843
1124
|
## Copyright
|
844
1125
|
|
845
|
-
Copyright (c) 2014 Pjotr Prins. See LICENSE.txt for further details.
|
846
|
-
|
1126
|
+
Copyright (c) 2014-2020 Pjotr Prins. See LICENSE.txt for further details.
|