bio-vcf 0.8.0 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +4 -5
- data/Gemfile.lock +28 -65
- data/LICENSE.txt +1 -1
- data/README.md +387 -107
- data/RELEASE_NOTES.md +20 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +3 -40
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +176 -109
- data/bio-vcf.gemspec +14 -70
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +25 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +0 -9
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/bedfilter.rb +43 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/template.rb +75 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
- data/lib/bio-vcf/vcfheader.rb +146 -6
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +27 -3
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +19 -7
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/template/vcf2rdf_header.erb +24 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +65 -64
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f5d7a81871906abfffc93455b4d664d5755fe8d79312134eae94e84659506198
|
4
|
+
data.tar.gz: 8029269859aedd53c613ea9bbb17f951972b062060b5a40c22bdbe65c6c3dfa7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed231c3a918e5f9ab9cd8a618f3f25f0c39613ac934b496af334d77dabe64831ff08cfc722a467fc51ab8c583358ca21be769ba1d9654437d54e7d21b811ee2c
|
7
|
+
data.tar.gz: df49786c4f4aa5e3a3659c678fb66aeb4b7dd4bb575aacf34cc468663c18fa893502699d238b5034c308ff51a4dc05e0fadf929b923c1d646f61c3f07fef26c7
|
data/.travis.yml
CHANGED
@@ -1,13 +1,3 @@
|
|
1
1
|
language: ruby
|
2
|
-
rvm:
|
3
|
-
# - 1.9.3 <- No longer working
|
4
|
-
- 2.0.0
|
5
|
-
- 2.1.0
|
6
|
-
# - jruby-head
|
7
|
-
# - jruby-19mode # JRuby in 1.9 mode
|
8
|
-
# - 1.8.7
|
9
|
-
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
-
# - rbx-18mode
|
11
2
|
|
12
|
-
|
13
|
-
# script: bundle exec rspec spec
|
3
|
+
arch: arm64
|
data/Gemfile
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
-
# Add dependencies required to use your gem here.
|
3
|
-
# Example:
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
5
2
|
|
6
3
|
# Add dependencies to develop your gem here.
|
7
4
|
# Include everything needed to run rake, tests, features, etc.
|
8
5
|
group :development do
|
9
6
|
# gem "minitest"
|
7
|
+
gem "rake"
|
10
8
|
gem "rspec"
|
11
9
|
gem "cucumber"
|
12
|
-
gem "
|
13
|
-
gem "regressiontest", "~> 0.0.3"
|
10
|
+
gem "regressiontest", ">= 0.0.3"
|
14
11
|
end
|
12
|
+
|
13
|
+
|
data/Gemfile.lock
CHANGED
@@ -1,81 +1,44 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
addressable (2.3.5)
|
5
4
|
builder (3.2.2)
|
6
|
-
cucumber (1.
|
5
|
+
cucumber (2.1.0)
|
7
6
|
builder (>= 2.1.2)
|
7
|
+
cucumber-core (~> 1.3.0)
|
8
8
|
diff-lcs (>= 1.1.3)
|
9
|
-
|
9
|
+
gherkin3 (~> 3.1.0)
|
10
10
|
multi_json (>= 1.7.5, < 2.0)
|
11
|
-
multi_test (>= 0.
|
12
|
-
|
11
|
+
multi_test (>= 0.1.2)
|
12
|
+
cucumber-core (1.3.0)
|
13
|
+
gherkin3 (~> 3.1.0)
|
13
14
|
diff-lcs (1.2.5)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
gherkin (2.12.2-java)
|
19
|
-
multi_json (~> 1.3)
|
20
|
-
git (1.2.6)
|
21
|
-
github_api (0.11.3)
|
22
|
-
addressable (~> 2.3)
|
23
|
-
descendants_tracker (~> 0.0.1)
|
24
|
-
faraday (~> 0.8, < 0.10)
|
25
|
-
hashie (>= 1.2)
|
26
|
-
multi_json (>= 1.7.5, < 2.0)
|
27
|
-
nokogiri (~> 1.6.0)
|
28
|
-
oauth2
|
29
|
-
hashie (2.0.5)
|
30
|
-
highline (1.6.21)
|
31
|
-
jeweler (2.0.1)
|
32
|
-
builder
|
33
|
-
bundler (>= 1.0)
|
34
|
-
git (>= 1.2.5)
|
35
|
-
github_api
|
36
|
-
highline (>= 1.6.15)
|
37
|
-
nokogiri (>= 1.5.10)
|
38
|
-
rake
|
39
|
-
rdoc
|
40
|
-
json (1.8.1)
|
41
|
-
json (1.8.1-java)
|
42
|
-
jwt (0.1.11)
|
43
|
-
multi_json (>= 1.5)
|
44
|
-
mini_portile (0.5.2)
|
45
|
-
multi_json (1.9.0)
|
46
|
-
multi_test (0.0.3)
|
47
|
-
multi_xml (0.5.5)
|
48
|
-
multipart-post (2.0.0)
|
49
|
-
nokogiri (1.6.1)
|
50
|
-
mini_portile (~> 0.5.0)
|
51
|
-
nokogiri (1.6.1-java)
|
52
|
-
mini_portile (~> 0.5.0)
|
53
|
-
oauth2 (0.9.3)
|
54
|
-
faraday (>= 0.8, < 0.10)
|
55
|
-
jwt (~> 0.1.8)
|
56
|
-
multi_json (~> 1.3)
|
57
|
-
multi_xml (~> 0.5)
|
58
|
-
rack (~> 1.2)
|
59
|
-
rack (1.5.2)
|
60
|
-
rake (10.1.1)
|
61
|
-
rdoc (4.1.1)
|
62
|
-
json (~> 1.4)
|
15
|
+
gherkin3 (3.1.1)
|
16
|
+
multi_json (1.11.2)
|
17
|
+
multi_test (0.1.2)
|
18
|
+
rake (10.4.2)
|
63
19
|
regressiontest (0.0.3)
|
64
|
-
rspec (
|
65
|
-
rspec-core (~>
|
66
|
-
rspec-expectations (~>
|
67
|
-
rspec-mocks (~>
|
68
|
-
rspec-core (
|
69
|
-
|
70
|
-
|
71
|
-
|
20
|
+
rspec (3.3.0)
|
21
|
+
rspec-core (~> 3.3.0)
|
22
|
+
rspec-expectations (~> 3.3.0)
|
23
|
+
rspec-mocks (~> 3.3.0)
|
24
|
+
rspec-core (3.3.2)
|
25
|
+
rspec-support (~> 3.3.0)
|
26
|
+
rspec-expectations (3.3.1)
|
27
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
28
|
+
rspec-support (~> 3.3.0)
|
29
|
+
rspec-mocks (3.3.2)
|
30
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
31
|
+
rspec-support (~> 3.3.0)
|
32
|
+
rspec-support (3.3.0)
|
72
33
|
|
73
34
|
PLATFORMS
|
74
|
-
java
|
75
35
|
ruby
|
76
36
|
|
77
37
|
DEPENDENCIES
|
78
38
|
cucumber
|
79
|
-
|
80
|
-
regressiontest (
|
39
|
+
rake
|
40
|
+
regressiontest (>= 0.0.3)
|
81
41
|
rspec
|
42
|
+
|
43
|
+
BUNDLED WITH
|
44
|
+
1.10.6
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,49 +1,68 @@
|
|
1
1
|
# bio-vcf
|
2
2
|
|
3
|
-
[](http://travis-ci.org/vcflib/bio-vcf)
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
|
6
|
+
## Bio-vcf
|
7
|
+
|
8
|
+
Bio-vcf is a new generation VCF parser, filter and converter. Bio-vcf
|
9
|
+
is not only very fast for genome-wide (WGS) data, it also comes with a
|
10
|
+
really nice filtering, evaluation and rewrite language and it can
|
11
|
+
output any type of textual data, including VCF header and contents in
|
12
|
+
RDF and JSON.
|
13
|
+
|
14
|
+
So, why would you use bio-vcf over other parsers? Because
|
9
15
|
|
10
16
|
1. Bio-vcf is fast and scales on multi-core computers
|
11
17
|
2. Bio-vcf has an expressive filtering and evaluation language
|
12
18
|
3. Bio-vcf has great multi-sample support
|
13
19
|
4. Bio-vcf has multiple global filters and sample filters
|
14
20
|
5. Bio-vcf can access any VCF format
|
15
|
-
6. Bio-vcf can
|
16
|
-
7. Bio-vcf
|
17
|
-
8. Bio-vcf
|
18
|
-
9. Bio-vcf has
|
19
|
-
10. Bio-vcf
|
21
|
+
6. Bio-vcf can parse and query the VCF header (META data)
|
22
|
+
7. Bio-vcf can do calculations on fields
|
23
|
+
8. Bio-vcf allows for genotype processing
|
24
|
+
9. Bio-vcf has support for set analysis
|
25
|
+
10. Bio-vcf has sane error handling
|
26
|
+
11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
|
27
|
+
12. Bio-vcf has soft filters
|
28
|
+
|
29
|
+
Bio-vcf has better performance than other tools because of lazy
|
30
|
+
parsing, multi-threading, and useful combinations of (fancy) command
|
31
|
+
line filtering (who says Ruby is slow?). Adding cores, bio-vcf just
|
32
|
+
does better. The more complicated the filters, the larger the
|
33
|
+
gain. First a base line test to show IO performance
|
20
34
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
35
|
+
```sh
|
36
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
|
37
|
+
1987143 15897724 1003214613
|
38
|
+
real 0m7.823s
|
39
|
+
user 0m7.002s
|
40
|
+
sys 0m2.972s
|
41
|
+
```
|
42
|
+
|
43
|
+
Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
|
27
44
|
|
28
45
|
```sh
|
29
|
-
time
|
30
|
-
real
|
31
|
-
user
|
32
|
-
sys
|
46
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
|
47
|
+
real 0m32.491s
|
48
|
+
user 2m34.767s
|
49
|
+
sys 0m12.733s
|
33
50
|
```
|
34
51
|
|
35
|
-
|
52
|
+
The same with SnpSift v4.0 takes
|
36
53
|
|
37
54
|
```sh
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
55
|
+
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > /dev/null
|
56
|
+
real 12m36.121s
|
57
|
+
user 12m53.273s
|
58
|
+
sys 0m9.913s
|
42
59
|
```
|
43
60
|
|
44
|
-
|
45
|
-
|
46
|
-
|
61
|
+
This means that on this machine bio-vcf is 24x faster than SnpSift
|
62
|
+
even for a simple filter. In fact, bio-vcf is perfect for complex
|
63
|
+
filters and parsing large data files on powerful machines. Parsing a
|
64
|
+
650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
|
65
|
+
BED format on a 16 core machine takes
|
47
66
|
|
48
67
|
```sh
|
49
68
|
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
@@ -52,29 +71,39 @@ a 16 core machine takes
|
|
52
71
|
sys 0m5.039s
|
53
72
|
```
|
54
73
|
|
55
|
-
which shows
|
56
|
-
gzip compressed VCF files of 30+ Gb
|
74
|
+
which shows decent core utilisation (10x). Running
|
75
|
+
gzip compressed VCF files of 30+ Gb has similar performance gains.
|
76
|
+
|
77
|
+
To view some complex filters on an 80Gb SNP file check out a
|
78
|
+
[GTEx exercise](https://github.com/vcflib/bio-vcf/blob/master/doc/GTEx_reduce.md).
|
57
79
|
|
58
|
-
Use zcat to
|
59
|
-
|
80
|
+
Use zcat (or even better pigz which is multi-core itself) to pipe such
|
81
|
+
gzipped (vcf.gz) files into bio-vcf, e.g.
|
60
82
|
|
61
83
|
```sh
|
62
84
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
63
|
-
--sfilter '!s.empty? and s.dp>20'
|
85
|
+
--sfilter '!s.empty? and s.dp>20'
|
64
86
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
65
87
|
```
|
66
88
|
|
67
|
-
bio-vcf comes with a sensible parser definition language
|
68
|
-
Ruby),
|
89
|
+
bio-vcf comes with a sensible parser definition language
|
90
|
+
(interestingly it is 100% Ruby), an embedded Ragel parser for INFO and
|
91
|
+
FORMAT header definitions, as well as primitives for set analysis. Few
|
69
92
|
assumptions are made about the actual contents of the VCF file (field
|
70
|
-
names are resolved on the fly), so bio-vcf should
|
71
|
-
|
93
|
+
names are resolved on the fly), so bio-vcf should work with all VCF
|
94
|
+
files.
|
72
95
|
|
73
|
-
To fetch all entries where all samples have depth larger than 20
|
74
|
-
a sample filter
|
96
|
+
To fetch all entries where all samples have depth larger than 20 and
|
97
|
+
filter set to PASS use a sample filter
|
75
98
|
|
76
99
|
```ruby
|
77
|
-
bio-vcf --sfilter 'sample.dp>20' < file.vcf
|
100
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter=="PASS"' < file.vcf
|
101
|
+
```
|
102
|
+
|
103
|
+
or with a regex
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
bio-vcf --sfilter 'sample.dp>20 and rec.filter !~ /LowQD/' < file.vcf
|
78
107
|
```
|
79
108
|
|
80
109
|
To only filter on some samples number 0 and 3:
|
@@ -87,7 +116,7 @@ Where 's.dp' is the shorter name for 'sample.dp'.
|
|
87
116
|
|
88
117
|
It is also possible to specify sample names, or info fields:
|
89
118
|
|
90
|
-
For example, to filter somatic data
|
119
|
+
For example, to filter somatic data
|
91
120
|
|
92
121
|
```ruby
|
93
122
|
bio-vcf --filter 'rec.info.dp>5 and rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
|
@@ -140,7 +169,8 @@ bio-vcf -i --seval 's.ad[1]'
|
|
140
169
|
1 10303 25 31 28 32 17 23 22
|
141
170
|
```
|
142
171
|
|
143
|
-
To calculate alt frequencies from s.ad
|
172
|
+
To calculate percentage non-reference (PNR) alt frequencies from s.ad
|
173
|
+
which is sample (alt dp)/(ref dp + alt dp)
|
144
174
|
|
145
175
|
```ruby
|
146
176
|
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
@@ -214,7 +244,7 @@ The VCF format is commonly used for variant calling between NGS
|
|
214
244
|
samples. The fast parser needs to carry some state, recorded for each
|
215
245
|
file in VcfHeader, which contains the VCF file header. Individual
|
216
246
|
lines (variant calls) first go through a raw parser returning an array
|
217
|
-
of fields. Further (lazy) parsing is handled through VcfRecord.
|
247
|
+
of fields. Further (lazy) parsing is handled through VcfRecord.
|
218
248
|
|
219
249
|
At this point the filter is pretty generic with multi-sample support.
|
220
250
|
If something is not working, check out the feature descriptions and
|
@@ -223,22 +253,15 @@ example of a VCF statement you need to work on.
|
|
223
253
|
|
224
254
|
## Installation
|
225
255
|
|
226
|
-
|
227
|
-
a performance improvement. Bio-vcf will show the Ruby version when
|
228
|
-
typing the command 'bio-vcf -h'.
|
256
|
+
Bio-vcf has no dependencies other than Ruby.
|
229
257
|
|
230
|
-
To
|
258
|
+
To install bio-vcf with Ruby gems:
|
231
259
|
|
232
260
|
```sh
|
233
261
|
gem install bio-vcf
|
234
262
|
bio-vcf -h
|
235
263
|
```
|
236
264
|
|
237
|
-
For multi-core also install the parallel gem
|
238
|
-
|
239
|
-
```sh
|
240
|
-
gem install parallel
|
241
|
-
```
|
242
265
|
|
243
266
|
## Command line interface (CLI)
|
244
267
|
|
@@ -263,56 +286,72 @@ Get the sample names
|
|
263
286
|
NORMAL,TUMOR
|
264
287
|
```
|
265
288
|
|
289
|
+
Alternatively use the command line switch for --names, e.g.
|
290
|
+
|
291
|
+
```ruby
|
292
|
+
bio-vcf --names < file.vcf
|
293
|
+
NORMAL,TUMOR
|
294
|
+
```
|
295
|
+
|
296
|
+
Get information from the header (META)
|
297
|
+
|
298
|
+
```ruby
|
299
|
+
bio-vcf -q --skip-header --eval-once 'header.meta["GATKCommandLine"]' < gatk_exome.vcf
|
300
|
+
```
|
301
|
+
|
266
302
|
The 'fields' array contains unprocessed data (strings). Print first
|
267
303
|
five raw fields
|
268
304
|
|
269
305
|
```ruby
|
270
|
-
bio-vcf --eval 'fields[0..4]' < file.vcf
|
306
|
+
bio-vcf --eval 'fields[0..4]' < file.vcf
|
271
307
|
```
|
272
308
|
|
273
309
|
Add a filter to display the fields on chromosome 12
|
274
310
|
|
275
311
|
```ruby
|
276
|
-
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
312
|
+
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
277
313
|
```
|
278
314
|
|
279
315
|
It gets better when we start using processed data, represented by an
|
280
316
|
object named 'rec'. Position is a value, so we can filter a range
|
281
317
|
|
282
318
|
```ruby
|
283
|
-
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
319
|
+
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
284
320
|
```
|
285
321
|
|
286
322
|
The shorter name for 'rec.chrom' is 'r.chrom', so you may write
|
287
323
|
|
288
324
|
```ruby
|
289
|
-
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
325
|
+
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
290
326
|
```
|
291
327
|
|
292
328
|
To ignore and continue parsing on missing data use the
|
293
329
|
--ignore-missing (-i) and or --quiet (-q) switches
|
294
330
|
|
295
331
|
```ruby
|
296
|
-
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
332
|
+
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
297
333
|
```
|
298
334
|
|
299
335
|
Info fields are referenced by
|
300
336
|
|
301
337
|
```ruby
|
302
|
-
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
338
|
+
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
303
339
|
```
|
304
340
|
|
305
|
-
|
341
|
+
(alternatively you can use the indexed rec.info['DP'] and list INFO fields with
|
342
|
+
rec.info.fields).
|
343
|
+
|
344
|
+
Subfields defined by rec.format:
|
306
345
|
|
307
346
|
```ruby
|
308
|
-
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
347
|
+
bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
|
309
348
|
```
|
310
349
|
|
311
350
|
Output
|
312
351
|
|
313
352
|
```ruby
|
314
|
-
bio-vcf --filter 'rec.tumor.gq>30'
|
315
|
-
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
353
|
+
bio-vcf --filter 'rec.tumor.gq>30'
|
354
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
316
355
|
< file.vcf
|
317
356
|
```
|
318
357
|
|
@@ -326,26 +365,26 @@ Show the count of the bases that were scored as somatic
|
|
326
365
|
Actually, we have a convenience implementation for bcount, so this is the same
|
327
366
|
|
328
367
|
```ruby
|
329
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
368
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
330
369
|
< file.vcf
|
331
370
|
```
|
332
371
|
|
333
372
|
Filter on the somatic results that were scored at least 4 times
|
334
|
-
|
373
|
+
|
335
374
|
```ruby
|
336
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
375
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
|
337
376
|
```
|
338
377
|
|
339
378
|
Similar for base quality scores
|
340
379
|
|
341
380
|
```ruby
|
342
|
-
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
381
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
343
382
|
```
|
344
383
|
|
345
384
|
Filter out on sample values
|
346
385
|
|
347
386
|
```ruby
|
348
|
-
bio-vcf --sfilter 's.dp>20' < test.vcf
|
387
|
+
bio-vcf --sfilter 's.dp>20' < test.vcf
|
349
388
|
```
|
350
389
|
|
351
390
|
To filter missing on samples:
|
@@ -360,6 +399,23 @@ or for all
|
|
360
399
|
bio-vcf --filter "rec.missing_samples?" < file.vcf
|
361
400
|
```
|
362
401
|
|
402
|
+
To set a soft filter, i.e. the filter column is updated
|
403
|
+
|
404
|
+
```sh
|
405
|
+
bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
|
406
|
+
```
|
407
|
+
|
408
|
+
may render something like
|
409
|
+
|
410
|
+
```
|
411
|
+
1 46527674 4 LowQD
|
412
|
+
1 108417572 4 LowQD
|
413
|
+
1 155449089 4 LowQD
|
414
|
+
1 169847826 4 LowQD
|
415
|
+
1 203098164 3 LowQD
|
416
|
+
2 39213209 4 LowQD
|
417
|
+
```
|
418
|
+
|
363
419
|
Likewise you can check for record validity
|
364
420
|
|
365
421
|
```sh
|
@@ -410,17 +466,17 @@ Even shorter r is an alias for rec
|
|
410
466
|
Note: special functions are not yet implemented! Look below
|
411
467
|
for genotype processing which has indexing in 'gti'.
|
412
468
|
|
413
|
-
Sometime you want to use a special function in a filter. For
|
414
|
-
example percentage variant reads can be defined as [a,c,g,t]
|
415
|
-
with frequencies against sample read depth (dp) as
|
416
|
-
[0,0.03,0.47,0.50]. Filtering would with a special function,
|
469
|
+
Sometimes you want to use a special function in a filter. For
|
470
|
+
example percentage variant reads can be defined as [a,c,g,t]
|
471
|
+
with frequencies against sample read depth (dp) as
|
472
|
+
[0,0.03,0.47,0.50]. Filtering would work with a special function,
|
417
473
|
which we named freq
|
418
474
|
|
419
475
|
```sh
|
420
476
|
bio-vcf --sfilter "s.freq(2)>0.30" < file.vcf
|
421
477
|
```
|
422
478
|
|
423
|
-
which is equal to
|
479
|
+
which is equal to
|
424
480
|
|
425
481
|
```sh
|
426
482
|
bio-vcf --sfilter "s.freq.g>0.30" < file.vcf
|
@@ -440,7 +496,7 @@ ref should always be identical across samples.
|
|
440
496
|
|
441
497
|
## DbSNP
|
442
498
|
|
443
|
-
One clinical variant DbSNP example
|
499
|
+
One clinical variant DbSNP example
|
444
500
|
|
445
501
|
```sh
|
446
502
|
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]' < clinvar_20140303.vcf
|
@@ -465,16 +521,16 @@ renders
|
|
465
521
|
|
466
522
|
bio-vcf allows for set analysis. With the complement filter, for
|
467
523
|
example, samples are selected that evaluate to true, all others should
|
468
|
-
evaluate to false. For this we create three filters, one for all
|
524
|
+
evaluate to false. For this we create three filters, one for all
|
469
525
|
samples that are included (the --ifilter or -if), for all samples that
|
470
526
|
are excluded (the --efilter or -ef) and for any sample (the --sfilter
|
471
527
|
or -sf). So i=include (OR filter), e=exclude and s=any sample (AND
|
472
|
-
filter).
|
528
|
+
filter).
|
473
529
|
|
474
530
|
The equivalent of the union filter is by using the --sfilter, so
|
475
531
|
|
476
532
|
```sh
|
477
|
-
bio-vcf --sfilter 's.dp>20'
|
533
|
+
bio-vcf --sfilter 's.dp>20'
|
478
534
|
```
|
479
535
|
|
480
536
|
Filters DP on all samples and is true if all samples match the
|
@@ -482,7 +538,7 @@ criterium (AND). To filter on a subset you can add a
|
|
482
538
|
selector
|
483
539
|
|
484
540
|
```sh
|
485
|
-
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
541
|
+
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
486
542
|
```
|
487
543
|
|
488
544
|
For set analysis there are the additional ifilter (include) and
|
@@ -502,7 +558,7 @@ values
|
|
502
558
|
|
503
559
|
The equivalent of the complement filter is by specifying what samples
|
504
560
|
to include, here with a regex and define filters on the included
|
505
|
-
and excluded samples (the ones not in ifilter-samples) and the
|
561
|
+
and excluded samples (the ones not in ifilter-samples) and the
|
506
562
|
|
507
563
|
```sh
|
508
564
|
./bin/bio-vcf -i --sfilter 's.dp>20' --ifilter-samples 2,4 --ifilter 's.gt==r.s1t1.gt'
|
@@ -523,7 +579,7 @@ To print out the GT's add --seval
|
|
523
579
|
To set an additional filter on the excluded samples:
|
524
580
|
|
525
581
|
```sh
|
526
|
-
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
582
|
+
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
527
583
|
```
|
528
584
|
|
529
585
|
Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
|
@@ -536,15 +592,15 @@ In the near future it is also possible to select samples on a regex (here
|
|
536
592
|
select all samples where the name starts with s3)
|
537
593
|
|
538
594
|
```sh
|
539
|
-
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
595
|
+
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
540
596
|
```
|
541
597
|
|
542
598
|
```sh
|
543
|
-
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
599
|
+
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
544
600
|
--set-intersect include=true
|
545
|
-
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
601
|
+
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
546
602
|
--set-catesian one in include=true, rest=false
|
547
|
-
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
603
|
+
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
548
604
|
```
|
549
605
|
|
550
606
|
With the filter commands you can use --ignore-missing to skip errors.
|
@@ -567,7 +623,7 @@ results in a string value
|
|
567
623
|
to access components of the genotype field we can use standard Ruby
|
568
624
|
|
569
625
|
```ruby
|
570
|
-
bio-vcf --seval 's.gt.split(/\//)[0]'
|
626
|
+
bio-vcf --seval 's.gt.split(/\//)[0]'
|
571
627
|
1 10665 . . 0 0 . 0 0
|
572
628
|
1 10694 . . 1 1 . . .
|
573
629
|
1 12783 0 0 0 0 0 0 0
|
@@ -578,7 +634,7 @@ or special functions, such as 'gti' which gives the genotype as an
|
|
578
634
|
indexed value array
|
579
635
|
|
580
636
|
```ruby
|
581
|
-
bio-vcf --seval 's.gti[0]'
|
637
|
+
bio-vcf --seval 's.gti[0]'
|
582
638
|
1 10665 0 0 0 0
|
583
639
|
1 10694 1 1
|
584
640
|
1 12783 0 0 0 0 0 0 0
|
@@ -588,7 +644,7 @@ indexed value array
|
|
588
644
|
and 'gts' as a nucleotide string array
|
589
645
|
|
590
646
|
```ruby
|
591
|
-
bio-vcf --seval 's.gts
|
647
|
+
bio-vcf --seval 's.gts'
|
592
648
|
1 10665 C C C C
|
593
649
|
1 10694 G G
|
594
650
|
1 12783 G G G G G G G
|
@@ -597,6 +653,28 @@ and 'gts' as a nucleotide string array
|
|
597
653
|
|
598
654
|
where gts represents the indexed genotype on [ref] + [alt].
|
599
655
|
|
656
|
+
To convert combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
|
657
|
+
1/1 -> 2, is useful for indexed fields giving information on, for
|
658
|
+
example significance, use
|
659
|
+
|
660
|
+
```ruby
|
661
|
+
bio-vcf --seval '!s.empty? and s.gtindex'
|
662
|
+
11 58949455 0 1
|
663
|
+
11 65481082 0 1
|
664
|
+
11 94180424 0 1
|
665
|
+
11 121036021 0 1
|
666
|
+
```
|
667
|
+
|
668
|
+
Now you can index other fields, e.g. GL
|
669
|
+
|
670
|
+
```ruby
|
671
|
+
./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
|
672
|
+
1 900057 1.0 1.0 0.994 1.0 1.0 -1 0.999 1.0 0.997 -1 0.994 0.989 -1 0.991 -1 0.972 0.992 1.0
|
673
|
+
```
|
674
|
+
|
675
|
+
shows a number of SNPs have been scored with high significance and a
|
676
|
+
number are missing, here marked as -1.
|
677
|
+
|
600
678
|
These values can also be used in filters and output allele depth, for
|
601
679
|
example
|
602
680
|
|
@@ -618,6 +696,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
|
|
618
696
|
1 13757 47 47 4 47 47 4 47
|
619
697
|
```
|
620
698
|
|
699
|
+
## Sample counting
|
700
|
+
|
701
|
+
Note, the use of lambda allows for sophisticated queries. You may need
|
702
|
+
some expert advice here.
|
703
|
+
|
704
|
+
To count valid genotype fields in samples you can do something like
|
705
|
+
|
706
|
+
```ruby
|
707
|
+
bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
|
708
|
+
```
|
709
|
+
|
710
|
+
A similar complex count would be
|
711
|
+
|
712
|
+
```ruby
|
713
|
+
bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
|
714
|
+
```
|
715
|
+
|
716
|
+
which tests for perfect SNPs scored (for example).
|
717
|
+
|
718
|
+
## Reorder filter with lambda
|
719
|
+
|
720
|
+
Sometimes it pays to reorder the filter using a lambda. This is one
|
721
|
+
example where the greedy sample counts are done only for those
|
722
|
+
samples that match the other criteria:
|
723
|
+
|
724
|
+
```ruby
|
725
|
+
./bin/bio-vcf --num-threads=1 --filter '(r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false'
|
726
|
+
```
|
621
727
|
|
622
728
|
## Modify VCF files
|
623
729
|
|
@@ -633,6 +739,23 @@ To remove/select 3 samples:
|
|
633
739
|
bio-vcf --samples 0,1,3 < mytest.vcf
|
634
740
|
```
|
635
741
|
|
742
|
+
You can also select samples by name (as long as they do not contain
|
743
|
+
spaces)
|
744
|
+
|
745
|
+
|
746
|
+
```sh
|
747
|
+
bio-vcf --names < mytest.vcf
|
748
|
+
Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
749
|
+
bio-vcf --samples "Original,s1t1,s3t1" < mytest.vcf
|
750
|
+
```
|
751
|
+
|
752
|
+
|
753
|
+
Filter on a BED file and annotate the gene name in the resulting VCF
|
754
|
+
|
755
|
+
```sh
|
756
|
+
bio-vcf -v --bed test/data/input/test.bed --rewrite 'rec.info["gene"]=bed[3]' < test/data/input/somaticsniper.vcf
|
757
|
+
```
|
758
|
+
|
636
759
|
## RDF output
|
637
760
|
|
638
761
|
You can use --rdf for turtle RDF output from simple one-liners, note the use of --id and
|
@@ -673,11 +796,11 @@ Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert t
|
|
673
796
|
|
674
797
|
## Templates
|
675
798
|
|
676
|
-
To have more output options
|
799
|
+
To have more output options bio-vcf can use an [ERB
|
677
800
|
template](http://www.stuartellis.eu/articles/erb/) for every match. This is a
|
678
801
|
very flexible option that can output textual formats such as JSON, YAML, HTML
|
679
802
|
and RDF. Examples are provided in
|
680
|
-
[./templates](https://github.com/
|
803
|
+
[./templates](https://github.com/vcflib/bio-vcf/templates/). A JSON
|
681
804
|
template could be
|
682
805
|
|
683
806
|
```Javascript
|
@@ -687,14 +810,15 @@ template could be
|
|
687
810
|
"seq:ref": "<%= rec.ref %>" ,
|
688
811
|
"seq:alt": "<%= rec.alt[0] %>" ,
|
689
812
|
"seq:maf": <%= rec.info.maf[0] %> ,
|
690
|
-
"dp": <%= rec.info.dp %>
|
813
|
+
"dp": <%= rec.info.dp %>
|
691
814
|
};
|
692
815
|
```
|
693
816
|
|
694
|
-
To get JSON, run with something like
|
817
|
+
To get JSON, run with something like (combining
|
818
|
+
with a filter)
|
695
819
|
|
696
820
|
```sh
|
697
|
-
bio-vcf --template template/vcf2json.erb --filter 'r.info.
|
821
|
+
bio-vcf --template template/vcf2json.erb --filter 'r.info.sao==1' < dbsnp.vcf
|
698
822
|
```
|
699
823
|
|
700
824
|
which renders
|
@@ -706,21 +830,21 @@ which renders
|
|
706
830
|
"seq:ref": "C" ,
|
707
831
|
"seq:alt": "T" ,
|
708
832
|
"seq:maf": 0.0151 ,
|
709
|
-
"dp": 86
|
833
|
+
"dp": 86
|
710
834
|
};
|
711
835
|
```
|
712
836
|
|
713
837
|
Likewise for RDF output:
|
714
838
|
|
715
839
|
```sh
|
716
|
-
bio-vcf --template template/vcf2rdf.erb --filter 'r.info.
|
840
|
+
bio-vcf --template template/vcf2rdf.erb --filter 'r.info.sao==1' < dbsnp.vcf
|
717
841
|
```
|
718
842
|
|
719
|
-
renders the ERB template
|
843
|
+
renders the ERB template
|
720
844
|
|
721
845
|
```ruby
|
722
846
|
<%
|
723
|
-
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
847
|
+
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
724
848
|
%>
|
725
849
|
:<%= id %>
|
726
850
|
:query_id "<%= id %>",
|
@@ -728,12 +852,12 @@ renders the ERB template
|
|
728
852
|
seq:pos <%= rec.pos %> ,
|
729
853
|
seq:ref "<%= rec.ref %>" ,
|
730
854
|
seq:alt "<%= rec.alt[0] %>" ,
|
731
|
-
seq:maf <%= rec.info.maf[0] %> ,
|
855
|
+
seq:maf <%= (rec.info.maf[0]*100).round %> ,
|
732
856
|
seq:dp <%= rec.info.dp %> ,
|
733
857
|
db:vcf true .
|
734
858
|
```
|
735
859
|
|
736
|
-
into
|
860
|
+
into
|
737
861
|
|
738
862
|
```
|
739
863
|
:ch13_33703698_A
|
@@ -742,12 +866,91 @@ into
|
|
742
866
|
seq:pos 33703698 ,
|
743
867
|
seq:ref "C" ,
|
744
868
|
seq:alt "A" ,
|
745
|
-
seq:maf
|
869
|
+
seq:maf 16 ,
|
746
870
|
seq:dp 92 ,
|
747
871
|
db:vcf true .
|
748
872
|
```
|
749
873
|
|
750
|
-
Be creative! You can write templates for csv, HTML, XML, LaTeX, RDF, JSON, YAML, JSON-LD, etc. etc.!
|
874
|
+
Note the calculated field value for maf. Be creative! You can write templates for csv, HTML, XML, LaTeX, RDF, JSON, YAML, JSON-LD, etc. etc.!
|
875
|
+
|
876
|
+
### Metadata
|
877
|
+
|
878
|
+
Templates can also print data as a header of the JSON/YAML/RDF output. For this
|
879
|
+
use the '=' prefix with HEADER, BODY, FOOTER keywords in the template. A small example
|
880
|
+
can be
|
881
|
+
|
882
|
+
```Javascript
|
883
|
+
=HEADER
|
884
|
+
<% require 'json' %>
|
885
|
+
{ "HEADER": {
|
886
|
+
"options": <%= options.to_h.to_json %>,
|
887
|
+
"files": <%= ARGV %>,
|
888
|
+
"version": "<%= BIOVCF_VERSION %>"
|
889
|
+
},
|
890
|
+
"BODY":[
|
891
|
+
=BODY
|
892
|
+
{
|
893
|
+
"seq:chr": "<%= rec.chrom %>" ,
|
894
|
+
"seq:pos": <%= rec.pos %> ,
|
895
|
+
"seq:ref": "<%= rec.ref %>" ,
|
896
|
+
"seq:alt": "<%= rec.alt[0] %>" ,
|
897
|
+
"dp": <%= rec.info.dp %>
|
898
|
+
},
|
899
|
+
=FOOTER
|
900
|
+
]
|
901
|
+
}
|
902
|
+
```
|
903
|
+
|
904
|
+
with
|
905
|
+
|
906
|
+
```sh
|
907
|
+
bio-vcf --template template/vcf2json.erb < dbsnp.vcf
|
908
|
+
```
|
909
|
+
|
910
|
+
may generate something like
|
911
|
+
|
912
|
+
```Javascript
|
913
|
+
{ "HEADER": {
|
914
|
+
"options": {"show_help":false,"source":"https://github.com/CuppenResearch/bioruby-vcf","version":"0.8.1-pre3 (Pjotr Prins)","date":"2014-11-26 12:51:36 +0000","thread_lines":40000,"template":"template/vcf2json.erb","skip_header":true},
|
915
|
+
"files": [],
|
916
|
+
"version": "0.8.1-pre3"
|
917
|
+
},
|
918
|
+
"BODY":[
|
919
|
+
{
|
920
|
+
"seq:chr": "1" ,
|
921
|
+
"seq:pos": 883516 ,
|
922
|
+
"seq:ref": "G" ,
|
923
|
+
"seq:alt": "A" ,
|
924
|
+
"dp":
|
925
|
+
},
|
926
|
+
{
|
927
|
+
"seq:chr": "1" ,
|
928
|
+
"seq:pos": 891344 ,
|
929
|
+
"seq:ref": "G" ,
|
930
|
+
"seq:alt": "A" ,
|
931
|
+
"dp": ,
|
932
|
+
},
|
933
|
+
]
|
934
|
+
}
|
935
|
+
```
|
936
|
+
|
937
|
+
Note that the template is not smart enough to remove the final comma
|
938
|
+
from the last BODY element. To make it valid JSON that needs to be
|
939
|
+
removed. A future version may add a parameter to the BODY element or a
|
940
|
+
global rewrite function for this purpose. YAML and RDF have no such issue.
|
941
|
+
|
942
|
+
### Using full VCF header (meta) info
|
943
|
+
|
944
|
+
To get and put the full information from the header, simply use
|
945
|
+
vcf.meta.to_json. See ./template/vcf2json_full_header.erb for an
|
946
|
+
example. This meta information can also be used to output info fields
|
947
|
+
and sample values on the fly! For an example, see the template at
|
948
|
+
[./template/vcf2json_use_meta.erb](https://github.com/vcflib/bio-vcf/tree/master/template/vcf2json_use_meta.erb)
|
949
|
+
and the generated output at
|
950
|
+
[./test/data/regression/vcf2json_use_meta.ref](https://github.com/vcflib/bio-vcf/tree/master/test/data/regression/vcf2json_use_meta.ref).
|
951
|
+
|
952
|
+
This way, it is possible to write templates that can convert the content of
|
953
|
+
*any* VCF file without prior knowledge to JSON, RDF, etc.
|
751
954
|
|
752
955
|
## Statistics
|
753
956
|
|
@@ -761,7 +964,7 @@ Simple statistics are available for REF>ALT changes:
|
|
761
964
|
G>A 59 45%
|
762
965
|
C>T 30 23%
|
763
966
|
A>G 5 4%
|
764
|
-
C>G 5 4%
|
967
|
+
C>G 5 4%
|
765
968
|
C>A 5 4%
|
766
969
|
G>T 4 3%
|
767
970
|
T>C 4 3%
|
@@ -781,7 +984,10 @@ Simple statistics are available for REF>ALT changes:
|
|
781
984
|
|
782
985
|
## Other examples
|
783
986
|
|
784
|
-
For more examples see
|
987
|
+
For more exercises and examples see the
|
988
|
+
[doc](https://github.com/vcflib/bio-vcf/tree/master/doc) directory
|
989
|
+
and the feature
|
990
|
+
[section](https://github.com/vcflib/bio-vcf/tree/master/features).
|
785
991
|
|
786
992
|
## API
|
787
993
|
|
@@ -812,27 +1018,102 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
812
1018
|
end
|
813
1019
|
```
|
814
1020
|
|
1021
|
+
### VCFFile
|
1022
|
+
|
1023
|
+
The class ```BioVcf::VCFfile``` wraps a file and provides an ```enum``` with the
|
1024
|
+
method each, which can be used as an iterator.
|
1025
|
+
|
1026
|
+
```ruby
|
1027
|
+
vcf_file = "dbsnp.vcf"
|
1028
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: false)
|
1029
|
+
it = vcf.each
|
1030
|
+
puts it.peek
|
1031
|
+
|
1032
|
+
vcf_file = "dbsnp.vcf.gz"
|
1033
|
+
vcf = BioVcf::VCFfile.new(file: vcf_file, is_gz: true)
|
1034
|
+
it = vcf.each
|
1035
|
+
puts it.peek
|
1036
|
+
```
|
1037
|
+
|
815
1038
|
## Trouble shooting
|
816
1039
|
|
1040
|
+
### MRI supports threading
|
1041
|
+
|
817
1042
|
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
818
1043
|
in single threaded mode (for now).
|
819
1044
|
|
1045
|
+
### Set TMPDIR when running out of space
|
1046
|
+
|
820
1047
|
The multi-threading creates temporary files using the system TMPDIR.
|
821
1048
|
This behaviour can be overridden by setting the environment variable.
|
822
|
-
|
823
|
-
|
1049
|
+
|
1050
|
+
### Reorder filter on time out
|
1051
|
+
|
1052
|
+
Make sure to minimize expensive calculations by moving them
|
1053
|
+
backward. An 'and' statement is evaluated from left to right. With
|
1054
|
+
|
1055
|
+
```ruby
|
1056
|
+
fast_check and slow_check
|
1057
|
+
```
|
1058
|
+
|
1059
|
+
slow_check only gets executed if fast_check is true.
|
1060
|
+
|
1061
|
+
For more complex filters use lambda inside a conditional
|
1062
|
+
|
1063
|
+
```ruby
|
1064
|
+
( fast_check ? lambda { slow_check }.call : false )
|
1065
|
+
```
|
1066
|
+
|
1067
|
+
where slow_check is the slow section of your query, as is shown
|
1068
|
+
earlier in this document. Don't forget the .call!
|
1069
|
+
|
1070
|
+
### Reduce thread lines on timeout
|
1071
|
+
|
1072
|
+
Depending on your input data and the speed of your filters it may be useful to
|
1073
|
+
tweak the number of thread lines and/or to increase the timeout.
|
1074
|
+
|
1075
|
+
On really fast file systems for genome-wide sequencing try increasing
|
1076
|
+
--thread-lines to a value larger than 100_000. On the other hand if
|
1077
|
+
the computations are intensive (per line) reduce the number of
|
1078
|
+
thread-lines (try 10_000 and 1_000). If processes get killed that is
|
1079
|
+
the one to try.
|
1080
|
+
|
1081
|
+
For larger files set the timeout to 600 or so, e.g. --timeout 600.
|
1082
|
+
|
1083
|
+
Different values may show different core use on a machine.
|
1084
|
+
|
1085
|
+
### Development
|
1086
|
+
|
1087
|
+
To run the tests from source
|
1088
|
+
|
1089
|
+
```sh
|
1090
|
+
bundle install --path vendor/bundle
|
1091
|
+
bundle exec rake
|
1092
|
+
```
|
1093
|
+
|
1094
|
+
### Debugging
|
1095
|
+
|
1096
|
+
To debug output use '-v --num-threads=1' for generating useful
|
1097
|
+
output. Also do not use the -i switch (ignore errors) when there
|
1098
|
+
are problems.
|
1099
|
+
|
1100
|
+
### Tmpdir contains (old) bio-vcf directories
|
1101
|
+
|
1102
|
+
Multi-threaded bio-vcf writes into a temporary directory during
|
1103
|
+
processing. When a process gets interrupted for some reason the
|
1104
|
+
temporary directory may remain.
|
824
1105
|
|
825
1106
|
## Project home page
|
826
1107
|
|
827
1108
|
For information on the source tree, documentation, examples, issues and
|
828
1109
|
how to contribute, see
|
829
1110
|
|
830
|
-
http://github.com/
|
1111
|
+
http://github.com/vcflib/bio-vcf
|
831
1112
|
|
832
1113
|
## Cite
|
833
1114
|
|
834
1115
|
If you use this software, please cite one of
|
835
|
-
|
1116
|
+
|
836
1117
|
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
837
1118
|
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
838
1119
|
|
@@ -842,5 +1123,4 @@ This Biogem is published at (http://biogems.info/index.html#bio-vcf)
|
|
842
1123
|
|
843
1124
|
## Copyright
|
844
1125
|
|
845
|
-
Copyright (c) 2014 Pjotr Prins. See LICENSE.txt for further details.
|
846
|
-
|
1126
|
+
Copyright (c) 2014-2020 Pjotr Prins. See LICENSE.txt for further details.
|