bio-vcf 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/Gemfile +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +32 -11
- data/VERSION +1 -1
- data/bin/bio-vcf +18 -9
- data/bio-vcf.gemspec +25 -17
- data/features/cli.feature +11 -1
- data/features/step_definitions/cli-feature.rb +1 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/vcfheader.rb +88 -4
- data/lib/bio-vcf/vcfheader_line.rb +483 -0
- data/lib/bio-vcf/vcfsample.rb +10 -1
- data/ragel/gen_vcfheaderline_parser.rb +483 -0
- data/ragel/gen_vcfheaderline_parser.rl +122 -0
- data/ragel/generate.sh +5 -0
- data/template/vcf2json_full_header.erb +23 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/test/data/regression/vcf2json_full_header.ref +261 -0
- metadata +20 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 515319faec0710075f13a0265a4027130ec5f10a
|
4
|
+
data.tar.gz: aed2ff09861568291363ca21944567ad36987813
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 94ff3bfda4357fc187a89c9a55116ceefe15fc2b8fa28af45e92afcad452c8d2bd65e5eae17dd2c40b046f89288c640ba5d4b40b8efb711781caed766e48f518
|
7
|
+
data.tar.gz: 3d810db35d1ad862aad6f4ec81d695c6d7d74d46336d4e5563e925da267d04521387994d794ff7d8384cf10d8c94701e0e2af9380ddc0b4505e00edbbedb7c3e
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -7,9 +7,9 @@ source "http://rubygems.org"
|
|
7
7
|
# Include everything needed to run rake, tests, features, etc.
|
8
8
|
group :development do
|
9
9
|
# gem "minitest"
|
10
|
-
gem "rspec"
|
11
|
-
gem "cucumber"
|
12
|
-
gem "jeweler", "
|
13
|
-
gem "regressiontest", "
|
10
|
+
gem "rspec", ">= 2.14.0"
|
11
|
+
gem "cucumber", ">= 1.3.11"
|
12
|
+
gem "jeweler", ">= 2.0.1" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
+
gem "regressiontest", ">= 0.0.3"
|
14
14
|
end
|
15
15
|
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,9 @@
|
|
5
5
|
A new generation VCF parser. Bio-vcf is not only fast for genome-wide
|
6
6
|
(WGS) data, it also comes with a really nice filtering, evaluation and
|
7
7
|
rewrite language and it can output any type of textual data, including
|
8
|
-
|
8
|
+
VCF header and contents in RDF and JSON.
|
9
|
+
|
10
|
+
So, why would you use bio-vcf over other parsers? Because
|
9
11
|
|
10
12
|
1. Bio-vcf is fast and scales on multi-core computers
|
11
13
|
2. Bio-vcf has an expressive filtering and evaluation language
|
@@ -16,14 +18,14 @@ RDF and JSON. Why would you use bio-vcf over other parsers?
|
|
16
18
|
7. Bio-vcf allows for genotype processing
|
17
19
|
8. Bio-vcf has support for set analysis
|
18
20
|
9. Bio-vcf has sane error handling
|
19
|
-
10. Bio-vcf can output tabular data, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs using (erb) templates
|
21
|
+
10. Bio-vcf can convert *any* VCF to *any* output, including tabular data, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
|
20
22
|
|
21
23
|
Bio-vcf has better performance than other tools
|
22
24
|
because of lazy parsing, multi-threading, and useful combinations of
|
23
25
|
(fancy) command line filtering. For example on a 2 core machine
|
24
|
-
bio-vcf is typically 50% faster than JVM based SnpSift.
|
25
|
-
bio-vcf
|
26
|
-
|
26
|
+
bio-vcf is typically 50% faster than JVM based SnpSift. Adding
|
27
|
+
cores, bio-vcf just does better. The more complicated the filters,
|
28
|
+
the larger the gain.
|
27
29
|
|
28
30
|
```sh
|
29
31
|
time ./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp>0.3' < ESP6500SI_V2_SSA137.vcf > test1.vcf
|
@@ -52,8 +54,8 @@ a 16 core machine takes
|
|
52
54
|
sys 0m5.039s
|
53
55
|
```
|
54
56
|
|
55
|
-
which shows decent core utilisation (10x).
|
56
|
-
gzip compressed VCF files of 30+ Gb
|
57
|
+
which shows decent core utilisation (10x). Running
|
58
|
+
gzip compressed VCF files of 30+ Gb has similar performance gains.
|
57
59
|
|
58
60
|
Use zcat to
|
59
61
|
pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
|
@@ -64,10 +66,10 @@ pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
|
|
64
66
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
65
67
|
```
|
66
68
|
|
67
|
-
bio-vcf comes with a sensible parser definition language (it is 100%
|
68
|
-
Ruby), as well as primitives for set analysis. Few
|
69
|
+
bio-vcf comes with a sensible parser definition language (interestingly it is 100%
|
70
|
+
Ruby), an embedded Ragel parser for INFO and FORMAT header definitions, as well as primitives for set analysis. Few
|
69
71
|
assumptions are made about the actual contents of the VCF file (field
|
70
|
-
names are resolved on the fly), so bio-vcf should
|
72
|
+
names are resolved on the fly), so bio-vcf should work with
|
71
73
|
all VCF files.
|
72
74
|
|
73
75
|
To fetch all entries where all samples have depth larger than 20 use
|
@@ -679,7 +681,7 @@ Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert t
|
|
679
681
|
|
680
682
|
## Templates
|
681
683
|
|
682
|
-
To have more output options
|
684
|
+
To have more output options bio-vcf can use an [ERB
|
683
685
|
template](http://www.stuartellis.eu/articles/erb/) for every match. This is a
|
684
686
|
very flexible option that can output textual formats such as JSON, YAML, HTML
|
685
687
|
and RDF. Examples are provided in
|
@@ -785,6 +787,12 @@ can be
|
|
785
787
|
]
|
786
788
|
```
|
787
789
|
|
790
|
+
with
|
791
|
+
|
792
|
+
```sh
|
793
|
+
bio-vcf --template template/vcf2json.erb < dbsnp.vcf
|
794
|
+
```
|
795
|
+
|
788
796
|
may generate something like
|
789
797
|
|
790
798
|
```Javascript
|
@@ -816,6 +824,19 @@ from the last BODY element. To make it valid JSON that needs to be
|
|
816
824
|
removed. A future version may add a parameter to the BODY element or a
|
817
825
|
global rewrite function for this purpose. YAML and RDF have no such issue.
|
818
826
|
|
827
|
+
### Using full VCF header (meta) info
|
828
|
+
|
829
|
+
To get and put the full information from the header, simply use
|
830
|
+
vcf.meta.to_json. See ./template/vcf2json_full_header.erb for an
|
831
|
+
example. This meta information can also be used to output info fields
|
832
|
+
and sample values on the fly! For an example, see the template at
|
833
|
+
[./template/vcf2json_use_meta.erb](https://github.com/pjotrp/bioruby-vcf/tree/master/template/vcf2json_use_meta.erb)
|
834
|
+
and the generated output at
|
835
|
+
[./test/data/regression/vcf2json_use_meta.ref](https://github.com/pjotrp/bioruby-vcf/tree/master/test/data/regression/vcf2json_use_meta.ref).
|
836
|
+
|
837
|
+
This way, it is possible to write templates that can convert the content of
|
838
|
+
*any* VCF file without prior knowledge to JSON, RDF, etc.
|
839
|
+
|
819
840
|
## Statistics
|
820
841
|
|
821
842
|
Simple statistics are available for REF>ALT changes:
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.
|
1
|
+
0.8.2
|
data/bin/bio-vcf
CHANGED
@@ -200,7 +200,7 @@ end
|
|
200
200
|
|
201
201
|
include BioVcf
|
202
202
|
|
203
|
-
# Parse the header section of a VCF file
|
203
|
+
# Parse the header section of a VCF file (chomping STDIN)
|
204
204
|
def parse_header line, samples, options
|
205
205
|
header = VcfHeader.new
|
206
206
|
header.add(line)
|
@@ -374,22 +374,31 @@ begin
|
|
374
374
|
end
|
375
375
|
} # end output
|
376
376
|
|
377
|
-
print template.header(binding) if template
|
378
377
|
# ---- Main loop
|
379
378
|
STDIN.each_line do | line |
|
380
379
|
line_number += 1
|
381
380
|
# ---- In this section header information is handled
|
381
|
+
|
382
|
+
# ---- Skip embedded headers down the line...
|
382
383
|
next if header_output_completed and line =~ /^#/
|
383
|
-
|
384
|
+
|
385
|
+
# ---- Parse the header lines (chomps from STDIN)
|
386
|
+
# and returns header info and the current line
|
387
|
+
if line =~ /^#/
|
384
388
|
header,line = parse_header(line,samples,options)
|
385
389
|
end
|
386
|
-
|
387
|
-
|
388
|
-
if not
|
389
|
-
#
|
390
|
-
options[:efilter_samples]
|
390
|
+
# p [line_number,line]
|
391
|
+
# ---- After the header continue processing
|
392
|
+
if not header_output_completed
|
393
|
+
# one-time post-header processing
|
394
|
+
if not options[:efilter_samples] and options[:ifilter_samples]
|
395
|
+
# Create exclude set as a complement of include set
|
396
|
+
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
397
|
+
end
|
398
|
+
print template.header(binding) if template
|
399
|
+
header_output_completed = true
|
391
400
|
end
|
392
|
-
|
401
|
+
|
393
402
|
# ---- In this section the VCF variant lines are parsed
|
394
403
|
lines << line
|
395
404
|
if NUM_THREADS == 1
|
data/bio-vcf.gemspec
CHANGED
@@ -2,16 +2,14 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: bio-vcf 0.8.1 ruby lib
|
6
5
|
|
7
6
|
Gem::Specification.new do |s|
|
8
7
|
s.name = "bio-vcf"
|
9
|
-
s.version = "0.8.
|
8
|
+
s.version = "0.8.2"
|
10
9
|
|
11
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
-
s.require_paths = ["lib"]
|
13
11
|
s.authors = ["Pjotr Prins"]
|
14
|
-
s.date = "2014-
|
12
|
+
s.date = "2014-12-28"
|
15
13
|
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
|
16
14
|
s.email = "pjotr.public01@thebird.nl"
|
17
15
|
s.executables = ["bio-vcf"]
|
@@ -40,7 +38,9 @@ Gem::Specification.new do |s|
|
|
40
38
|
"features/step_definitions/multisample.rb",
|
41
39
|
"features/step_definitions/sfilter.rb",
|
42
40
|
"features/step_definitions/somaticsniper.rb",
|
41
|
+
"features/step_definitions/vcf_header.rb",
|
43
42
|
"features/support/env.rb",
|
43
|
+
"features/vcf_header.feature",
|
44
44
|
"lib/bio-vcf.rb",
|
45
45
|
"lib/bio-vcf/bedfilter.rb",
|
46
46
|
"lib/bio-vcf/template.rb",
|
@@ -49,13 +49,19 @@ Gem::Specification.new do |s|
|
|
49
49
|
"lib/bio-vcf/vcf.rb",
|
50
50
|
"lib/bio-vcf/vcfgenotypefield.rb",
|
51
51
|
"lib/bio-vcf/vcfheader.rb",
|
52
|
+
"lib/bio-vcf/vcfheader_line.rb",
|
52
53
|
"lib/bio-vcf/vcfline.rb",
|
53
54
|
"lib/bio-vcf/vcfrdf.rb",
|
54
55
|
"lib/bio-vcf/vcfrecord.rb",
|
55
56
|
"lib/bio-vcf/vcfsample.rb",
|
56
57
|
"lib/bio-vcf/vcfstatistics.rb",
|
58
|
+
"ragel/gen_vcfheaderline_parser.rb",
|
59
|
+
"ragel/gen_vcfheaderline_parser.rl",
|
60
|
+
"ragel/generate.sh",
|
57
61
|
"template/gatk_vcf2rdf.erb",
|
58
62
|
"template/vcf2json.erb",
|
63
|
+
"template/vcf2json_full_header.erb",
|
64
|
+
"template/vcf2json_use_meta.erb",
|
59
65
|
"template/vcf2rdf.erb",
|
60
66
|
"template/vcf2rdf_header.erb",
|
61
67
|
"test/data/input/dbsnp.vcf",
|
@@ -71,33 +77,35 @@ Gem::Specification.new do |s|
|
|
71
77
|
"test/data/regression/thread4.ref",
|
72
78
|
"test/data/regression/thread4_4.ref",
|
73
79
|
"test/data/regression/thread4_4_failed_filter-stderr.ref",
|
80
|
+
"test/data/regression/vcf2json_full_header.ref",
|
74
81
|
"test/performance/metrics.md"
|
75
82
|
]
|
76
83
|
s.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
77
84
|
s.licenses = ["MIT"]
|
85
|
+
s.require_paths = ["lib"]
|
78
86
|
s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
|
79
|
-
s.rubygems_version = "2.
|
87
|
+
s.rubygems_version = "2.0.3"
|
80
88
|
s.summary = "Fast multi-threaded VCF parser"
|
81
89
|
|
82
90
|
if s.respond_to? :specification_version then
|
83
91
|
s.specification_version = 4
|
84
92
|
|
85
93
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
86
|
-
s.add_development_dependency(%q<rspec>, [">= 0"])
|
87
|
-
s.add_development_dependency(%q<cucumber>, [">=
|
88
|
-
s.add_development_dependency(%q<jeweler>, ["
|
89
|
-
s.add_development_dependency(%q<regressiontest>, ["
|
94
|
+
s.add_development_dependency(%q<rspec>, [">= 2.14.0"])
|
95
|
+
s.add_development_dependency(%q<cucumber>, [">= 1.3.11"])
|
96
|
+
s.add_development_dependency(%q<jeweler>, [">= 2.0.1"])
|
97
|
+
s.add_development_dependency(%q<regressiontest>, [">= 0.0.3"])
|
90
98
|
else
|
91
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
92
|
-
s.add_dependency(%q<cucumber>, [">=
|
93
|
-
s.add_dependency(%q<jeweler>, ["
|
94
|
-
s.add_dependency(%q<regressiontest>, ["
|
99
|
+
s.add_dependency(%q<rspec>, [">= 2.14.0"])
|
100
|
+
s.add_dependency(%q<cucumber>, [">= 1.3.11"])
|
101
|
+
s.add_dependency(%q<jeweler>, [">= 2.0.1"])
|
102
|
+
s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
|
95
103
|
end
|
96
104
|
else
|
97
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
98
|
-
s.add_dependency(%q<cucumber>, [">=
|
99
|
-
s.add_dependency(%q<jeweler>, ["
|
100
|
-
s.add_dependency(%q<regressiontest>, ["
|
105
|
+
s.add_dependency(%q<rspec>, [">= 2.14.0"])
|
106
|
+
s.add_dependency(%q<cucumber>, [">= 1.3.11"])
|
107
|
+
s.add_dependency(%q<jeweler>, [">= 2.0.1"])
|
108
|
+
s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
|
101
109
|
end
|
102
110
|
end
|
103
111
|
|
data/features/cli.feature
CHANGED
@@ -43,14 +43,24 @@ Feature: Command-line interface (CLI)
|
|
43
43
|
When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
|
44
44
|
Then I expect the named output to match the named output "sfilter_seval_s.dp"
|
45
45
|
|
46
|
-
|
47
46
|
Scenario: Rewrite an info field
|
48
47
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
49
48
|
When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
|
50
49
|
Then I expect the named output to match the named output "rewrite.info.sample"
|
51
50
|
|
51
|
+
Scenario: Test JSON output with header meta data
|
52
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
53
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
|
54
|
+
Then I expect the named output to match the named output "vcf2json_full_header"
|
55
|
+
|
56
|
+
Scenario: Test JSON output with header meta data and query samples
|
57
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
58
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
|
59
|
+
Then I expect the named output to match the named output "vcf2json_use_meta"
|
60
|
+
|
52
61
|
Scenario: Test deadlock on failed filter with threads
|
53
62
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
54
63
|
When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
55
64
|
Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
|
56
65
|
|
66
|
+
|
@@ -8,7 +8,7 @@ When /^I execute "(.*?)"$/ do |arg1|
|
|
8
8
|
end
|
9
9
|
|
10
10
|
Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
|
11
|
-
RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf
|
11
|
+
RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_true
|
12
12
|
end
|
13
13
|
|
14
14
|
Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
|
@@ -0,0 +1,48 @@
|
|
1
|
+
Given(/^the VCF header lines$/) do |string|
|
2
|
+
header = VcfHeader.new
|
3
|
+
header.add string
|
4
|
+
@vcf = header
|
5
|
+
end
|
6
|
+
|
7
|
+
When(/^I parse the VCF header$/) do
|
8
|
+
end
|
9
|
+
|
10
|
+
Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
|
11
|
+
expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
|
12
|
+
end
|
13
|
+
|
14
|
+
Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
|
15
|
+
expect(@vcf.fileformat).to eq arg1
|
16
|
+
end
|
17
|
+
|
18
|
+
Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
|
19
|
+
expect(@vcf.fileDate).to eq arg1
|
20
|
+
end
|
21
|
+
|
22
|
+
Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
|
23
|
+
expect(@vcf.field['fileDate']).to eq arg1
|
24
|
+
end
|
25
|
+
|
26
|
+
Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
|
27
|
+
expect(@vcf.phasing).to eq arg1
|
28
|
+
end
|
29
|
+
|
30
|
+
Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
|
31
|
+
expect(@vcf.reference).to eq arg1
|
32
|
+
end
|
33
|
+
|
34
|
+
Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
|
35
|
+
expect(@vcf.format[arg1].to_s).to eq arg2
|
36
|
+
end
|
37
|
+
|
38
|
+
Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
|
39
|
+
expect(@vcf.info[arg1].to_s).to eq arg2
|
40
|
+
end
|
41
|
+
|
42
|
+
Then(/^I expect vcf\.meta to contain all header meta information$/) do
|
43
|
+
m = @vcf.meta
|
44
|
+
expect(m['fileformat']).to eq "VCFv4.1"
|
45
|
+
expect(m['FORMAT']['DP']['Number']).to eq "1"
|
46
|
+
expect(m.size).to be 6
|
47
|
+
end
|
48
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
@meta
|
2
|
+
Feature: Parsing VCF meta information from the header
|
3
|
+
|
4
|
+
Take a header and parse that information as defined by the VCF standard.
|
5
|
+
|
6
|
+
Scenario: When parsing a header line
|
7
|
+
|
8
|
+
Given the VCF header lines
|
9
|
+
"""
|
10
|
+
##fileformat=VCFv4.1
|
11
|
+
##fileDate=20140121
|
12
|
+
##phasing=none
|
13
|
+
##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
|
14
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
15
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
|
16
|
+
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
|
17
|
+
##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
|
18
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
|
19
|
+
"""
|
20
|
+
When I parse the VCF header
|
21
|
+
Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
|
22
|
+
And I expect vcf.fileformat to be "VCFv4.1"
|
23
|
+
And I expect vcf.fileDate to be "20140121"
|
24
|
+
And I expect vcf.field['fileDate'] to be "20140121"
|
25
|
+
And I expect vcf.phasing to be "none"
|
26
|
+
And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
|
27
|
+
And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
|
28
|
+
And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
|
29
|
+
And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
|
30
|
+
And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}
|
31
|
+
And I expect vcf.meta to contain all header meta information
|
32
|
+
|
33
|
+
Scenario: When parsing the header of somatic_sniper.vcf
|
34
|
+
|
35
|
+
Do something
|
data/lib/bio-vcf.rb
CHANGED
data/lib/bio-vcf/vcfheader.rb
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
# This module parses the VCF header. A header consists of lines
|
2
|
+
# containing fields. Most fields are of 'key=value' type and appear
|
3
|
+
# only once. These can be retrieved with the find_field method.
|
4
|
+
#
|
5
|
+
# INFO and FORMAT fields are special as they appear multiple times
|
6
|
+
# and contain multiple key values (identified by an ID field).
|
7
|
+
# To retrieve these call 'info' and 'format' functions respectively,
|
8
|
+
# which return a hash on the contained ID.
|
9
|
+
#
|
10
|
+
# For the INFO and FORMAT fields a Ragel parser is used, mostly to
|
11
|
+
# deal with embedded quoted fields.
|
1
12
|
|
2
13
|
module BioVcf
|
3
14
|
|
@@ -13,21 +24,27 @@ module BioVcf
|
|
13
24
|
end
|
14
25
|
nil
|
15
26
|
end
|
27
|
+
|
28
|
+
def VcfHeaderParser.parse_field(line)
|
29
|
+
BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(line, debug: false)
|
30
|
+
end
|
16
31
|
end
|
17
32
|
|
18
33
|
class VcfHeader
|
19
34
|
|
20
|
-
attr_reader :lines
|
35
|
+
attr_reader :lines, :field
|
21
36
|
|
22
37
|
def initialize
|
23
38
|
@lines = []
|
39
|
+
@field = {}
|
24
40
|
end
|
25
41
|
|
42
|
+
# Add a new field to the header
|
26
43
|
def add line
|
27
|
-
@lines
|
44
|
+
@lines += line.split(/\n/)
|
28
45
|
end
|
29
46
|
|
30
|
-
#
|
47
|
+
# Push a special key value list to the header
|
31
48
|
def tag h
|
32
49
|
h2 = h.dup
|
33
50
|
[:show_help,:skip_header,:verbose,:quiet,:debug].each { |key| h2.delete(key) }
|
@@ -82,6 +99,73 @@ module BioVcf
|
|
82
99
|
@sample_index = index
|
83
100
|
index
|
84
101
|
end
|
85
|
-
end
|
86
102
|
|
103
|
+
# Look for a line in the header with the field name and return the
|
104
|
+
# value, otherwise return nil
|
105
|
+
def find_field name
|
106
|
+
return field[name] if field[name]
|
107
|
+
@lines.each do | line |
|
108
|
+
value = line.scan(/###{name}=(.*)/)
|
109
|
+
if value[0]
|
110
|
+
v = value[0][0]
|
111
|
+
field[name] = v
|
112
|
+
return v
|
113
|
+
end
|
114
|
+
end
|
115
|
+
nil
|
116
|
+
end
|
117
|
+
|
118
|
+
# Look for all the lines that match the field name and return
|
119
|
+
# a hash of hashes. An empty hash is returned when there are
|
120
|
+
# no matches.
|
121
|
+
def find_fields name
|
122
|
+
res = {}
|
123
|
+
@lines.each do | line |
|
124
|
+
value = line.scan(/###{name}=<(.*)>/)
|
125
|
+
if value[0]
|
126
|
+
str = value[0][0]
|
127
|
+
# p str
|
128
|
+
v = VcfHeaderParser.parse_field(line)
|
129
|
+
id = v['ID']
|
130
|
+
res[id] = v
|
131
|
+
end
|
132
|
+
end
|
133
|
+
# p res
|
134
|
+
res
|
135
|
+
end
|
136
|
+
|
137
|
+
def format
|
138
|
+
find_fields('FORMAT')
|
139
|
+
end
|
140
|
+
|
141
|
+
def info
|
142
|
+
find_fields('INFO')
|
143
|
+
end
|
144
|
+
|
145
|
+
def meta
|
146
|
+
res = { 'INFO' => {}, 'FORMAT' => {} }
|
147
|
+
@lines.each do | line |
|
148
|
+
value = line.scan(/##(.*?)=(.*)/)
|
149
|
+
if value[0]
|
150
|
+
k,v = value[0]
|
151
|
+
if k != 'FORMAT' and k != 'INFO'
|
152
|
+
# p [k,v]
|
153
|
+
res[k] = v
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
res['INFO'] = info
|
158
|
+
res['FORMAT'] = format
|
159
|
+
# p [:res, res]
|
160
|
+
res
|
161
|
+
end
|
162
|
+
|
163
|
+
def method_missing(m, *args, &block)
|
164
|
+
name = m.to_s
|
165
|
+
value = find_field(name)
|
166
|
+
return value if value
|
167
|
+
raise "Unknown VCF header query '#{name}'"
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
87
171
|
end
|