bio-vcf 0.8.0 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +4 -5
- data/Gemfile.lock +28 -65
- data/LICENSE.txt +1 -1
- data/README.md +387 -107
- data/RELEASE_NOTES.md +20 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +3 -40
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +176 -109
- data/bio-vcf.gemspec +14 -70
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +25 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +0 -9
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/bedfilter.rb +43 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/template.rb +75 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
- data/lib/bio-vcf/vcfheader.rb +146 -6
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +27 -3
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +19 -7
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/template/vcf2rdf_header.erb +24 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +65 -64
data/bio-vcf.gemspec
CHANGED
@@ -1,16 +1,14 @@
|
|
1
|
-
#
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
1
|
+
# No longer generated by jeweler
|
4
2
|
# -*- encoding: utf-8 -*-
|
5
3
|
|
6
4
|
Gem::Specification.new do |s|
|
7
5
|
s.name = "bio-vcf"
|
8
|
-
s.version =
|
6
|
+
s.version = File.read("VERSION")
|
9
7
|
|
10
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
9
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = "
|
13
|
-
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
|
10
|
+
# s.date = "2015-12-28"
|
11
|
+
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
|
14
12
|
s.email = "pjotr.public01@thebird.nl"
|
15
13
|
s.executables = ["bio-vcf"]
|
16
14
|
s.extra_rdoc_files = [
|
@@ -20,79 +18,25 @@ Gem::Specification.new do |s|
|
|
20
18
|
s.files = [
|
21
19
|
".travis.yml",
|
22
20
|
"Gemfile",
|
23
|
-
"Gemfile.lock",
|
24
21
|
"LICENSE.txt",
|
25
22
|
"README.md",
|
26
23
|
"Rakefile",
|
27
24
|
"VERSION",
|
28
25
|
"bin/bio-vcf",
|
29
26
|
"bio-vcf.gemspec",
|
30
|
-
"
|
31
|
-
"
|
32
|
-
"features/multisample.feature",
|
33
|
-
"features/sfilter.feature",
|
34
|
-
"features/somaticsniper.feature",
|
35
|
-
"features/step_definitions/bio-vcf_steps.rb",
|
36
|
-
"features/step_definitions/cli-feature.rb",
|
37
|
-
"features/step_definitions/diff_count.rb",
|
38
|
-
"features/step_definitions/multisample.rb",
|
39
|
-
"features/step_definitions/sfilter.rb",
|
40
|
-
"features/step_definitions/somaticsniper.rb",
|
41
|
-
"features/support/env.rb",
|
42
|
-
"lib/bio-vcf.rb",
|
43
|
-
"lib/bio-vcf/utils.rb",
|
44
|
-
"lib/bio-vcf/variant.rb",
|
45
|
-
"lib/bio-vcf/vcf.rb",
|
46
|
-
"lib/bio-vcf/vcfgenotypefield.rb",
|
47
|
-
"lib/bio-vcf/vcfheader.rb",
|
48
|
-
"lib/bio-vcf/vcfline.rb",
|
49
|
-
"lib/bio-vcf/vcfrdf.rb",
|
50
|
-
"lib/bio-vcf/vcfrecord.rb",
|
51
|
-
"lib/bio-vcf/vcfsample.rb",
|
52
|
-
"lib/bio-vcf/vcfstatistics.rb",
|
53
|
-
"template/gatk_vcf2rdf.erb",
|
54
|
-
"template/vcf2json.erb",
|
55
|
-
"template/vcf2rdf.erb",
|
56
|
-
"test/data/input/dbsnp.vcf",
|
57
|
-
"test/data/input/multisample.vcf",
|
58
|
-
"test/data/input/somaticsniper.vcf",
|
59
|
-
"test/data/regression/eval_r.info.dp.ref",
|
60
|
-
"test/data/regression/ifilter_s.dp.ref",
|
61
|
-
"test/data/regression/r.info.dp.ref",
|
62
|
-
"test/data/regression/rewrite.info.sample.ref",
|
63
|
-
"test/data/regression/s.dp.ref",
|
64
|
-
"test/data/regression/seval_s.dp.ref",
|
65
|
-
"test/data/regression/sfilter_seval_s.dp.ref",
|
66
|
-
"test/data/regression/thread4.ref",
|
67
|
-
"test/data/regression/thread4_4.ref",
|
68
|
-
"test/data/regression/thread4_4_failed_filter-stderr.ref",
|
69
|
-
"test/performance/metrics.md"
|
27
|
+
"ragel/gen_vcfheaderline_parser.rl",
|
28
|
+
"ragel/generate.sh",
|
70
29
|
]
|
71
|
-
s.
|
30
|
+
s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
|
31
|
+
s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
|
32
|
+
Dir['template/**/*']
|
33
|
+
|
34
|
+
s.homepage = "http://github.com/vcflib/bio-vcf"
|
72
35
|
s.licenses = ["MIT"]
|
73
36
|
s.require_paths = ["lib"]
|
74
|
-
s.
|
75
|
-
s.
|
37
|
+
s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
|
38
|
+
# s.rubygems_version = "2.0.3"
|
39
|
+
s.summary = "Fast multi-purpose multi-threaded VCF parser"
|
76
40
|
|
77
|
-
if s.respond_to? :specification_version then
|
78
|
-
s.specification_version = 4
|
79
|
-
|
80
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
81
|
-
s.add_development_dependency(%q<rspec>, [">= 0"])
|
82
|
-
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
83
|
-
s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
|
84
|
-
s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
|
85
|
-
else
|
86
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
87
|
-
s.add_dependency(%q<cucumber>, [">= 0"])
|
88
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
89
|
-
s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
|
90
|
-
end
|
91
|
-
else
|
92
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
93
|
-
s.add_dependency(%q<cucumber>, [">= 0"])
|
94
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
95
|
-
s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
|
96
|
-
end
|
97
41
|
end
|
98
42
|
|
data/features/cli.feature
CHANGED
@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
|
|
11
11
|
Scenario: Test the info filter using dp and threads
|
12
12
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
13
13
|
When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
|
14
|
-
Then I expect the named output to match the named output "thread4"
|
14
|
+
Then I expect the named output to match the named output "thread4" in under 30 seconds
|
15
15
|
|
16
16
|
Scenario: Test the info filter using dp and threads with lines
|
17
17
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
18
18
|
When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
|
19
|
-
Then I expect the named output to match the named output "thread4_4"
|
19
|
+
Then I expect the named output to match the named output "thread4_4" in under 30 seconds
|
20
20
|
|
21
21
|
Scenario: Test the sample filter using dp
|
22
22
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
|
|
43
43
|
When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
|
44
44
|
Then I expect the named output to match the named output "sfilter_seval_s.dp"
|
45
45
|
|
46
|
-
|
47
46
|
Scenario: Rewrite an info field
|
48
47
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
49
48
|
When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
|
50
49
|
Then I expect the named output to match the named output "rewrite.info.sample"
|
51
50
|
|
51
|
+
Scenario: Test eval-once
|
52
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
53
|
+
When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
|
54
|
+
Then I expect the named output to match the named output "eval_once"
|
55
|
+
|
56
|
+
Scenario: Test JSON output with header meta data
|
57
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
58
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
|
59
|
+
Then I expect the named output to match the named output "vcf2json_full_header"
|
60
|
+
|
61
|
+
Scenario: Test JSON output with header meta data and query samples
|
62
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
63
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
|
64
|
+
Then I expect the named output to match the named output "vcf2json_use_meta"
|
65
|
+
|
52
66
|
Scenario: Test deadlock on failed filter with threads
|
53
67
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
54
|
-
When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
68
|
+
When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
55
69
|
Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
|
56
70
|
|
71
|
+
Scenario: Test VCF with no records
|
72
|
+
Given I have input file(s) named "test/data/input/empty.vcf"
|
73
|
+
When I execute "./bin/bio-vcf --timeout=5"
|
74
|
+
Then I expect no errors
|
data/features/diff_count.feature
CHANGED
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
|
|
21
21
|
Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
|
22
22
|
When I look for the difference
|
23
23
|
Then I expect the diff to be [0,15,0,11]
|
24
|
-
And the relative diff to be [0,0.23,0,0.85]
|
25
24
|
And I expect the defining tumor nucleotide to be "T"
|
26
25
|
And I expect the tumor count to be 12
|
27
26
|
When I set an inclusion threshold for the reference
|
@@ -0,0 +1,12 @@
|
|
1
|
+
@filter
|
2
|
+
Feature: Adding filters
|
3
|
+
|
4
|
+
bio-vcf can add soft filters. Rather than removing failing items we can
|
5
|
+
inject filter state into the FILTER field. To add state such as PASS or
|
6
|
+
LowDepth simply use a filter and the --set-filter switch. If a filter already
|
7
|
+
has state the new one is appended with a semi-colon.
|
8
|
+
|
9
|
+
Scenario: Test the info filter using dp and threads
|
10
|
+
Given I have input file(s) named "test/data/input/somaticsniper.vcf"
|
11
|
+
When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
|
12
|
+
Then I expect the named output to match the named output "pass1"
|
@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
|
|
25
25
|
And I expect rec.info.ac to be 5
|
26
26
|
And I expect rec.info.af to be 0.357
|
27
27
|
And I expect rec.info.dp to be 1537
|
28
|
+
And I expect rec.info['dp'] to be 1537
|
28
29
|
And I expect rec.info.readposranksum to be 0.815
|
30
|
+
And I expect rec.info['ReadPosRankSum'] to be 0.815
|
31
|
+
And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
|
29
32
|
And I expect rec.sample['Original'].ad to be [189,25]
|
30
33
|
And I expect rec.sample['Original'].gt to be "0/1"
|
31
34
|
And I expect rec.sample['s3t2'].ad to be [167,26]
|
@@ -63,3 +66,25 @@ Feature: Multi-sample VCF
|
|
63
66
|
And I expect rec.sample.s3t2? to be true
|
64
67
|
And I expect rec.missing_samples? to be true
|
65
68
|
|
69
|
+
# Phased genotype
|
70
|
+
Given multisample vcf line
|
71
|
+
"""
|
72
|
+
1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
|
73
|
+
"""
|
74
|
+
When I parse the record
|
75
|
+
Then I expect rec.pos to contain 10723
|
76
|
+
Then I expect rec.valid? to be true
|
77
|
+
And I expect r.original? to be true
|
78
|
+
And I expect r.original.gts? to be true
|
79
|
+
And I expect r.original.gts to be ["C","G"]
|
80
|
+
And I expect r.original.gts[0] to be "C"
|
81
|
+
And I expect r.original.gts[1] to be "G"
|
82
|
+
|
83
|
+
# INFO fields with matching tails
|
84
|
+
Given multisample vcf line
|
85
|
+
"""
|
86
|
+
1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
|
87
|
+
"""
|
88
|
+
When I parse the record
|
89
|
+
Then I expect r.info.end to be 111
|
90
|
+
And I expect r.info.ciend to be 999
|
@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
|
|
46
46
|
And I expect rec.tumor.amq.to_ary to be [37,37]
|
47
47
|
And I expect rec.tumor.mq to be 37
|
48
48
|
And I expect rec.tumor.ss to be 2
|
49
|
+
And I expect rec.tumor.ssc to be 33
|
50
|
+
And I expect rec.normal.ssc to be nil
|
49
51
|
# The following are additional functions
|
50
52
|
And I expect rec.call_diff to be [-4,2,-2,0]
|
51
53
|
And I expect rec.call_nuc to be "C"
|
@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
|
|
7
7
|
@cmd = arg1 + ' < ' + @filenames[0]
|
8
8
|
end
|
9
9
|
|
10
|
-
Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
|
11
|
-
|
12
|
-
end
|
10
|
+
# Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
|
11
|
+
# RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
|
12
|
+
# end
|
13
13
|
|
14
|
-
Then(/^I expect
|
15
|
-
|
16
|
-
end
|
14
|
+
# Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
|
15
|
+
# RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
|
16
|
+
# end
|
17
|
+
|
18
|
+
|
19
|
+
# Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
|
20
|
+
# RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
|
21
|
+
# end
|
22
|
+
|
23
|
+
# Then(/^I expect no errors$/) do
|
24
|
+
# RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
|
25
|
+
# end
|
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
|
|
34
34
|
end
|
35
35
|
|
36
36
|
Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
|
37
|
-
res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
|
37
|
+
res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
|
38
38
|
expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
|
39
39
|
end
|
40
40
|
|
@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
|
|
37
37
|
expect(@rec1.info.readposranksum).to eq 0.815
|
38
38
|
end
|
39
39
|
|
40
|
+
Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
|
41
|
+
expect(@rec1.info['dp']).to eq 1537
|
42
|
+
end
|
43
|
+
|
44
|
+
Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
|
45
|
+
expect(@rec1.info['ReadPosRankSum']).to eq 0.815
|
46
|
+
end
|
47
|
+
|
48
|
+
Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
|
49
|
+
expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
|
50
|
+
end
|
51
|
+
|
40
52
|
Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
|
41
53
|
expect(@rec1.sample['Original'].gt).to eq "0/1"
|
42
54
|
end
|
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
|
|
161
173
|
expect(@rec1.original.gts[arg1.to_i]).to eq arg2
|
162
174
|
end
|
163
175
|
|
176
|
+
Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
|
177
|
+
expect(@rec1.info.end).to eq arg1.to_i
|
178
|
+
end
|
179
|
+
|
180
|
+
Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
|
181
|
+
expect(@rec1.info.ciend).to eq arg1.to_i
|
182
|
+
end
|
@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
|
|
99
99
|
end
|
100
100
|
|
101
101
|
|
102
|
+
Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
|
103
|
+
expect(@rec.tumor.ssc).to be 33
|
104
|
+
end
|
105
|
+
|
106
|
+
Then(/^I expect rec\.normal\.ssc to be nil$/) do
|
107
|
+
expect(@rec.normal.ssc).to be nil
|
108
|
+
end
|
109
|
+
|
102
110
|
Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
|
103
111
|
expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
|
104
112
|
end
|
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
|
|
116
124
|
end
|
117
125
|
|
118
126
|
Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
|
119
|
-
expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
|
127
|
+
expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
|
120
128
|
end
|
121
129
|
|
122
130
|
|
@@ -0,0 +1,48 @@
|
|
1
|
+
Given(/^the VCF header lines$/) do |string|
|
2
|
+
header = VcfHeader.new
|
3
|
+
header.add string
|
4
|
+
@vcf = header
|
5
|
+
end
|
6
|
+
|
7
|
+
When(/^I parse the VCF header$/) do
|
8
|
+
end
|
9
|
+
|
10
|
+
Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
|
11
|
+
expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
|
12
|
+
end
|
13
|
+
|
14
|
+
Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
|
15
|
+
expect(@vcf.fileformat).to eq arg1
|
16
|
+
end
|
17
|
+
|
18
|
+
Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
|
19
|
+
expect(@vcf.fileDate).to eq arg1
|
20
|
+
end
|
21
|
+
|
22
|
+
Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
|
23
|
+
expect(@vcf.field['fileDate']).to eq arg1
|
24
|
+
end
|
25
|
+
|
26
|
+
Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
|
27
|
+
expect(@vcf.phasing).to eq arg1
|
28
|
+
end
|
29
|
+
|
30
|
+
Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
|
31
|
+
expect(@vcf.reference).to eq arg1
|
32
|
+
end
|
33
|
+
|
34
|
+
Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
|
35
|
+
expect(@vcf.format[arg1].to_s).to eq arg2
|
36
|
+
end
|
37
|
+
|
38
|
+
Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
|
39
|
+
expect(@vcf.info[arg1].to_s).to eq arg2
|
40
|
+
end
|
41
|
+
|
42
|
+
Then(/^I expect vcf\.meta to contain all header meta information$/) do
|
43
|
+
m = @vcf.meta
|
44
|
+
expect(m['fileformat']).to eq "VCFv4.1"
|
45
|
+
expect(m['FORMAT']['DP']['Number']).to eq "1"
|
46
|
+
expect(m.size).to be 9
|
47
|
+
end
|
48
|
+
|
data/features/support/env.rb
CHANGED
@@ -1,12 +1,3 @@
|
|
1
|
-
require 'bundler'
|
2
|
-
begin
|
3
|
-
Bundler.setup(:default, :development)
|
4
|
-
rescue Bundler::BundlerError => e
|
5
|
-
$stderr.puts e.message
|
6
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
7
|
-
exit e.status_code
|
8
|
-
end
|
9
|
-
|
10
1
|
# require 'mini/test'
|
11
2
|
|
12
3
|
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
|
@@ -0,0 +1,35 @@
|
|
1
|
+
@meta
|
2
|
+
Feature: Parsing VCF meta information from the header
|
3
|
+
|
4
|
+
Take a header and parse that information as defined by the VCF standard.
|
5
|
+
|
6
|
+
Scenario: When parsing a header line
|
7
|
+
|
8
|
+
Given the VCF header lines
|
9
|
+
"""
|
10
|
+
##fileformat=VCFv4.1
|
11
|
+
##fileDate=20140121
|
12
|
+
##phasing=none
|
13
|
+
##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
|
14
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
15
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
|
16
|
+
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
|
17
|
+
##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
|
18
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
|
19
|
+
"""
|
20
|
+
When I parse the VCF header
|
21
|
+
Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
|
22
|
+
And I expect vcf.fileformat to be "VCFv4.1"
|
23
|
+
And I expect vcf.fileDate to be "20140121"
|
24
|
+
And I expect vcf.field['fileDate'] to be "20140121"
|
25
|
+
And I expect vcf.phasing to be "none"
|
26
|
+
And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
|
27
|
+
And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
|
28
|
+
And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
|
29
|
+
And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
|
30
|
+
And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}'
|
31
|
+
And I expect vcf.meta to contain all header meta information
|
32
|
+
|
33
|
+
Scenario: When parsing the header of somatic_sniper.vcf
|
34
|
+
|
35
|
+
Do something
|
data/lib/bio-vcf.rb
CHANGED
@@ -11,9 +11,11 @@
|
|
11
11
|
require 'bio-vcf/utils'
|
12
12
|
require 'bio-vcf/vcf'
|
13
13
|
require 'bio-vcf/vcfsample'
|
14
|
+
require 'bio-vcf/vcfheader_line'
|
14
15
|
require 'bio-vcf/vcfheader'
|
15
16
|
require 'bio-vcf/vcfline'
|
16
17
|
require 'bio-vcf/vcfgenotypefield'
|
17
18
|
require 'bio-vcf/vcfrecord'
|
18
19
|
require 'bio-vcf/variant'
|
19
20
|
require 'bio-vcf/vcfstatistics'
|
21
|
+
require 'bio-vcf/bedfilter'
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module BioVcf

  # Tests whether a VCF record falls inside one of the regions listed in a
  # tab-separated BED file (columns: chrom, start, stop and optional name).
  #
  # Per chromosome the region stop positions are kept in a sorted array so a
  # record can be located with a binary search; the full region is then
  # fetched from a "chrom:stop" keyed table.
  #
  # Known limitations, kept as-is for backward compatibility:
  # * Regions on one chromosome that share a stop position overwrite each
  #   other (the last line wins).
  # * Only the first region whose stop is >= pos is checked, so a hit can be
  #   missed when regions are nested or overlap.
  # * NOTE(review): BED starts are normally 0-based while VCF positions are
  #   1-based; the `pos >= start` test below treats both the same way, which
  #   may accept one base before the region - confirm the intended semantics.
  class BedFilter
    # bedfilen:: path of the BED file to load
    #
    # Blank lines, '#' comments and lines with fewer than three tab-separated
    # columns (for example "track"/"browser" headers) are skipped.
    def initialize(bedfilen)
      # Array#bsearch is built in since Ruby 2.0; only fall back to the
      # external gem on interpreters that lack it.
      require 'binary_search/native' unless Array.method_defined?(:bsearch)

      stops = {}
      @info = {}
      # File.foreach closes the file when done (File.open without a block
      # leaked the handle).
      File.foreach(bedfilen) do |line|
        line = line.strip
        next if line.empty? || line.start_with?('#')

        chr, start, stop, gene = line.split(/\t/)[0..3]
        next if stop.nil? # header or malformed line

        stop_pos = stop.to_i
        (stops[chr] ||= []) << stop_pos
        # Key on the parsed integer so the lookup in #contains, which only
        # has the integer, always matches (a raw "0400" would not).
        @info["#{chr}:#{stop_pos}"] = [chr, start.to_i, stop_pos, gene]
      end

      # bsearch requires every per-chromosome list to be sorted
      @chrs = {}
      stops.each { |chr, list| @chrs[chr] = list.sort }
    end

    # rec:: any object responding to +chrom+ and +pos+
    #
    # Returns [chrom, start, stop, name] of the matching BED region, or nil
    # when the chromosome is unknown or the position is outside the region
    # found. Raises RuntimeError if the internal tables are inconsistent.
    def contains(rec)
      stop_list = @chrs[rec.chrom]
      return nil unless stop_list

      pos = rec.pos
      # first region end at or beyond the record position
      stop = stop_list.bsearch { |bedstop| bedstop >= pos }
      return nil unless stop

      rinfo = @info["#{rec.chrom}:#{stop}"]
      raise "Unexpected error in BED record for #{rec.chrom}:#{stop} position" if rinfo.nil?

      pos >= rinfo[1] ? rinfo : nil
    end
  end

end
|