bio-vcf 0.8.1 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +2 -8
- data/LICENSE.txt +1 -1
- data/README.md +467 -129
- data/RELEASE_NOTES.md +27 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +9 -42
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +156 -108
- data/bio-vcf.gemspec +13 -75
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +12 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +1 -11
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
- data/lib/bio-vcf/vcfheader.rb +137 -5
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +26 -2
- data/lib/regressiontest.rb +11 -0
- data/lib/regressiontest/cli_exec.rb +101 -0
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +16 -16
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +63 -64
- data/Gemfile.lock +0 -81
data/bio-vcf.gemspec
CHANGED
@@ -1,18 +1,13 @@
|
|
1
|
-
#
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
1
|
+
# No longer generated by jeweler
|
4
2
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: bio-vcf 0.8.1 ruby lib
|
6
3
|
|
7
4
|
Gem::Specification.new do |s|
|
8
5
|
s.name = "bio-vcf"
|
9
|
-
s.version =
|
6
|
+
s.version = File.read("VERSION")
|
10
7
|
|
11
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
-
s.require_paths = ["lib"]
|
13
9
|
s.authors = ["Pjotr Prins"]
|
14
|
-
s.
|
15
|
-
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
|
10
|
+
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
|
16
11
|
s.email = "pjotr.public01@thebird.nl"
|
17
12
|
s.executables = ["bio-vcf"]
|
18
13
|
s.extra_rdoc_files = [
|
@@ -22,82 +17,25 @@ Gem::Specification.new do |s|
|
|
22
17
|
s.files = [
|
23
18
|
".travis.yml",
|
24
19
|
"Gemfile",
|
25
|
-
"Gemfile.lock",
|
26
20
|
"LICENSE.txt",
|
27
21
|
"README.md",
|
28
22
|
"Rakefile",
|
29
23
|
"VERSION",
|
30
24
|
"bin/bio-vcf",
|
31
25
|
"bio-vcf.gemspec",
|
32
|
-
"
|
33
|
-
"
|
34
|
-
"features/multisample.feature",
|
35
|
-
"features/sfilter.feature",
|
36
|
-
"features/somaticsniper.feature",
|
37
|
-
"features/step_definitions/bio-vcf_steps.rb",
|
38
|
-
"features/step_definitions/cli-feature.rb",
|
39
|
-
"features/step_definitions/diff_count.rb",
|
40
|
-
"features/step_definitions/multisample.rb",
|
41
|
-
"features/step_definitions/sfilter.rb",
|
42
|
-
"features/step_definitions/somaticsniper.rb",
|
43
|
-
"features/support/env.rb",
|
44
|
-
"lib/bio-vcf.rb",
|
45
|
-
"lib/bio-vcf/bedfilter.rb",
|
46
|
-
"lib/bio-vcf/template.rb",
|
47
|
-
"lib/bio-vcf/utils.rb",
|
48
|
-
"lib/bio-vcf/variant.rb",
|
49
|
-
"lib/bio-vcf/vcf.rb",
|
50
|
-
"lib/bio-vcf/vcfgenotypefield.rb",
|
51
|
-
"lib/bio-vcf/vcfheader.rb",
|
52
|
-
"lib/bio-vcf/vcfline.rb",
|
53
|
-
"lib/bio-vcf/vcfrdf.rb",
|
54
|
-
"lib/bio-vcf/vcfrecord.rb",
|
55
|
-
"lib/bio-vcf/vcfsample.rb",
|
56
|
-
"lib/bio-vcf/vcfstatistics.rb",
|
57
|
-
"template/gatk_vcf2rdf.erb",
|
58
|
-
"template/vcf2json.erb",
|
59
|
-
"template/vcf2rdf.erb",
|
60
|
-
"template/vcf2rdf_header.erb",
|
61
|
-
"test/data/input/dbsnp.vcf",
|
62
|
-
"test/data/input/multisample.vcf",
|
63
|
-
"test/data/input/somaticsniper.vcf",
|
64
|
-
"test/data/regression/eval_r.info.dp.ref",
|
65
|
-
"test/data/regression/ifilter_s.dp.ref",
|
66
|
-
"test/data/regression/r.info.dp.ref",
|
67
|
-
"test/data/regression/rewrite.info.sample.ref",
|
68
|
-
"test/data/regression/s.dp.ref",
|
69
|
-
"test/data/regression/seval_s.dp.ref",
|
70
|
-
"test/data/regression/sfilter_seval_s.dp.ref",
|
71
|
-
"test/data/regression/thread4.ref",
|
72
|
-
"test/data/regression/thread4_4.ref",
|
73
|
-
"test/data/regression/thread4_4_failed_filter-stderr.ref",
|
74
|
-
"test/performance/metrics.md"
|
26
|
+
"ragel/gen_vcfheaderline_parser.rl",
|
27
|
+
"ragel/generate.sh",
|
75
28
|
]
|
76
|
-
s.
|
29
|
+
s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
|
30
|
+
s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
|
31
|
+
Dir['template/**/*']
|
32
|
+
|
33
|
+
s.homepage = "http://github.com/vcflib/bio-vcf"
|
77
34
|
s.licenses = ["MIT"]
|
35
|
+
s.require_paths = ["lib"]
|
78
36
|
s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
|
79
|
-
s.rubygems_version = "2.
|
80
|
-
s.summary = "Fast multi-threaded VCF parser"
|
81
|
-
|
82
|
-
if s.respond_to? :specification_version then
|
83
|
-
s.specification_version = 4
|
37
|
+
# s.rubygems_version = "2.0.3"
|
38
|
+
s.summary = "Fast multi-purpose multi-threaded VCF parser"
|
84
39
|
|
85
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
86
|
-
s.add_development_dependency(%q<rspec>, [">= 0"])
|
87
|
-
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
88
|
-
s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
|
89
|
-
s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
|
90
|
-
else
|
91
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
92
|
-
s.add_dependency(%q<cucumber>, [">= 0"])
|
93
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
94
|
-
s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
|
95
|
-
end
|
96
|
-
else
|
97
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
98
|
-
s.add_dependency(%q<cucumber>, [">= 0"])
|
99
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
100
|
-
s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
|
101
|
-
end
|
102
40
|
end
|
103
41
|
|
data/features/cli.feature
CHANGED
@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
|
|
11
11
|
Scenario: Test the info filter using dp and threads
|
12
12
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
13
13
|
When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
|
14
|
-
Then I expect the named output to match the named output "thread4"
|
14
|
+
Then I expect the named output to match the named output "thread4" in under 30 seconds
|
15
15
|
|
16
16
|
Scenario: Test the info filter using dp and threads with lines
|
17
17
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
18
18
|
When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
|
19
|
-
Then I expect the named output to match the named output "thread4_4"
|
19
|
+
Then I expect the named output to match the named output "thread4_4" in under 30 seconds
|
20
20
|
|
21
21
|
Scenario: Test the sample filter using dp
|
22
22
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
|
|
43
43
|
When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
|
44
44
|
Then I expect the named output to match the named output "sfilter_seval_s.dp"
|
45
45
|
|
46
|
-
|
47
46
|
Scenario: Rewrite an info field
|
48
47
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
49
48
|
When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
|
50
49
|
Then I expect the named output to match the named output "rewrite.info.sample"
|
51
50
|
|
51
|
+
Scenario: Test eval-once
|
52
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
53
|
+
When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
|
54
|
+
Then I expect the named output to match the named output "eval_once"
|
55
|
+
|
56
|
+
Scenario: Test JSON output with header meta data
|
57
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
58
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
|
59
|
+
Then I expect the named output to match the named output "vcf2json_full_header"
|
60
|
+
|
61
|
+
Scenario: Test JSON output with header meta data and query samples
|
62
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
63
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
|
64
|
+
Then I expect the named output to match the named output "vcf2json_use_meta"
|
65
|
+
|
52
66
|
Scenario: Test deadlock on failed filter with threads
|
53
67
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
54
|
-
When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
68
|
+
When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
55
69
|
Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
|
56
70
|
|
71
|
+
Scenario: Test VCF with no records
|
72
|
+
Given I have input file(s) named "test/data/input/empty.vcf"
|
73
|
+
When I execute "./bin/bio-vcf --timeout=5"
|
74
|
+
Then I expect no errors
|
data/features/diff_count.feature
CHANGED
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
|
|
21
21
|
Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
|
22
22
|
When I look for the difference
|
23
23
|
Then I expect the diff to be [0,15,0,11]
|
24
|
-
And the relative diff to be [0,0.23,0,0.85]
|
25
24
|
And I expect the defining tumor nucleotide to be "T"
|
26
25
|
And I expect the tumor count to be 12
|
27
26
|
When I set an inclusion threshold for the reference
|
@@ -0,0 +1,12 @@
|
|
1
|
+
@filter
|
2
|
+
Feature: Adding filters
|
3
|
+
|
4
|
+
bio-vcf can add soft filters. Rather than removing failing items we can
|
5
|
+
inject filter state into the FILTER field. To add state such as PASS or
|
6
|
+
LowDepth simply use a filter and the --add-filter switch. If a filter already
|
7
|
+
has state the new one is appended with a semi-colon.
|
8
|
+
|
9
|
+
Scenario: Add PASS to the FILTER field using a sample depth filter
|
10
|
+
Given I have input file(s) named "test/data/input/somaticsniper.vcf"
|
11
|
+
When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
|
12
|
+
Then I expect the named output to match the named output "pass1"
|
@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
|
|
25
25
|
And I expect rec.info.ac to be 5
|
26
26
|
And I expect rec.info.af to be 0.357
|
27
27
|
And I expect rec.info.dp to be 1537
|
28
|
+
And I expect rec.info['dp'] to be 1537
|
28
29
|
And I expect rec.info.readposranksum to be 0.815
|
30
|
+
And I expect rec.info['ReadPosRankSum'] to be 0.815
|
31
|
+
And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
|
29
32
|
And I expect rec.sample['Original'].ad to be [189,25]
|
30
33
|
And I expect rec.sample['Original'].gt to be "0/1"
|
31
34
|
And I expect rec.sample['s3t2'].ad to be [167,26]
|
@@ -76,3 +79,12 @@ Feature: Multi-sample VCF
|
|
76
79
|
And I expect r.original.gts to be ["C","G"]
|
77
80
|
And I expect r.original.gts[0] to be "C"
|
78
81
|
And I expect r.original.gts[1] to be "G"
|
82
|
+
|
83
|
+
# INFO fields with matching tails
|
84
|
+
Given multisample vcf line
|
85
|
+
"""
|
86
|
+
1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
|
87
|
+
"""
|
88
|
+
When I parse the record
|
89
|
+
Then I expect r.info.end to be 111
|
90
|
+
And I expect r.info.ciend to be 999
|
@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
|
|
46
46
|
And I expect rec.tumor.amq.to_ary to be [37,37]
|
47
47
|
And I expect rec.tumor.mq to be 37
|
48
48
|
And I expect rec.tumor.ss to be 2
|
49
|
+
And I expect rec.tumor.ssc to be 33
|
50
|
+
And I expect rec.normal.ssc to be nil
|
49
51
|
# The following are additional functions
|
50
52
|
And I expect rec.call_diff to be [-4,2,-2,0]
|
51
53
|
And I expect rec.call_nuc to be "C"
|
@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
|
|
7
7
|
@cmd = arg1 + ' < ' + @filenames[0]
|
8
8
|
end
|
9
9
|
|
10
|
-
Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
|
11
|
-
|
12
|
-
end
|
10
|
+
# Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
|
11
|
+
# RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
|
12
|
+
# end
|
13
13
|
|
14
|
-
Then(/^I expect
|
15
|
-
|
16
|
-
end
|
14
|
+
# Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
|
15
|
+
# RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
|
16
|
+
# end
|
17
|
+
|
18
|
+
|
19
|
+
# Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
|
20
|
+
# RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
|
21
|
+
# end
|
22
|
+
|
23
|
+
# Then(/^I expect no errors$/) do
|
24
|
+
# RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
|
25
|
+
# end
|
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
|
|
34
34
|
end
|
35
35
|
|
36
36
|
Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
|
37
|
-
res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
|
37
|
+
res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
|
38
38
|
expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
|
39
39
|
end
|
40
40
|
|
@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
|
|
37
37
|
expect(@rec1.info.readposranksum).to eq 0.815
|
38
38
|
end
|
39
39
|
|
40
|
+
Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
|
41
|
+
expect(@rec1.info['dp']).to eq 1537
|
42
|
+
end
|
43
|
+
|
44
|
+
Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
|
45
|
+
expect(@rec1.info['ReadPosRankSum']).to eq 0.815
|
46
|
+
end
|
47
|
+
|
48
|
+
Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
|
49
|
+
expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
|
50
|
+
end
|
51
|
+
|
40
52
|
Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
|
41
53
|
expect(@rec1.sample['Original'].gt).to eq "0/1"
|
42
54
|
end
|
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
|
|
161
173
|
expect(@rec1.original.gts[arg1.to_i]).to eq arg2
|
162
174
|
end
|
163
175
|
|
176
|
+
Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
|
177
|
+
expect(@rec1.info.end).to eq arg1.to_i
|
178
|
+
end
|
179
|
+
|
180
|
+
Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
|
181
|
+
expect(@rec1.info.ciend).to eq arg1.to_i
|
182
|
+
end
|
@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
|
|
99
99
|
end
|
100
100
|
|
101
101
|
|
102
|
+
Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
|
103
|
+
expect(@rec.tumor.ssc).to be 33
|
104
|
+
end
|
105
|
+
|
106
|
+
Then(/^I expect rec\.normal\.ssc to be nil$/) do
|
107
|
+
expect(@rec.normal.ssc).to be nil
|
108
|
+
end
|
109
|
+
|
102
110
|
Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
|
103
111
|
expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
|
104
112
|
end
|
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
|
|
116
124
|
end
|
117
125
|
|
118
126
|
Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
|
119
|
-
expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
|
127
|
+
expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
|
120
128
|
end
|
121
129
|
|
122
130
|
|
@@ -0,0 +1,48 @@
|
|
1
|
+
Given(/^the VCF header lines$/) do |string|
|
2
|
+
header = VcfHeader.new
|
3
|
+
header.add string
|
4
|
+
@vcf = header
|
5
|
+
end
|
6
|
+
|
7
|
+
When(/^I parse the VCF header$/) do
|
8
|
+
end
|
9
|
+
|
10
|
+
Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
|
11
|
+
expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
|
12
|
+
end
|
13
|
+
|
14
|
+
Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
|
15
|
+
expect(@vcf.fileformat).to eq arg1
|
16
|
+
end
|
17
|
+
|
18
|
+
Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
|
19
|
+
expect(@vcf.fileDate).to eq arg1
|
20
|
+
end
|
21
|
+
|
22
|
+
Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
|
23
|
+
expect(@vcf.field['fileDate']).to eq arg1
|
24
|
+
end
|
25
|
+
|
26
|
+
Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
|
27
|
+
expect(@vcf.phasing).to eq arg1
|
28
|
+
end
|
29
|
+
|
30
|
+
Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
|
31
|
+
expect(@vcf.reference).to eq arg1
|
32
|
+
end
|
33
|
+
|
34
|
+
Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
|
35
|
+
expect(@vcf.format[arg1].to_s).to eq arg2
|
36
|
+
end
|
37
|
+
|
38
|
+
Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
|
39
|
+
expect(@vcf.info[arg1].to_s).to eq arg2
|
40
|
+
end
|
41
|
+
|
42
|
+
Then(/^I expect vcf\.meta to contain all header meta information$/) do
|
43
|
+
m = @vcf.meta
|
44
|
+
expect(m['fileformat']).to eq "VCFv4.1"
|
45
|
+
expect(m['FORMAT']['DP']['Number']).to eq "1"
|
46
|
+
expect(m.size).to be 9
|
47
|
+
end
|
48
|
+
|
data/features/support/env.rb
CHANGED
@@ -1,13 +1,3 @@
|
|
1
|
-
require 'bundler'
|
2
|
-
begin
|
3
|
-
Bundler.setup(:default, :development)
|
4
|
-
rescue Bundler::BundlerError => e
|
5
|
-
$stderr.puts e.message
|
6
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
7
|
-
exit e.status_code
|
8
|
-
end
|
9
|
-
|
10
|
-
# require 'mini/test'
|
11
1
|
|
12
2
|
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
|
13
3
|
require 'bio-vcf'
|
@@ -16,7 +6,7 @@ require 'rspec/expectations'
|
|
16
6
|
|
17
7
|
# Add the regression module if in the path (it can also be a gem)
|
18
8
|
rootdir = File.dirname(__FILE__) + '/../..'
|
19
|
-
$LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
|
9
|
+
$LOAD_PATH.unshift(rootdir+'/lib/regressiontest',rootdir+'/../regressiontest/lib')
|
20
10
|
require 'regressiontest'
|
21
11
|
|
22
12
|
include BioVcf
|
@@ -0,0 +1,35 @@
|
|
1
|
+
@meta
|
2
|
+
Feature: Parsing VCF meta information from the header
|
3
|
+
|
4
|
+
Take a header and parse that information as defined by the VCF standard.
|
5
|
+
|
6
|
+
Scenario: When parsing a header line
|
7
|
+
|
8
|
+
Given the VCF header lines
|
9
|
+
"""
|
10
|
+
##fileformat=VCFv4.1
|
11
|
+
##fileDate=20140121
|
12
|
+
##phasing=none
|
13
|
+
##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
|
14
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
15
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
|
16
|
+
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
|
17
|
+
##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
|
18
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
|
19
|
+
"""
|
20
|
+
When I parse the VCF header
|
21
|
+
Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
|
22
|
+
And I expect vcf.fileformat to be "VCFv4.1"
|
23
|
+
And I expect vcf.fileDate to be "20140121"
|
24
|
+
And I expect vcf.field['fileDate'] to be "20140121"
|
25
|
+
And I expect vcf.phasing to be "none"
|
26
|
+
And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
|
27
|
+
And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
|
28
|
+
And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
|
29
|
+
And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
|
30
|
+
And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}
|
31
|
+
And I expect vcf.meta to contain all header meta information
|
32
|
+
|
33
|
+
Scenario: When parsing the header of somatic_sniper.vcf
|
34
|
+
|
35
|
+
Do something
|
data/lib/bio-vcf.rb
CHANGED
@@ -0,0 +1,303 @@
|
|
1
|
+
# Parallel copy-on-write streaming (PCOWS)
|
2
|
+
|
3
|
+
require 'fileutils'
require 'tempfile'
require 'timeout'
|
4
|
+
|
5
|
+
class PCOWS

  RUNNINGEXT = 'part' # extension of a chunk file that a worker is still writing

  # Set up the scheduler.
  #
  # num_threads:: maximum number of worker processes running at the same
  #               time; when nil/false the detected CPU count is used
  # chunk_size::  stored in @chunk_size (not used further inside this class)
  # name::        used to name the temp directory and the chunk files
  # timeout::     seconds to wait for a slot/worker before Timeout::Error
  # quiet::       when true, progress messages on stderr are suppressed
  # debug::       when true, .keep files and the temp directory are kept
  def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false)
    # Set the flags first: cpu_count consults @quiet before warning
    @quiet = quiet
    @debug = debug
    @num_threads = num_threads || cpu_count
    # $stderr.print "Using ",num_threads,"threads \n"
    @chunk_size = chunk_size
    @pid_list = []   # one [pid,count,fn] per forked worker, in submission order
    @name = name
    @timeout = timeout
    if @debug
      $stderr.print "PCOWS running in DEBUG MODE\n"
    end
    if multi_threaded
      @tmpdir = Dir::mktmpdir(@name+'_')
    end
    @last_output = 0 # index in @pid_list of the next chunk to output
    @output_locked = false # holds the [pid,count,fn] currently being output
  end

  # Feed the worker 'func and state' to PCOWS. Note that func is a
  # lambda closure so it can pick up surrounding scope at invocation
  # in addition to the data captured in 'state'. func.call(state) must
  # return something that responds to each; every item is printed.
  #
  # Returns true.

  def submit_worker(func,state)
    if not multi_threaded
      # ---- Single threaded: call in main process and output immediately.
      #      Nothing is forked, so nothing is recorded in @pid_list
      #      (a nil pid/filename entry would make cleanup() fail).
      func.call(state).each { | line | print line }
      return true
    end
    count = @pid_list.size+1
    fn = mktmpfilename(count)
    pid = fork do
      # ---- This is running a new copy-on-write process. Output goes
      #      to fn.part, which is renamed to fn once it is complete.
      tempfn = fn+'.'+RUNNINGEXT
      STDOUT.reopen(File.open(tempfn, 'w+'))
      func.call(state).each { | line | print line }
      STDOUT.flush
      STDOUT.close
      FileUtils::mv(tempfn,fn)
      exit(0)
    end
    Process.detach(pid)
    @pid_list << [ pid,count,fn ]
    return true
  end

  # Same as submit_worker, but flags that this is the last worker
  def submit_final_worker(func,state)
    @final_worker = true
    submit_worker(func,state)
  end

  # Make sure no more than num_threads are running at the same time -
  # this is achieved by checking the PID table and the running files
  # in the tmpdir. Raises Timeout::Error when no slot frees up within
  # @timeout seconds.

  def wait_for_worker_slot()
    return if single_threaded
    Timeout.timeout(@timeout) do
      printed_timeout_message = false
      while true
        # ---- count running pids
        running = @pid_list.count { |(pid,_count,fn)| pid_or_file_running?(pid,fn) }
        return if running < @num_threads
        if not printed_timeout_message
          $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
          printed_timeout_message = true
        end
        sleep 0.1
      end
    end
  end

  # ---- In this section the output gets collected and passed on to a
  #      printer thread. This function makes sure the printing is
  #      ordered and that no printers are running at the same
  #      time. The printer thread should be doing as little processing
  #      as possible.
  #
  #      In this implementation type==:by_line will call func for
  #      each line. Otherwise it is called once with the filename.
  #
  #      With blocking==false the chunk is printed by a forked process,
  #      otherwise it is printed by the calling process.
  def process_output(func=nil,type=:by_line, blocking=false)
    return if single_threaded
    output = lambda { |fn|
      if type == :by_line
        # File.foreach closes the file when done
        File.foreach(fn) { |buf|
          print buf
        }
      else
        func.call(fn)
      end
    }
    if @output_locked
      # ---- is the other thread still running? We wait until it
      #      is finished to start the next one
      (pid,count,fn) = @output_locked
      $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
      return if File.exist?(fn)  # continue because thread still processing
      # Now we should remove the .keep file
      cleanup_keep_file(fn)
      @last_output += 1          # get next one in line
      @output_locked = false
    end
    # ---- process the next output chunk. After completion it
    #      gets renamed to chunk.keep. This to avoid missing
    #      output (if we unlink the file prematurely)
    if info = @pid_list[@last_output]
      (pid,count,fn) = info
      $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
      if File.exist?(fn)
        # Yes! We have the next output, create outputter
        @output_locked = info
        $stderr.print "Set lock on ",[info],"\n" if not @quiet
        if not blocking
          $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
          pid = fork do
            output.call(fn)
            # after finishing output move it to .keep
            FileUtils::mv(fn,fn+'.keep')
            exit(0)
          end
          Process.detach(pid)
        else
          $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
          output.call(fn)
          FileUtils::mv(fn,fn+'.keep')
        end
      else
        sleep 0.2
      end
    end
  end

  # Wait for a worker to complete. When working the pid is writing
  # a file with extension .part(ial). After completion the file is
  # renamed without .part and a slot is free.
  #
  # On timeout the child is killed and Timeout::Error is re-raised.
  def wait_for_worker(info)
    (pid,count,fn) = info
    if pid_or_file_running?(pid,fn)
      $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
      begin
        Timeout.timeout(@timeout) do
          while not File.exist?(fn)  # wait for the result to appear
            sleep 0.2
            return if not pid_or_file_running?(pid,fn) # worker is gone
          end
        end
        # Partial file should have been renamed:
        raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
        $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
      rescue Timeout::Error => e
        # Kill it to speed up exit
        begin
          Process.kill 9, pid
          Process.wait pid
        rescue Errno::ESRCH, Errno::ECHILD
          # Child already gone, or already reaped by the Process.detach
          # thread; do not let that mask the timeout itself
        end
        $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
        $stderr.print "Bailing out"
        raise e
      end
    end
  end

  # This is the final cleanup after the reader thread is done. All workers
  # need to complete.

  def wait_for_workers()
    return if single_threaded
    @pid_list.each do |info|
      wait_for_worker(info)
    end
  end

  # Output every chunk that has not been printed yet, in order, and
  # remove the temp directory afterwards.
  def process_remaining_output()
    return if single_threaded
    $stderr.print "Processing remaining output...\n" if not @quiet
    while @output_locked
      sleep 0.2
      process_output() # keep trying
    end
    @pid_list.each do |info|
      (pid,count,fn) = info
      while pid_or_file_running?(pid,fn) or File.exist?(fn)
        $stderr.print "Trying: ",[info],"\n" if not @quiet
        process_output(nil,:by_line,true)
        sleep 0.2
      end
    end
    while @output_locked
      sleep 0.1
      process_output(nil,:by_line,true)
    end
    cleanup_tmpdir()
  end

  # Kill all workers that are still running and remove their chunk,
  # .keep and .part files, followed by the temp directory.
  def cleanup()
    @pid_list.each do |info|
      (pid,count,fn) = info
      if pid_running?(pid)
        $stderr.print "Killing child ",[info],"\n"
        begin
          Process.kill 9, pid
          Process.wait pid
        rescue Errno::ENOENT
          # Messages go to stderr: stdout carries the data stream
          $stderr.puts "INFO: #{fn} did not exist: Errno::ENOENT" if not @quiet
        rescue Errno::ESRCH, Errno::ECHILD
          # Already exited, or already reaped by the Process.detach thread
          $stderr.puts "INFO: The process #{pid} did not exist: Errno::ESRCH" if not @quiet
        end
      end
      File.unlink(fn) if File.exist?(fn)
      cleanup_keep_file(fn,wait: false)
      tempfn = fn+'.'+RUNNINGEXT
      File.unlink(tempfn) if File.exist?(tempfn)
    end
    cleanup_tmpdir()
  end

  private

  # Name of chunk file number num inside the temp directory
  def mktmpfilename(num,ext=nil)
    @tmpdir+sprintf("/%0.6d-",num)+@name+(ext ? '.'+ext : '')
  end

  # True when the worker process is alive or its .part file still exists
  def pid_or_file_running?(pid,fn)
    (pid && pid_running?(pid)) or File.exist?(fn+'.'+RUNNINGEXT)
  end

  # Non-blocking check whether child pid is still running
  def pid_running?(pid)
    return false if not pid
    begin
      fpid,status=Process.waitpid2(pid,Process::WNOHANG)
    rescue Errno::ECHILD, Errno::ESRCH
      return false
    end
    # With WNOHANG nil means the child has not changed state yet
    return true if nil == fpid && nil == status
    return ! (status.exited? || status.signaled?)
  end

  def single_threaded
    @num_threads == 1
  end

  def multi_threaded
    @num_threads > 1
  end

  # Best-effort CPU count: /proc/cpuinfo, then the JVM, then sysctl.
  # Returns 1 (after a warning unless quiet) when all of them fail.
  def cpu_count
    if File.exist? '/proc/cpuinfo'
      found = File.read('/proc/cpuinfo').scan(/^processor\s*:/).size
      return found if found > 0
    end
    # Actually, the JVM does not allow fork...
    return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java
    begin
      # Count on MAC
      found = Integer(`sysctl -n hw.ncpu 2>/dev/null`)
      return found if found > 0
    rescue ArgumentError, SystemCallError
      # sysctl is missing or printed nothing usable
    end
    $stderr.print "Could not determine number of CPUs" if not @quiet
    1
  end

  # Remove fn.keep (unless in debug mode). With wait: true (default)
  # this polls until the .keep file shows up; with wait: false it
  # returns immediately when there is no .keep file.
  def cleanup_keep_file(fn, opts = { wait: true })
    if not @debug
      keep = fn+'.keep'
      return if not opts[:wait] and !File.exist?(keep)
      $stderr.print "Trying to remove #{keep}\n" if not @quiet
      while true
        if File.exist?(keep)
          $stderr.print "Removing #{keep}\n" if not @quiet
          File.unlink(keep)
          break # forever loop
        end
        sleep 0.1
      end #forever
    end
  end

  # Remove the temp directory (unless in debug mode)
  def cleanup_tmpdir
    if not @debug
      $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
      Dir.unlink(@tmpdir) if @tmpdir
    end
  end

end
|