bio-vcf 0.8.0 → 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +4 -5
  4. data/Gemfile.lock +28 -65
  5. data/LICENSE.txt +1 -1
  6. data/README.md +387 -107
  7. data/RELEASE_NOTES.md +20 -0
  8. data/RELEASE_NOTES.md~ +11 -0
  9. data/Rakefile +3 -40
  10. data/TAGS +115 -0
  11. data/VERSION +1 -1
  12. data/bin/bio-vcf +176 -109
  13. data/bio-vcf.gemspec +14 -70
  14. data/features/cli.feature +22 -4
  15. data/features/diff_count.feature +0 -1
  16. data/features/filter.feature +12 -0
  17. data/features/multisample.feature +25 -0
  18. data/features/somaticsniper.feature +2 -0
  19. data/features/step_definitions/cli-feature.rb +15 -6
  20. data/features/step_definitions/diff_count.rb +1 -1
  21. data/features/step_definitions/multisample.rb +19 -0
  22. data/features/step_definitions/somaticsniper.rb +9 -1
  23. data/features/step_definitions/vcf_header.rb +48 -0
  24. data/features/support/env.rb +0 -9
  25. data/features/vcf_header.feature +35 -0
  26. data/lib/bio-vcf.rb +2 -0
  27. data/lib/bio-vcf/bedfilter.rb +43 -0
  28. data/lib/bio-vcf/pcows.rb +303 -0
  29. data/lib/bio-vcf/template.rb +75 -0
  30. data/lib/bio-vcf/vcffile.rb +46 -0
  31. data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
  32. data/lib/bio-vcf/vcfheader.rb +146 -6
  33. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  34. data/lib/bio-vcf/vcfrecord.rb +56 -18
  35. data/lib/bio-vcf/vcfsample.rb +27 -3
  36. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  37. data/ragel/generate.sh +8 -0
  38. data/template/vcf2json.erb +19 -7
  39. data/template/vcf2json_full_header.erb +22 -0
  40. data/template/vcf2json_use_meta.erb +41 -0
  41. data/template/vcf2rdf_header.erb +24 -0
  42. data/test/data/input/empty.vcf +2 -0
  43. data/test/data/input/gatk_exome.vcf +237 -0
  44. data/test/data/input/gatk_wgs.vcf +1000 -0
  45. data/test/data/input/test.bed +632 -0
  46. data/test/data/regression/empty-stderr.new +12 -0
  47. data/test/data/regression/empty.new +2 -0
  48. data/test/data/regression/empty.ref +2 -0
  49. data/test/data/regression/eval_once-stderr.new +2 -0
  50. data/test/data/regression/eval_once.new +1 -0
  51. data/test/data/regression/eval_once.ref +1 -0
  52. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  53. data/test/data/regression/eval_r.info.dp.new +150 -0
  54. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  55. data/test/data/regression/ifilter_s.dp.new +31 -0
  56. data/test/data/regression/pass1-stderr.new +10 -0
  57. data/test/data/regression/pass1.new +88 -0
  58. data/test/data/regression/pass1.ref +88 -0
  59. data/test/data/regression/r.info.dp-stderr.new +4 -0
  60. data/test/data/regression/r.info.dp.new +114 -0
  61. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  62. data/test/data/regression/rewrite.info.sample.new +150 -0
  63. data/test/data/regression/s.dp-stderr.new +18 -0
  64. data/test/data/regression/s.dp.new +145 -0
  65. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  66. data/test/data/regression/seval_s.dp.new +36 -0
  67. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  68. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  69. data/test/data/regression/thread4-stderr.new +10 -0
  70. data/test/data/regression/thread4.new +150 -0
  71. data/test/data/regression/thread4_4-stderr.new +25 -0
  72. data/test/data/regression/thread4_4.new +130 -0
  73. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  74. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
  75. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  76. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  77. data/test/data/regression/vcf2json_full_header.new +225 -0
  78. data/test/data/regression/vcf2json_full_header.ref +225 -0
  79. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  80. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  81. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  82. data/test/performance/metrics.md +18 -1
  83. data/test/stress/stress_test.sh +15 -0
  84. data/test/tmp/test.vcf +12469 -0
  85. metadata +65 -64
@@ -1,16 +1,14 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
1
+ # No longer generated by jeweler
4
2
  # -*- encoding: utf-8 -*-
5
3
 
6
4
  Gem::Specification.new do |s|
7
5
  s.name = "bio-vcf"
8
- s.version = "0.8.0"
6
+ s.version = File.read("VERSION")
9
7
 
10
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
9
  s.authors = ["Pjotr Prins"]
12
- s.date = "2014-09-19"
13
- s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
10
+ # s.date = "2015-12-28"
11
+ s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
14
12
  s.email = "pjotr.public01@thebird.nl"
15
13
  s.executables = ["bio-vcf"]
16
14
  s.extra_rdoc_files = [
@@ -20,79 +18,25 @@ Gem::Specification.new do |s|
20
18
  s.files = [
21
19
  ".travis.yml",
22
20
  "Gemfile",
23
- "Gemfile.lock",
24
21
  "LICENSE.txt",
25
22
  "README.md",
26
23
  "Rakefile",
27
24
  "VERSION",
28
25
  "bin/bio-vcf",
29
26
  "bio-vcf.gemspec",
30
- "features/cli.feature",
31
- "features/diff_count.feature",
32
- "features/multisample.feature",
33
- "features/sfilter.feature",
34
- "features/somaticsniper.feature",
35
- "features/step_definitions/bio-vcf_steps.rb",
36
- "features/step_definitions/cli-feature.rb",
37
- "features/step_definitions/diff_count.rb",
38
- "features/step_definitions/multisample.rb",
39
- "features/step_definitions/sfilter.rb",
40
- "features/step_definitions/somaticsniper.rb",
41
- "features/support/env.rb",
42
- "lib/bio-vcf.rb",
43
- "lib/bio-vcf/utils.rb",
44
- "lib/bio-vcf/variant.rb",
45
- "lib/bio-vcf/vcf.rb",
46
- "lib/bio-vcf/vcfgenotypefield.rb",
47
- "lib/bio-vcf/vcfheader.rb",
48
- "lib/bio-vcf/vcfline.rb",
49
- "lib/bio-vcf/vcfrdf.rb",
50
- "lib/bio-vcf/vcfrecord.rb",
51
- "lib/bio-vcf/vcfsample.rb",
52
- "lib/bio-vcf/vcfstatistics.rb",
53
- "template/gatk_vcf2rdf.erb",
54
- "template/vcf2json.erb",
55
- "template/vcf2rdf.erb",
56
- "test/data/input/dbsnp.vcf",
57
- "test/data/input/multisample.vcf",
58
- "test/data/input/somaticsniper.vcf",
59
- "test/data/regression/eval_r.info.dp.ref",
60
- "test/data/regression/ifilter_s.dp.ref",
61
- "test/data/regression/r.info.dp.ref",
62
- "test/data/regression/rewrite.info.sample.ref",
63
- "test/data/regression/s.dp.ref",
64
- "test/data/regression/seval_s.dp.ref",
65
- "test/data/regression/sfilter_seval_s.dp.ref",
66
- "test/data/regression/thread4.ref",
67
- "test/data/regression/thread4_4.ref",
68
- "test/data/regression/thread4_4_failed_filter-stderr.ref",
69
- "test/performance/metrics.md"
27
+ "ragel/gen_vcfheaderline_parser.rl",
28
+ "ragel/generate.sh",
70
29
  ]
71
- s.homepage = "http://github.com/pjotrp/bioruby-vcf"
30
+ s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
31
+ s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
32
+ Dir['template/**/*']
33
+
34
+ s.homepage = "http://github.com/vcflib/bio-vcf"
72
35
  s.licenses = ["MIT"]
73
36
  s.require_paths = ["lib"]
74
- s.rubygems_version = "2.0.3"
75
- s.summary = "Fast multi-threaded VCF parser"
37
+ s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
38
+ # s.rubygems_version = "2.0.3"
39
+ s.summary = "Fast multi-purpose multi-threaded VCF parser"
76
40
 
77
- if s.respond_to? :specification_version then
78
- s.specification_version = 4
79
-
80
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
81
- s.add_development_dependency(%q<rspec>, [">= 0"])
82
- s.add_development_dependency(%q<cucumber>, [">= 0"])
83
- s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
84
- s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
85
- else
86
- s.add_dependency(%q<rspec>, [">= 0"])
87
- s.add_dependency(%q<cucumber>, [">= 0"])
88
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
89
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
90
- end
91
- else
92
- s.add_dependency(%q<rspec>, [">= 0"])
93
- s.add_dependency(%q<cucumber>, [">= 0"])
94
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
95
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
96
- end
97
41
  end
98
42
 
@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
11
11
  Scenario: Test the info filter using dp and threads
12
12
  Given I have input file(s) named "test/data/input/multisample.vcf"
13
13
  When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
14
- Then I expect the named output to match the named output "thread4"
14
+ Then I expect the named output to match the named output "thread4" in under 30 seconds
15
15
 
16
16
  Scenario: Test the info filter using dp and threads with lines
17
17
  Given I have input file(s) named "test/data/input/multisample.vcf"
18
18
  When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
19
- Then I expect the named output to match the named output "thread4_4"
19
+ Then I expect the named output to match the named output "thread4_4" in under 30 seconds
20
20
 
21
21
  Scenario: Test the sample filter using dp
22
22
  Given I have input file(s) named "test/data/input/multisample.vcf"
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
43
43
  When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
44
44
  Then I expect the named output to match the named output "sfilter_seval_s.dp"
45
45
 
46
-
47
46
  Scenario: Rewrite an info field
48
47
  Given I have input file(s) named "test/data/input/multisample.vcf"
49
48
  When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
50
49
  Then I expect the named output to match the named output "rewrite.info.sample"
51
50
 
51
+ Scenario: Test eval-once
52
+ Given I have input file(s) named "test/data/input/multisample.vcf"
53
+ When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
54
+ Then I expect the named output to match the named output "eval_once"
55
+
56
+ Scenario: Test JSON output with header meta data
57
+ Given I have input file(s) named "test/data/input/multisample.vcf"
58
+ When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
59
+ Then I expect the named output to match the named output "vcf2json_full_header"
60
+
61
+ Scenario: Test JSON output with header meta data and query samples
62
+ Given I have input file(s) named "test/data/input/multisample.vcf"
63
+ When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
64
+ Then I expect the named output to match the named output "vcf2json_use_meta"
65
+
52
66
  Scenario: Test deadlock on failed filter with threads
53
67
  Given I have input file(s) named "test/data/input/multisample.vcf"
54
- When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
68
+ When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
55
69
  Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
56
70
 
71
+ Scenario: Test VCF with no records
72
+ Given I have input file(s) named "test/data/input/empty.vcf"
73
+ When I execute "./bin/bio-vcf --timeout=5"
74
+ Then I expect no errors
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
21
21
  Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
22
22
  When I look for the difference
23
23
  Then I expect the diff to be [0,15,0,11]
24
- And the relative diff to be [0,0.23,0,0.85]
25
24
  And I expect the defining tumor nucleotide to be "T"
26
25
  And I expect the tumor count to be 12
27
26
  When I set an inclusion threshold for the reference
@@ -0,0 +1,12 @@
1
+ @filter
2
+ Feature: Adding filters
3
+
4
+ bio-vcf can add soft filters. Rather than removing failing items we can
5
+ inject filter state into the FILTER field. To add state such as PASS or
6
+ LowDepth simply use a filter and the --add-filter switch. If a filter already
7
+ has state, the new one is appended with a semicolon.
8
+
9
+ Scenario: Add a PASS filter to records that pass a depth filter
10
+ Given I have input file(s) named "test/data/input/somaticsniper.vcf"
11
+ When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
12
+ Then I expect the named output to match the named output "pass1"
@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
25
25
  And I expect rec.info.ac to be 5
26
26
  And I expect rec.info.af to be 0.357
27
27
  And I expect rec.info.dp to be 1537
28
+ And I expect rec.info['dp'] to be 1537
28
29
  And I expect rec.info.readposranksum to be 0.815
30
+ And I expect rec.info['ReadPosRankSum'] to be 0.815
31
+ And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
29
32
  And I expect rec.sample['Original'].ad to be [189,25]
30
33
  And I expect rec.sample['Original'].gt to be "0/1"
31
34
  And I expect rec.sample['s3t2'].ad to be [167,26]
@@ -63,3 +66,25 @@ Feature: Multi-sample VCF
63
66
  And I expect rec.sample.s3t2? to be true
64
67
  And I expect rec.missing_samples? to be true
65
68
 
69
+ # Phased genotype
70
+ Given multisample vcf line
71
+ """
72
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
73
+ """
74
+ When I parse the record
75
+ Then I expect rec.pos to contain 10723
76
+ Then I expect rec.valid? to be true
77
+ And I expect r.original? to be true
78
+ And I expect r.original.gts? to be true
79
+ And I expect r.original.gts to be ["C","G"]
80
+ And I expect r.original.gts[0] to be "C"
81
+ And I expect r.original.gts[1] to be "G"
82
+
83
+ # INFO fields with matching tails
84
+ Given multisample vcf line
85
+ """
86
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
87
+ """
88
+ When I parse the record
89
+ Then I expect r.info.end to be 111
90
+ And I expect r.info.ciend to be 999
@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
46
46
  And I expect rec.tumor.amq.to_ary to be [37,37]
47
47
  And I expect rec.tumor.mq to be 37
48
48
  And I expect rec.tumor.ss to be 2
49
+ And I expect rec.tumor.ssc to be 33
50
+ And I expect rec.normal.ssc to be nil
49
51
  # The following are additional functions
50
52
  And I expect rec.call_diff to be [-4,2,-2,0]
51
53
  And I expect rec.call_nuc to be "C"
@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
7
7
  @cmd = arg1 + ' < ' + @filenames[0]
8
8
  end
9
9
 
10
- Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
12
- end
10
+ # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
+ # end
13
13
 
14
- Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
15
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
16
- end
14
+ # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
+ # end
17
+
18
+
19
+ # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
+ # end
22
+
23
+ # Then(/^I expect no errors$/) do
24
+ # RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
25
+ # end
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
34
34
  end
35
35
 
36
36
  Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
37
- res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
37
+ res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
38
38
  expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
39
39
  end
40
40
 
@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
37
37
  expect(@rec1.info.readposranksum).to eq 0.815
38
38
  end
39
39
 
40
+ Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
41
+ expect(@rec1.info['dp']).to eq 1537
42
+ end
43
+
44
+ Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
45
+ expect(@rec1.info['ReadPosRankSum']).to eq 0.815
46
+ end
47
+
48
+ Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
49
+ expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
50
+ end
51
+
40
52
  Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
41
53
  expect(@rec1.sample['Original'].gt).to eq "0/1"
42
54
  end
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
161
173
  expect(@rec1.original.gts[arg1.to_i]).to eq arg2
162
174
  end
163
175
 
176
+ Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
177
+ expect(@rec1.info.end).to eq arg1.to_i
178
+ end
179
+
180
+ Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
181
+ expect(@rec1.info.ciend).to eq arg1.to_i
182
+ end
@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
99
99
  end
100
100
 
101
101
 
102
+ Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
103
+ expect(@rec.tumor.ssc).to be 33
104
+ end
105
+
106
+ Then(/^I expect rec\.normal\.ssc to be nil$/) do
107
+ expect(@rec.normal.ssc).to be nil
108
+ end
109
+
102
110
  Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
103
111
  expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
104
112
  end
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
116
124
  end
117
125
 
118
126
  Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
119
- expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
127
+ expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
120
128
  end
121
129
 
122
130
 
@@ -0,0 +1,48 @@
1
+ Given(/^the VCF header lines$/) do |string|
2
+ header = VcfHeader.new
3
+ header.add string
4
+ @vcf = header
5
+ end
6
+
7
+ When(/^I parse the VCF header$/) do
8
+ end
9
+
10
+ Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
11
+ expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
12
+ end
13
+
14
+ Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
15
+ expect(@vcf.fileformat).to eq arg1
16
+ end
17
+
18
+ Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
19
+ expect(@vcf.fileDate).to eq arg1
20
+ end
21
+
22
+ Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
23
+ expect(@vcf.field['fileDate']).to eq arg1
24
+ end
25
+
26
+ Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
27
+ expect(@vcf.phasing).to eq arg1
28
+ end
29
+
30
+ Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
31
+ expect(@vcf.reference).to eq arg1
32
+ end
33
+
34
+ Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
35
+ expect(@vcf.format[arg1].to_s).to eq arg2
36
+ end
37
+
38
+ Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
39
+ expect(@vcf.info[arg1].to_s).to eq arg2
40
+ end
41
+
42
+ Then(/^I expect vcf\.meta to contain all header meta information$/) do
43
+ m = @vcf.meta
44
+ expect(m['fileformat']).to eq "VCFv4.1"
45
+ expect(m['FORMAT']['DP']['Number']).to eq "1"
46
+ expect(m.size).to be 9
47
+ end
48
+
@@ -1,12 +1,3 @@
1
- require 'bundler'
2
- begin
3
- Bundler.setup(:default, :development)
4
- rescue Bundler::BundlerError => e
5
- $stderr.puts e.message
6
- $stderr.puts "Run `bundle install` to install missing gems"
7
- exit e.status_code
8
- end
9
-
10
1
  # require 'mini/test'
11
2
 
12
3
  $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
@@ -0,0 +1,35 @@
1
+ @meta
2
+ Feature: Parsing VCF meta information from the header
3
+
4
+ Take a header and parse that information as defined by the VCF standard.
5
+
6
+ Scenario: When parsing a header line
7
+
8
+ Given the VCF header lines
9
+ """
10
+ ##fileformat=VCFv4.1
11
+ ##fileDate=20140121
12
+ ##phasing=none
13
+ ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
14
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
15
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
16
+ ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
17
+ ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
18
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
19
+ """
20
+ When I parse the VCF header
21
+ Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
22
+ And I expect vcf.fileformat to be "VCFv4.1"
23
+ And I expect vcf.fileDate to be "20140121"
24
+ And I expect vcf.field['fileDate'] to be "20140121"
25
+ And I expect vcf.phasing to be "none"
26
+ And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
27
+ And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
28
+ And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
29
+ And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
30
+ And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}
31
+ And I expect vcf.meta to contain all header meta information
32
+
33
+ Scenario: When parsing the header of somatic_sniper.vcf
34
+
35
+ Do something
@@ -11,9 +11,11 @@
11
11
  require 'bio-vcf/utils'
12
12
  require 'bio-vcf/vcf'
13
13
  require 'bio-vcf/vcfsample'
14
+ require 'bio-vcf/vcfheader_line'
14
15
  require 'bio-vcf/vcfheader'
15
16
  require 'bio-vcf/vcfline'
16
17
  require 'bio-vcf/vcfgenotypefield'
17
18
  require 'bio-vcf/vcfrecord'
18
19
  require 'bio-vcf/variant'
19
20
  require 'bio-vcf/vcfstatistics'
21
+ require 'bio-vcf/bedfilter'
@@ -0,0 +1,43 @@
1
+ module BioVcf
2
+
3
+ class BedFilter
4
+ def initialize bedfilen
5
+ require 'binary_search/native'
6
+
7
+ # Parse Bed file and build up search array
8
+ chrs = {}
9
+ info = {}
10
+ File.open(bedfilen).each_line { | line |
11
+ (chr,start,stop,gene) = line.strip.split(/\t/)[0..3]
12
+ chrs[chr] ||= []
13
+ chrs[chr].push(stop.to_i)
14
+ info[chr+':'+stop] = [chr,start.to_i,stop.to_i,gene]
15
+ }
16
+ # Make sure chrs is sorted
17
+ @chrs = {}
18
+ chrs.each { | k,list |
19
+ @chrs[k] = list.sort
20
+ }
21
+ @info = info
22
+ end
23
+
24
+ def contains(rec)
25
+ stop_list = @chrs[rec.chrom]
26
+ if stop_list
27
+ pos = rec.pos
28
+ stop = stop_list.bsearch { |bedstop| bedstop >= pos }
29
+ if stop
30
+ rinfo = @info[rec.chrom+':'+stop.to_s]
31
+ raise "Unexpected error in BED record for #{rec.chrom}:#{stop} position" if rinfo == nil
32
+ start = rinfo[1]
33
+ if pos >= start
34
+ # p [rec.chrom,rec.pos,rinfo]
35
+ return rinfo
36
+ end
37
+ end
38
+ end
39
+ nil
40
+ end
41
+ end
42
+
43
+ end