bio-vcf 0.8.0 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +4 -5
  4. data/Gemfile.lock +28 -65
  5. data/LICENSE.txt +1 -1
  6. data/README.md +387 -107
  7. data/RELEASE_NOTES.md +20 -0
  8. data/RELEASE_NOTES.md~ +11 -0
  9. data/Rakefile +3 -40
  10. data/TAGS +115 -0
  11. data/VERSION +1 -1
  12. data/bin/bio-vcf +176 -109
  13. data/bio-vcf.gemspec +14 -70
  14. data/features/cli.feature +22 -4
  15. data/features/diff_count.feature +0 -1
  16. data/features/filter.feature +12 -0
  17. data/features/multisample.feature +25 -0
  18. data/features/somaticsniper.feature +2 -0
  19. data/features/step_definitions/cli-feature.rb +15 -6
  20. data/features/step_definitions/diff_count.rb +1 -1
  21. data/features/step_definitions/multisample.rb +19 -0
  22. data/features/step_definitions/somaticsniper.rb +9 -1
  23. data/features/step_definitions/vcf_header.rb +48 -0
  24. data/features/support/env.rb +0 -9
  25. data/features/vcf_header.feature +35 -0
  26. data/lib/bio-vcf.rb +2 -0
  27. data/lib/bio-vcf/bedfilter.rb +43 -0
  28. data/lib/bio-vcf/pcows.rb +303 -0
  29. data/lib/bio-vcf/template.rb +75 -0
  30. data/lib/bio-vcf/vcffile.rb +46 -0
  31. data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
  32. data/lib/bio-vcf/vcfheader.rb +146 -6
  33. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  34. data/lib/bio-vcf/vcfrecord.rb +56 -18
  35. data/lib/bio-vcf/vcfsample.rb +27 -3
  36. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  37. data/ragel/generate.sh +8 -0
  38. data/template/vcf2json.erb +19 -7
  39. data/template/vcf2json_full_header.erb +22 -0
  40. data/template/vcf2json_use_meta.erb +41 -0
  41. data/template/vcf2rdf_header.erb +24 -0
  42. data/test/data/input/empty.vcf +2 -0
  43. data/test/data/input/gatk_exome.vcf +237 -0
  44. data/test/data/input/gatk_wgs.vcf +1000 -0
  45. data/test/data/input/test.bed +632 -0
  46. data/test/data/regression/empty-stderr.new +12 -0
  47. data/test/data/regression/empty.new +2 -0
  48. data/test/data/regression/empty.ref +2 -0
  49. data/test/data/regression/eval_once-stderr.new +2 -0
  50. data/test/data/regression/eval_once.new +1 -0
  51. data/test/data/regression/eval_once.ref +1 -0
  52. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  53. data/test/data/regression/eval_r.info.dp.new +150 -0
  54. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  55. data/test/data/regression/ifilter_s.dp.new +31 -0
  56. data/test/data/regression/pass1-stderr.new +10 -0
  57. data/test/data/regression/pass1.new +88 -0
  58. data/test/data/regression/pass1.ref +88 -0
  59. data/test/data/regression/r.info.dp-stderr.new +4 -0
  60. data/test/data/regression/r.info.dp.new +114 -0
  61. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  62. data/test/data/regression/rewrite.info.sample.new +150 -0
  63. data/test/data/regression/s.dp-stderr.new +18 -0
  64. data/test/data/regression/s.dp.new +145 -0
  65. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  66. data/test/data/regression/seval_s.dp.new +36 -0
  67. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  68. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  69. data/test/data/regression/thread4-stderr.new +10 -0
  70. data/test/data/regression/thread4.new +150 -0
  71. data/test/data/regression/thread4_4-stderr.new +25 -0
  72. data/test/data/regression/thread4_4.new +130 -0
  73. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  74. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
  75. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  76. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  77. data/test/data/regression/vcf2json_full_header.new +225 -0
  78. data/test/data/regression/vcf2json_full_header.ref +225 -0
  79. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  80. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  81. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  82. data/test/performance/metrics.md +18 -1
  83. data/test/stress/stress_test.sh +15 -0
  84. data/test/tmp/test.vcf +12469 -0
  85. metadata +65 -64
@@ -1,16 +1,14 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
1
+ # No longer generated by jeweler
4
2
  # -*- encoding: utf-8 -*-
5
3
 
6
4
  Gem::Specification.new do |s|
7
5
  s.name = "bio-vcf"
8
- s.version = "0.8.0"
6
+ s.version = File.read("VERSION")
9
7
 
10
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
9
  s.authors = ["Pjotr Prins"]
12
- s.date = "2014-09-19"
13
- s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
10
+ # s.date = "2015-12-28"
11
+ s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
14
12
  s.email = "pjotr.public01@thebird.nl"
15
13
  s.executables = ["bio-vcf"]
16
14
  s.extra_rdoc_files = [
@@ -20,79 +18,25 @@ Gem::Specification.new do |s|
20
18
  s.files = [
21
19
  ".travis.yml",
22
20
  "Gemfile",
23
- "Gemfile.lock",
24
21
  "LICENSE.txt",
25
22
  "README.md",
26
23
  "Rakefile",
27
24
  "VERSION",
28
25
  "bin/bio-vcf",
29
26
  "bio-vcf.gemspec",
30
- "features/cli.feature",
31
- "features/diff_count.feature",
32
- "features/multisample.feature",
33
- "features/sfilter.feature",
34
- "features/somaticsniper.feature",
35
- "features/step_definitions/bio-vcf_steps.rb",
36
- "features/step_definitions/cli-feature.rb",
37
- "features/step_definitions/diff_count.rb",
38
- "features/step_definitions/multisample.rb",
39
- "features/step_definitions/sfilter.rb",
40
- "features/step_definitions/somaticsniper.rb",
41
- "features/support/env.rb",
42
- "lib/bio-vcf.rb",
43
- "lib/bio-vcf/utils.rb",
44
- "lib/bio-vcf/variant.rb",
45
- "lib/bio-vcf/vcf.rb",
46
- "lib/bio-vcf/vcfgenotypefield.rb",
47
- "lib/bio-vcf/vcfheader.rb",
48
- "lib/bio-vcf/vcfline.rb",
49
- "lib/bio-vcf/vcfrdf.rb",
50
- "lib/bio-vcf/vcfrecord.rb",
51
- "lib/bio-vcf/vcfsample.rb",
52
- "lib/bio-vcf/vcfstatistics.rb",
53
- "template/gatk_vcf2rdf.erb",
54
- "template/vcf2json.erb",
55
- "template/vcf2rdf.erb",
56
- "test/data/input/dbsnp.vcf",
57
- "test/data/input/multisample.vcf",
58
- "test/data/input/somaticsniper.vcf",
59
- "test/data/regression/eval_r.info.dp.ref",
60
- "test/data/regression/ifilter_s.dp.ref",
61
- "test/data/regression/r.info.dp.ref",
62
- "test/data/regression/rewrite.info.sample.ref",
63
- "test/data/regression/s.dp.ref",
64
- "test/data/regression/seval_s.dp.ref",
65
- "test/data/regression/sfilter_seval_s.dp.ref",
66
- "test/data/regression/thread4.ref",
67
- "test/data/regression/thread4_4.ref",
68
- "test/data/regression/thread4_4_failed_filter-stderr.ref",
69
- "test/performance/metrics.md"
27
+ "ragel/gen_vcfheaderline_parser.rl",
28
+ "ragel/generate.sh",
70
29
  ]
71
- s.homepage = "http://github.com/pjotrp/bioruby-vcf"
30
+ s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
31
+ s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
32
+ Dir['template/**/*']
33
+
34
+ s.homepage = "http://github.com/vcflib/bio-vcf"
72
35
  s.licenses = ["MIT"]
73
36
  s.require_paths = ["lib"]
74
- s.rubygems_version = "2.0.3"
75
- s.summary = "Fast multi-threaded VCF parser"
37
+ s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
38
+ # s.rubygems_version = "2.0.3"
39
+ s.summary = "Fast multi-purpose multi-threaded VCF parser"
76
40
 
77
- if s.respond_to? :specification_version then
78
- s.specification_version = 4
79
-
80
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
81
- s.add_development_dependency(%q<rspec>, [">= 0"])
82
- s.add_development_dependency(%q<cucumber>, [">= 0"])
83
- s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
84
- s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
85
- else
86
- s.add_dependency(%q<rspec>, [">= 0"])
87
- s.add_dependency(%q<cucumber>, [">= 0"])
88
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
89
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
90
- end
91
- else
92
- s.add_dependency(%q<rspec>, [">= 0"])
93
- s.add_dependency(%q<cucumber>, [">= 0"])
94
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
95
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
96
- end
97
41
  end
98
42
 
@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
11
11
  Scenario: Test the info filter using dp and threads
12
12
  Given I have input file(s) named "test/data/input/multisample.vcf"
13
13
  When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
14
- Then I expect the named output to match the named output "thread4"
14
+ Then I expect the named output to match the named output "thread4" in under 30 seconds
15
15
 
16
16
  Scenario: Test the info filter using dp and threads with lines
17
17
  Given I have input file(s) named "test/data/input/multisample.vcf"
18
18
  When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
19
- Then I expect the named output to match the named output "thread4_4"
19
+ Then I expect the named output to match the named output "thread4_4" in under 30 seconds
20
20
 
21
21
  Scenario: Test the sample filter using dp
22
22
  Given I have input file(s) named "test/data/input/multisample.vcf"
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
43
43
  When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
44
44
  Then I expect the named output to match the named output "sfilter_seval_s.dp"
45
45
 
46
-
47
46
  Scenario: Rewrite an info field
48
47
  Given I have input file(s) named "test/data/input/multisample.vcf"
49
48
  When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
50
49
  Then I expect the named output to match the named output "rewrite.info.sample"
51
50
 
51
+ Scenario: Test eval-once
52
+ Given I have input file(s) named "test/data/input/multisample.vcf"
53
+ When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
54
+ Then I expect the named output to match the named output "eval_once"
55
+
56
+ Scenario: Test JSON output with header meta data
57
+ Given I have input file(s) named "test/data/input/multisample.vcf"
58
+ When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
59
+ Then I expect the named output to match the named output "vcf2json_full_header"
60
+
61
+ Scenario: Test JSON output with header meta data and query samples
62
+ Given I have input file(s) named "test/data/input/multisample.vcf"
63
+ When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
64
+ Then I expect the named output to match the named output "vcf2json_use_meta"
65
+
52
66
  Scenario: Test deadlock on failed filter with threads
53
67
  Given I have input file(s) named "test/data/input/multisample.vcf"
54
- When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
68
+ When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
55
69
  Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
56
70
 
71
+ Scenario: Test VCF with no records
72
+ Given I have input file(s) named "test/data/input/empty.vcf"
73
+ When I execute "./bin/bio-vcf --timeout=5"
74
+ Then I expect no errors
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
21
21
  Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
22
22
  When I look for the difference
23
23
  Then I expect the diff to be [0,15,0,11]
24
- And the relative diff to be [0,0.23,0,0.85]
25
24
  And I expect the defining tumor nucleotide to be "T"
26
25
  And I expect the tumor count to be 12
27
26
  When I set an inclusion threshold for the reference
@@ -0,0 +1,12 @@
1
+ @filter
2
+ Feature: Adding filters
3
+
4
+ bio-vcf can add soft filters. Rather than removing failing items we can
5
+ inject filter state into the FILTER field. To add state such as PASS or
6
+ LowDepth simply use a filter and the --add-filter switch. If a filter already
7
+ has state the new one is appended with a semi-colon.
8
+
9
+ Scenario: Add the PASS filter state to records passing a sample depth filter
10
+ Given I have input file(s) named "test/data/input/somaticsniper.vcf"
11
+ When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
12
+ Then I expect the named output to match the named output "pass1"
@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
25
25
  And I expect rec.info.ac to be 5
26
26
  And I expect rec.info.af to be 0.357
27
27
  And I expect rec.info.dp to be 1537
28
+ And I expect rec.info['dp'] to be 1537
28
29
  And I expect rec.info.readposranksum to be 0.815
30
+ And I expect rec.info['ReadPosRankSum'] to be 0.815
31
+ And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
29
32
  And I expect rec.sample['Original'].ad to be [189,25]
30
33
  And I expect rec.sample['Original'].gt to be "0/1"
31
34
  And I expect rec.sample['s3t2'].ad to be [167,26]
@@ -63,3 +66,25 @@ Feature: Multi-sample VCF
63
66
  And I expect rec.sample.s3t2? to be true
64
67
  And I expect rec.missing_samples? to be true
65
68
 
69
+ # Phased genotype
70
+ Given multisample vcf line
71
+ """
72
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
73
+ """
74
+ When I parse the record
75
+ Then I expect rec.pos to contain 10723
76
+ Then I expect rec.valid? to be true
77
+ And I expect r.original? to be true
78
+ And I expect r.original.gts? to be true
79
+ And I expect r.original.gts to be ["C","G"]
80
+ And I expect r.original.gts[0] to be "C"
81
+ And I expect r.original.gts[1] to be "G"
82
+
83
+ # INFO fields with matching tails
84
+ Given multisample vcf line
85
+ """
86
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
87
+ """
88
+ When I parse the record
89
+ Then I expect r.info.end to be 111
90
+ And I expect r.info.ciend to be 999
@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
46
46
  And I expect rec.tumor.amq.to_ary to be [37,37]
47
47
  And I expect rec.tumor.mq to be 37
48
48
  And I expect rec.tumor.ss to be 2
49
+ And I expect rec.tumor.ssc to be 33
50
+ And I expect rec.normal.ssc to be nil
49
51
  # The following are additional functions
50
52
  And I expect rec.call_diff to be [-4,2,-2,0]
51
53
  And I expect rec.call_nuc to be "C"
@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
7
7
  @cmd = arg1 + ' < ' + @filenames[0]
8
8
  end
9
9
 
10
- Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
12
- end
10
+ # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
+ # end
13
13
 
14
- Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
15
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
16
- end
14
+ # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
+ # end
17
+
18
+
19
+ # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
+ # end
22
+
23
+ # Then(/^I expect no errors$/) do
24
+ # RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
25
+ # end
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
34
34
  end
35
35
 
36
36
  Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
37
- res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
37
+ res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
38
38
  expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
39
39
  end
40
40
 
@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
37
37
  expect(@rec1.info.readposranksum).to eq 0.815
38
38
  end
39
39
 
40
+ Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
41
+ expect(@rec1.info['dp']).to eq 1537
42
+ end
43
+
44
+ Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
45
+ expect(@rec1.info['ReadPosRankSum']).to eq 0.815
46
+ end
47
+
48
+ Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
49
+ expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
50
+ end
51
+
40
52
  Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
41
53
  expect(@rec1.sample['Original'].gt).to eq "0/1"
42
54
  end
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
161
173
  expect(@rec1.original.gts[arg1.to_i]).to eq arg2
162
174
  end
163
175
 
176
+ Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
177
+ expect(@rec1.info.end).to eq arg1.to_i
178
+ end
179
+
180
+ Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
181
+ expect(@rec1.info.ciend).to eq arg1.to_i
182
+ end
@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
99
99
  end
100
100
 
101
101
 
102
+ Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
103
+ expect(@rec.tumor.ssc).to be 33
104
+ end
105
+
106
+ Then(/^I expect rec\.normal\.ssc to be nil$/) do
107
+ expect(@rec.normal.ssc).to be nil
108
+ end
109
+
102
110
  Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
103
111
  expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
104
112
  end
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
116
124
  end
117
125
 
118
126
  Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
119
- expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
127
+ expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
120
128
  end
121
129
 
122
130
 
@@ -0,0 +1,48 @@
1
+ Given(/^the VCF header lines$/) do |string|
2
+ header = VcfHeader.new
3
+ header.add string
4
+ @vcf = header
5
+ end
6
+
7
+ When(/^I parse the VCF header$/) do
8
+ end
9
+
10
+ Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
11
+ expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
12
+ end
13
+
14
+ Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
15
+ expect(@vcf.fileformat).to eq arg1
16
+ end
17
+
18
+ Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
19
+ expect(@vcf.fileDate).to eq arg1
20
+ end
21
+
22
+ Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
23
+ expect(@vcf.field['fileDate']).to eq arg1
24
+ end
25
+
26
+ Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
27
+ expect(@vcf.phasing).to eq arg1
28
+ end
29
+
30
+ Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
31
+ expect(@vcf.reference).to eq arg1
32
+ end
33
+
34
+ Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
35
+ expect(@vcf.format[arg1].to_s).to eq arg2
36
+ end
37
+
38
+ Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
39
+ expect(@vcf.info[arg1].to_s).to eq arg2
40
+ end
41
+
42
+ Then(/^I expect vcf\.meta to contain all header meta information$/) do
43
+ m = @vcf.meta
44
+ expect(m['fileformat']).to eq "VCFv4.1"
45
+ expect(m['FORMAT']['DP']['Number']).to eq "1"
46
+ expect(m.size).to be 9
47
+ end
48
+
@@ -1,12 +1,3 @@
1
- require 'bundler'
2
- begin
3
- Bundler.setup(:default, :development)
4
- rescue Bundler::BundlerError => e
5
- $stderr.puts e.message
6
- $stderr.puts "Run `bundle install` to install missing gems"
7
- exit e.status_code
8
- end
9
-
10
1
  # require 'mini/test'
11
2
 
12
3
  $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
@@ -0,0 +1,35 @@
1
+ @meta
2
+ Feature: Parsing VCF meta information from the header
3
+
4
+ Take a header and parse that information as defined by the VCF standard.
5
+
6
+ Scenario: When parsing a header line
7
+
8
+ Given the VCF header lines
9
+ """
10
+ ##fileformat=VCFv4.1
11
+ ##fileDate=20140121
12
+ ##phasing=none
13
+ ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
14
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
15
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
16
+ ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
17
+ ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
18
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
19
+ """
20
+ When I parse the VCF header
21
+ Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
22
+ And I expect vcf.fileformat to be "VCFv4.1"
23
+ And I expect vcf.fileDate to be "20140121"
24
+ And I expect vcf.field['fileDate'] to be "20140121"
25
+ And I expect vcf.phasing to be "none"
26
+ And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
27
+ And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
28
+ And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
29
+ And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
30
+ And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}
31
+ And I expect vcf.meta to contain all header meta information
32
+
33
+ Scenario: When parsing the header of somatic_sniper.vcf
34
+
35
+ Do something
@@ -11,9 +11,11 @@
11
11
  require 'bio-vcf/utils'
12
12
  require 'bio-vcf/vcf'
13
13
  require 'bio-vcf/vcfsample'
14
+ require 'bio-vcf/vcfheader_line'
14
15
  require 'bio-vcf/vcfheader'
15
16
  require 'bio-vcf/vcfline'
16
17
  require 'bio-vcf/vcfgenotypefield'
17
18
  require 'bio-vcf/vcfrecord'
18
19
  require 'bio-vcf/variant'
19
20
  require 'bio-vcf/vcfstatistics'
21
+ require 'bio-vcf/bedfilter'
@@ -0,0 +1,43 @@
1
+ module BioVcf
2
+
3
+ class BedFilter
4
+ def initialize bedfilen
5
+ require 'binary_search/native'
6
+
7
+ # Parse Bed file and build up search array
8
+ chrs = {}
9
+ info = {}
10
+ File.open(bedfilen).each_line { | line |
11
+ (chr,start,stop,gene) = line.strip.split(/\t/)[0..3]
12
+ chrs[chr] ||= []
13
+ chrs[chr].push(stop.to_i)
14
+ info[chr+':'+stop] = [chr,start.to_i,stop.to_i,gene]
15
+ }
16
+ # Make sure chrs is sorted
17
+ @chrs = {}
18
+ chrs.each { | k,list |
19
+ @chrs[k] = list.sort
20
+ }
21
+ @info = info
22
+ end
23
+
24
+ def contains(rec)
25
+ stop_list = @chrs[rec.chrom]
26
+ if stop_list
27
+ pos = rec.pos
28
+ stop = stop_list.bsearch { |bedstop| bedstop >= pos }
29
+ if stop
30
+ rinfo = @info[rec.chrom+':'+stop.to_s]
31
+ raise "Unexpected error in BED record for #{rec.chrom}:#{stop} position" if rinfo == nil
32
+ start = rinfo[1]
33
+ if pos >= start
34
+ # p [rec.chrom,rec.pos,rinfo]
35
+ return rinfo
36
+ end
37
+ end
38
+ end
39
+ nil
40
+ end
41
+ end
42
+
43
+ end