RubyGems - bio-vcf - Versions diffs - 0.8.0 → 0.9.4 - Mend

bio-vcf 0.8.0 → 0.9.4

Files changed (85) hide show

checksums.yaml +5 -5
data/.travis.yml +1 -11
data/Gemfile +4 -5
data/Gemfile.lock +28 -65
data/LICENSE.txt +1 -1
data/README.md +387 -107
data/RELEASE_NOTES.md +20 -0
data/RELEASE_NOTES.md~ +11 -0
data/Rakefile +3 -40
data/TAGS +115 -0
data/VERSION +1 -1
data/bin/bio-vcf +176 -109
data/bio-vcf.gemspec +14 -70
data/features/cli.feature +22 -4
data/features/diff_count.feature +0 -1
data/features/filter.feature +12 -0
data/features/multisample.feature +25 -0
data/features/somaticsniper.feature +2 -0
data/features/step_definitions/cli-feature.rb +15 -6
data/features/step_definitions/diff_count.rb +1 -1
data/features/step_definitions/multisample.rb +19 -0
data/features/step_definitions/somaticsniper.rb +9 -1
data/features/step_definitions/vcf_header.rb +48 -0
data/features/support/env.rb +0 -9
data/features/vcf_header.feature +35 -0
data/lib/bio-vcf.rb +2 -0
data/lib/bio-vcf/bedfilter.rb +43 -0
data/lib/bio-vcf/pcows.rb +303 -0
data/lib/bio-vcf/template.rb +75 -0
data/lib/bio-vcf/vcffile.rb +46 -0
data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
data/lib/bio-vcf/vcfheader.rb +146 -6
data/lib/bio-vcf/vcfheader_line.rb +778 -0
data/lib/bio-vcf/vcfrecord.rb +56 -18
data/lib/bio-vcf/vcfsample.rb +27 -3
data/ragel/gen_vcfheaderline_parser.rl +165 -0
data/ragel/generate.sh +8 -0
data/template/vcf2json.erb +19 -7
data/template/vcf2json_full_header.erb +22 -0
data/template/vcf2json_use_meta.erb +41 -0
data/template/vcf2rdf_header.erb +24 -0
data/test/data/input/empty.vcf +2 -0
data/test/data/input/gatk_exome.vcf +237 -0
data/test/data/input/gatk_wgs.vcf +1000 -0
data/test/data/input/test.bed +632 -0
data/test/data/regression/empty-stderr.new +12 -0
data/test/data/regression/empty.new +2 -0
data/test/data/regression/empty.ref +2 -0
data/test/data/regression/eval_once-stderr.new +2 -0
data/test/data/regression/eval_once.new +1 -0
data/test/data/regression/eval_once.ref +1 -0
data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
data/test/data/regression/eval_r.info.dp.new +150 -0
data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
data/test/data/regression/ifilter_s.dp.new +31 -0
data/test/data/regression/pass1-stderr.new +10 -0
data/test/data/regression/pass1.new +88 -0
data/test/data/regression/pass1.ref +88 -0
data/test/data/regression/r.info.dp-stderr.new +4 -0
data/test/data/regression/r.info.dp.new +114 -0
data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
data/test/data/regression/rewrite.info.sample.new +150 -0
data/test/data/regression/s.dp-stderr.new +18 -0
data/test/data/regression/s.dp.new +145 -0
data/test/data/regression/seval_s.dp-stderr.new +10 -0
data/test/data/regression/seval_s.dp.new +36 -0
data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
data/test/data/regression/sfilter_seval_s.dp.new +31 -0
data/test/data/regression/thread4-stderr.new +10 -0
data/test/data/regression/thread4.new +150 -0
data/test/data/regression/thread4_4-stderr.new +25 -0
data/test/data/regression/thread4_4.new +130 -0
data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
data/test/data/regression/thread4_4_failed_filter.new +110 -0
data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
data/test/data/regression/vcf2json_full_header.new +225 -0
data/test/data/regression/vcf2json_full_header.ref +225 -0
data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
data/test/data/regression/vcf2json_use_meta.new +4697 -0
data/test/data/regression/vcf2json_use_meta.ref +4697 -0
data/test/performance/metrics.md +18 -1
data/test/stress/stress_test.sh +15 -0
data/test/tmp/test.vcf +12469 -0
metadata +65 -64

data/bio-vcf.gemspec CHANGED

@@ -1,16 +1,14 @@
-# Generated by jeweler
-# DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# No longer generated by jeweler
 # -*- encoding: utf-8 -*-
 Gem::Specification.new do |s|
   s.name = "bio-vcf"
-  s.version = "0.8.0"
+  s.version = File.read("VERSION")
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Pjotr Prins"]
-  s.date = "2014-09-19"
-  s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
+  # s.date = "2015-12-28"
+  s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
   s.email = "pjotr.public01@thebird.nl"
   s.executables = ["bio-vcf"]
   s.extra_rdoc_files = [
@@ -20,79 +18,25 @@ Gem::Specification.new do |s|
   s.files = [
     ".travis.yml",
     "Gemfile",
-    "Gemfile.lock",
     "LICENSE.txt",
     "README.md",
     "Rakefile",
     "VERSION",
     "bin/bio-vcf",
     "bio-vcf.gemspec",
-    "features/cli.feature",
-    "features/diff_count.feature",
-    "features/multisample.feature",
-    "features/sfilter.feature",
-    "features/somaticsniper.feature",
-    "features/step_definitions/bio-vcf_steps.rb",
-    "features/step_definitions/cli-feature.rb",
-    "features/step_definitions/diff_count.rb",
-    "features/step_definitions/multisample.rb",
-    "features/step_definitions/sfilter.rb",
-    "features/step_definitions/somaticsniper.rb",
-    "features/support/env.rb",
-    "lib/bio-vcf.rb",
-    "lib/bio-vcf/utils.rb",
-    "lib/bio-vcf/variant.rb",
-    "lib/bio-vcf/vcf.rb",
-    "lib/bio-vcf/vcfgenotypefield.rb",
-    "lib/bio-vcf/vcfheader.rb",
-    "lib/bio-vcf/vcfline.rb",
-    "lib/bio-vcf/vcfrdf.rb",
-    "lib/bio-vcf/vcfrecord.rb",
-    "lib/bio-vcf/vcfsample.rb",
-    "lib/bio-vcf/vcfstatistics.rb",
-    "template/gatk_vcf2rdf.erb",
-    "template/vcf2json.erb",
-    "template/vcf2rdf.erb",
-    "test/data/input/dbsnp.vcf",
-    "test/data/input/multisample.vcf",
-    "test/data/input/somaticsniper.vcf",
-    "test/data/regression/eval_r.info.dp.ref",
-    "test/data/regression/ifilter_s.dp.ref",
-    "test/data/regression/r.info.dp.ref",
-    "test/data/regression/rewrite.info.sample.ref",
-    "test/data/regression/s.dp.ref",
-    "test/data/regression/seval_s.dp.ref",
-    "test/data/regression/sfilter_seval_s.dp.ref",
-    "test/data/regression/thread4.ref",
-    "test/data/regression/thread4_4.ref",
-    "test/data/regression/thread4_4_failed_filter-stderr.ref",
-    "test/performance/metrics.md"
+    "ragel/gen_vcfheaderline_parser.rl",
+    "ragel/generate.sh",
   ]
-  s.homepage = "http://github.com/pjotrp/bioruby-vcf"
+  s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
+  s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
+             Dir['template/**/*']
+  s.homepage = "http://github.com/vcflib/bio-vcf"
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
-  s.rubygems_version = "2.0.3"
-  s.summary = "Fast multi-threaded VCF parser"
+  s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
+  # s.rubygems_version = "2.0.3"
+  s.summary = "Fast multi-purpose multi-threaded VCF parser"
-  if s.respond_to? :specification_version then
-    s.specification_version = 4
-    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_development_dependency(%q<rspec>, [">= 0"])
-      s.add_development_dependency(%q<cucumber>, [">= 0"])
-      s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
-      s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
-    else
-      s.add_dependency(%q<rspec>, [">= 0"])
-      s.add_dependency(%q<cucumber>, [">= 0"])
-      s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
-      s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
-    end
-  else
-    s.add_dependency(%q<rspec>, [">= 0"])
-    s.add_dependency(%q<cucumber>, [">= 0"])
-    s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
-    s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
-  end
 end

data/features/cli.feature CHANGED

@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
   Scenario: Test the info filter using dp and threads
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
-    Then I expect the named output to match the named output "thread4"
+    Then I expect the named output to match the named output "thread4" in under 30 seconds
   Scenario: Test the info filter using dp and threads with lines
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
-    Then I expect the named output to match the named output "thread4_4"
+    Then I expect the named output to match the named output "thread4_4" in under 30 seconds
   Scenario: Test the sample filter using dp
     Given I have input file(s) named "test/data/input/multisample.vcf"
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
     When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
     Then I expect the named output to match the named output "sfilter_seval_s.dp"
   Scenario: Rewrite an info field
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
     Then I expect the named output to match the named output "rewrite.info.sample"
+  Scenario: Test eval-once
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
+    Then I expect the named output to match the named output "eval_once"
+  Scenario: Test JSON output with header meta data
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
+    Then I expect the named output to match the named output "vcf2json_full_header"
+  Scenario: Test JSON output with header meta data and query samples
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
+    Then I expect the named output to match the named output "vcf2json_use_meta"
   Scenario: Test deadlock on failed filter with threads
     Given I have input file(s) named "test/data/input/multisample.vcf"
-    When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
+    When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
     Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
+  Scenario: Test VCF with no records
+    Given I have input file(s) named "test/data/input/empty.vcf"
+    When I execute "./bin/bio-vcf --timeout=5"
+    Then I expect no errors

data/features/diff_count.feature CHANGED

@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
     Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
     When I look for the difference
     Then I expect the diff to be [0,15,0,11]
-    And the relative diff to be [0,0.23,0,0.85]
     And I expect the defining tumor nucleotide to be "T"
     And I expect the tumor count to be 12
     When I set an inclusion threshold for the reference

data/features/filter.feature ADDED

@@ -0,0 +1,12 @@
+@filter
+Feature: Adding filters
+  bio-vcf can add soft filters. Rather than removing failing items we can
+  inject filter state into the FILTER field. To add state such as PASS or
+  LowDepth simply use a filter and the --add-filter switch. If a filter already
+  has state the new one is appended with a semi-colon.
+  Scenario: Test the info filter using dp and threads
+    Given I have input file(s) named "test/data/input/somaticsniper.vcf"
+    When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
+    Then I expect the named output to match the named output "pass1"

data/features/multisample.feature CHANGED

@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
     And I expect rec.info.ac to be 5
     And I expect rec.info.af to be 0.357
     And I expect rec.info.dp to be 1537
+    And I expect rec.info['dp'] to be 1537
     And I expect rec.info.readposranksum to be 0.815
+    And I expect rec.info['ReadPosRankSum'] to be 0.815
+    And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
     And I expect rec.sample['Original'].ad to be [189,25]
     And I expect rec.sample['Original'].gt to be "0/1"
     And I expect rec.sample['s3t2'].ad to be [167,26]
@@ -63,3 +66,25 @@ Feature: Multi-sample VCF
     And I expect rec.sample.s3t2? to be true
     And I expect rec.missing_samples? to be true
+    # Phased genotype
+    Given multisample vcf line
+    """
+1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL  0|1 ./. 1/1:2,2:4:6:66,6,0  1/1:4,1:5:3:36,3,0  ./. ./.  0/0:6,0:6:3:0,3,33
+    """
+    When I parse the record
+    Then I expect rec.pos to contain 10723
+    Then I expect rec.valid? to be true
+    And I expect r.original? to be true
+    And I expect r.original.gts? to be true
+    And I expect r.original.gts to be ["C","G"]
+    And I expect r.original.gts[0] to be "C"
+    And I expect r.original.gts[1] to be "G"
+    # INFO fields with matching tails
+    Given multisample vcf line
+    """
+1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL  0|1 ./. 1/1:2,2:4:6:66,6,0  1/1:4,1:5:3:36,3,0  ./. ./.  0/0:6,0:6:3:0,3,33
+    """
+    When I parse the record
+    Then I expect r.info.end to be 111
+    And I expect r.info.ciend to be 999

data/features/somaticsniper.feature CHANGED

@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
     And I expect rec.tumor.amq.to_ary to be [37,37]
     And I expect rec.tumor.mq to be 37
     And I expect rec.tumor.ss to be 2
+    And I expect rec.tumor.ssc to be 33
+    And I expect rec.normal.ssc to be nil
     # The following are additional functions
     And I expect rec.call_diff to be [-4,2,-2,0]
     And I expect rec.call_nuc to be "C"

data/features/step_definitions/cli-feature.rb CHANGED

@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
   @cmd = arg1 + ' < ' + @filenames[0]
 end
-Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
-  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
-end
+# Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
+#   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
+# end
-Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
-  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
-end
+# Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
+#   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
+# end
+# Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
+#   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
+# end
+# Then(/^I expect no errors$/) do
+#   RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
+# end

data/features/step_definitions/diff_count.rb CHANGED

@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
 end
 Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
-  res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
+  res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
   expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
 end

data/features/step_definitions/multisample.rb CHANGED

@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
   expect(@rec1.info.readposranksum).to eq 0.815
 end
+Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
+  expect(@rec1.info['dp']).to eq 1537
+end
+Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
+  expect(@rec1.info['ReadPosRankSum']).to eq 0.815
+end
+Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
+  expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
+end
 Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
   expect(@rec1.sample['Original'].gt).to eq "0/1"
 end
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
   expect(@rec1.original.gts[arg1.to_i]).to eq arg2
 end
+Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
+  expect(@rec1.info.end).to eq arg1.to_i
+end
+Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
+  expect(@rec1.info.ciend).to eq arg1.to_i
+end

data/features/step_definitions/somaticsniper.rb CHANGED

@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
 end
+Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
+  expect(@rec.tumor.ssc).to be 33
+end
+Then(/^I expect rec\.normal\.ssc to be nil$/) do
+  expect(@rec.normal.ssc).to be nil
+end
 Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
   expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
 end
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
 end
 Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
-  expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
+  expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
 end

data/features/step_definitions/vcf_header.rb ADDED

@@ -0,0 +1,48 @@
+Given(/^the VCF header lines$/) do |string|  # string: the doc-string block of header lines supplied by the scenario
+  header = VcfHeader.new
+  header.add string
+  @vcf = header  # kept in @vcf for the Then steps below
+end
+When(/^I parse the VCF header$/) do  # no-op step: the Given step already handed the lines to VcfHeader#add
+end
+Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do  # NOTE(review): pattern begins with [CHROM' (no opening quote); the step text in vcf_header.feature has the same spelling, so both must change together
+  expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
+end
+Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
+  expect(@vcf.fileformat).to eq arg1
+end
+Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
+  expect(@vcf.fileDate).to eq arg1
+end
+Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|  # NOTE(review): the dot in "vcf.field" is unescaped here, unlike the sibling patterns
+  expect(@vcf.field['fileDate']).to eq arg1
+end
+Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
+  expect(@vcf.phasing).to eq arg1
+end
+Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
+  expect(@vcf.reference).to eq arg1
+end
+Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|  # pattern has no trailing $ anchor, so text after the closing } is not checked
+  expect(@vcf.format[arg1].to_s).to eq arg2  # compares the to_s rendering with the literal {...} text captured from the step
+end
+Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|  # pattern has no trailing $ anchor, so text after the closing } is not checked
+  expect(@vcf.info[arg1].to_s).to eq arg2  # compares the to_s rendering with the literal {...} text captured from the step
+end
+Then(/^I expect vcf\.meta to contain all header meta information$/) do
+  m = @vcf.meta
+  expect(m['fileformat']).to eq "VCFv4.1"
+  expect(m['FORMAT']['DP']['Number']).to eq "1"
+  expect(m.size).to be 9  # NOTE(review): expected size is hard-coded for the header used in vcf_header.feature - confirm what the 9 entries are
+end

data/features/support/env.rb CHANGED

@@ -1,12 +1,3 @@
-require 'bundler'
-begin
-  Bundler.setup(:default, :development)
-rescue Bundler::BundlerError => e
-  $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
-end
 # require 'mini/test'
 $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')

data/features/vcf_header.feature ADDED

@@ -0,0 +1,35 @@
+@meta
+Feature: Parsing VCF meta information from the header
+  Take a header and parse that information as defined by the VCF standard.
+  Scenario: When parsing a header line
+    Given the VCF header lines
+    """
+##fileformat=VCFv4.1
+##fileDate=20140121
+##phasing=none
+##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
+##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
+##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	TUMOR
+    """
+    When I parse the VCF header
+    Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
+    And I expect vcf.fileformat to be "VCFv4.1"
+    And I expect vcf.fileDate to be "20140121"
+    And I expect vcf.field['fileDate'] to be "20140121"
+    And I expect vcf.phasing to be "none"
+    And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
+    And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
+    And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
+    And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
+    And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}'
+    And I expect vcf.meta to contain all header meta information
+  Scenario: When parsing the header of somatic_sniper.vcf
+    Do something

data/lib/bio-vcf.rb CHANGED

@@ -11,9 +11,11 @@
 require 'bio-vcf/utils'
 require 'bio-vcf/vcf'
 require 'bio-vcf/vcfsample'
+require 'bio-vcf/vcfheader_line'
 require 'bio-vcf/vcfheader'
 require 'bio-vcf/vcfline'
 require 'bio-vcf/vcfgenotypefield'
 require 'bio-vcf/vcfrecord'
 require 'bio-vcf/variant'
 require 'bio-vcf/vcfstatistics'
+require 'bio-vcf/bedfilter'

data/lib/bio-vcf/bedfilter.rb ADDED

@@ -0,0 +1,43 @@
+module BioVcf
+  class BedFilter  # Matches a record's chrom/pos against the regions read from a BED file
+    def initialize bedfilen  # bedfilen: path of a tab-separated file; columns 0..3 are read as chr, start, stop, gene
+      require 'binary_search/native'  # NOTE(review): the bsearch call in #contains looks like core Array#bsearch - confirm this gem is still needed
+      # Parse the BED file: collect stop positions per chromosome and index every region by "chr:stop"
+      chrs = {}
+      info = {}
+      File.open(bedfilen).each_line { | line |  # NOTE(review): the File object is never explicitly closed
+        (chr,start,stop,gene) = line.strip.split(/\t/)[0..3]
+        chrs[chr] ||= []
+        chrs[chr].push(stop.to_i)
+        info[chr+':'+stop] = [chr,start.to_i,stop.to_i,gene]  # a later region with the same chr and stop overwrites the earlier entry
+      }
+      # Sort every chromosome's stop list (bsearch in #contains needs sorted input)
+      @chrs = {}
+      chrs.each { | k,list |
+        @chrs[k] = list.sort
+      }
+      @info = info
+    end
+    def contains(rec)  # Returns [chr,start,stop,gene] for the region with the smallest stop >= rec.pos if its start <= rec.pos, else nil
+      stop_list = @chrs[rec.chrom]
+      if stop_list
+        pos = rec.pos
+        stop = stop_list.bsearch { |bedstop| bedstop >= pos }  # first stop at or beyond pos; nil when pos lies past every stop
+        if stop
+          rinfo = @info[rec.chrom+':'+stop.to_s]
+          raise "Unexpected error in BED record for #{rec.chrom}:#{stop} position" if rinfo == nil
+          start = rinfo[1]
+          if pos >= start
+            # p [rec.chrom,rec.pos,rinfo]
+            return rinfo
+          end
+        end
+      end
+      nil  # unknown chromosome, no stop >= pos, or pos before the candidate region's start
+    end
+  end
+end