RubyGems - bio-vcf - Versions diffs - 0.7.0 → 0.7.3 - Mend

bio-vcf 0.7.0 → 0.7.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (24)

checksums.yaml +4 -4
data/.travis.yml +3 -2
data/Gemfile +2 -5
data/Gemfile.lock +3 -3
data/README.md +101 -23
data/Rakefile +4 -2
data/VERSION +1 -1
data/bin/bio-vcf +133 -73
data/bio-vcf.gemspec +13 -10
data/features/cli.feature +9 -1
data/features/multisample.feature +4 -4
data/features/sfilter.feature +1 -1
data/features/step_definitions/cli-feature.rb +4 -0
data/features/step_definitions/multisample.rb +24 -12
data/features/step_definitions/sfilter.rb +80 -31
data/lib/bio-vcf.rb +1 -0
data/lib/bio-vcf/vcfgenotypefield.rb +45 -9
data/lib/bio-vcf/vcfheader.rb +1 -1
data/lib/bio-vcf/vcfrecord.rb +14 -8
data/lib/bio-vcf/vcfsample.rb +101 -152
data/lib/bio-vcf/vcfstatistics.rb +28 -0
data/test/data/regression/ifilter_s.dp.ref +31 -0
data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -0
metadata +16 -12

data/bio-vcf.gemspec CHANGED

@@ -5,12 +5,12 @@
 Gem::Specification.new do |s|
   s.name = "bio-vcf"
-  s.version = "0.7.0"
+  s.version = "0.7.3"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Pjotr Prins"]
-  s.date = "2014-06-24"
-  s.description = "Smart parser for VCF format"
+  s.date = "2014-09-01"
+  s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
   s.email = "pjotr.public01@thebird.nl"
   s.executables = ["bio-vcf"]
   s.extra_rdoc_files = [
@@ -49,10 +49,12 @@ Gem::Specification.new do |s|
     "lib/bio-vcf/vcfrdf.rb",
     "lib/bio-vcf/vcfrecord.rb",
     "lib/bio-vcf/vcfsample.rb",
+    "lib/bio-vcf/vcfstatistics.rb",
     "test/data/input/dbsnp.vcf",
     "test/data/input/multisample.vcf",
     "test/data/input/somaticsniper.vcf",
     "test/data/regression/eval_r.info.dp.ref",
+    "test/data/regression/ifilter_s.dp.ref",
     "test/data/regression/r.info.dp.ref",
     "test/data/regression/rewrite.info.sample.ref",
     "test/data/regression/s.dp.ref",
@@ -60,13 +62,14 @@ Gem::Specification.new do |s|
     "test/data/regression/sfilter_seval_s.dp.ref",
     "test/data/regression/thread4.ref",
     "test/data/regression/thread4_4.ref",
+    "test/data/regression/thread4_4_failed_filter-stderr.ref",
     "test/performance/metrics.md"
   ]
   s.homepage = "http://github.com/pjotrp/bioruby-vcf"
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
   s.rubygems_version = "2.0.3"
-  s.summary = "VCF parser"
+  s.summary = "Fast multi-threaded VCF parser"
   if s.respond_to? :specification_version then
     s.specification_version = 4
@@ -74,19 +77,19 @@ Gem::Specification.new do |s|
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
       s.add_development_dependency(%q<rspec>, [">= 0"])
       s.add_development_dependency(%q<cucumber>, [">= 0"])
-      s.add_development_dependency(%q<jeweler>, [">= 0"])
-      s.add_development_dependency(%q<regressiontest>, [">= 0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
+      s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
     else
       s.add_dependency(%q<rspec>, [">= 0"])
       s.add_dependency(%q<cucumber>, [">= 0"])
-      s.add_dependency(%q<jeweler>, [">= 0"])
-      s.add_dependency(%q<regressiontest>, [">= 0"])
+      s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
+      s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
     end
   else
     s.add_dependency(%q<rspec>, [">= 0"])
     s.add_dependency(%q<cucumber>, [">= 0"])
-    s.add_dependency(%q<jeweler>, [">= 0"])
-    s.add_dependency(%q<regressiontest>, [">= 0"])
+    s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
+    s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
   end
 end

data/features/cli.feature CHANGED

@@ -23,6 +23,11 @@ Feature: Command-line interface (CLI)
     When I execute "./bin/bio-vcf -i --sfilter 's.dp>20'"
     Then I expect the named output to match the named output "s.dp"
+  Scenario: Test the include sample filter using dp
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf -i --ifilter 's.dp>100' --seval s.dp"
+    Then I expect the named output to match the named output "ifilter_s.dp"
   Scenario: Test the info eval using dp
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf -i --eval 'r.info.dp'"
@@ -44,5 +49,8 @@ Feature: Command-line interface (CLI)
     When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
     Then I expect the named output to match the named output "rewrite.info.sample"
+  Scenario: Test deadlock on failed filter with threads
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
+    Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds

data/features/multisample.feature CHANGED

@@ -27,17 +27,17 @@ Feature: Multi-sample VCF
     And I expect rec.info.dp to be 1537
     And I expect rec.info.readposranksum to be 0.815
     And I expect rec.sample['Original'].ad to be [189,25]
-    And I expect rec.sample['Original'].gt to be [0,1]
+    And I expect rec.sample['Original'].gt to be "0/1"
     And I expect rec.sample['s3t2'].ad to be [167,26]
     And I expect rec.sample['s3t2'].dp to be 196
     And I expect rec.sample['s3t2'].gq to be 20
     And I expect rec.sample['s3t2'].pl to be [20,0,522]
     # And the nicer self resolving
-    And I expect rec.sample.original.gt to be [0,1]
+    And I expect rec.sample.original.gt to be "0/1"
     And I expect rec.sample.s3t2.pl to be [20,0,522]
     # And the even better
-    And I expect r.original.gt? to be true
-    And I expect rec.original.gt to be [0,1]
+    And I expect rec.original.gt? to be true
+    And I expect rec.original.gt to be "0/1"
     And I expect rec.s3t2.pl to be [20,0,522]
     # Check for missing data
     And I expect test rec.missing_samples? to be false

data/features/sfilter.feature CHANGED

@@ -35,7 +35,7 @@ Feature: Sample filters
     When I evaluate empty './.'
     Then I expect s.empty? to be true
     Then I expect s.dp? to be false
-    Then I expect s.dp to throw an error
+    Then I expect s.dp to be nil
     And sfilter 's.dp>4' to throw an error
   # Scenario: Missing sample with ignore missing set

data/features/step_definitions/cli-feature.rb CHANGED

@@ -10,3 +10,7 @@ end
 Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
 end
+Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
+  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
+end

data/features/step_definitions/multisample.rb CHANGED

@@ -37,6 +37,14 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
   expect(@rec1.info.readposranksum).to eq 0.815
 end
+Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
+  expect(@rec1.sample['Original'].gt).to eq "0/1"
+end
+Then(/^I expect rec\.original\.gt to be "(.*?)"$/) do |arg1|
+  expect(@rec1.original.gt).to eq "0/1"
+end
 Then(/^I expect rec\.sample\['Original'\]\.gt to be "(.*?)"$/) do |arg1|
   expect(@rec1.sample['Original'].gt).to eq "0/1"
 end
@@ -97,10 +105,6 @@ Then(/^I expect rec\.original\? to be true$/) do
   expect(@rec1.original?).to be true
 end
-Given(/^multisample vcf line with missing data$/) do |string|
-  pending # express the regexp above with the code you wish you had
-end
 Then(/^I expect rec\.original\? to be false$/) do
   expect(@rec1.original?).to eq false
 end
@@ -118,34 +122,42 @@ Then(/^I expect rec\.valid\? to be true$/) do
 end
 Then(/^I expect r\.original\.gt\? to be true$/) do
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original.gt?).to be true
 end
 Then(/^I expect r\.original\? to be true$/) do
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original?).to be true
+end
+Then(/^I expect rec\.original\? to be true$/) do
+  expect(@rec1.original?).to be true
+end
+Then(/^I expect rec\.original\.gt\? to be true$/) do
+  expect(@rec1.original.gt?).to be true
 end
 Then(/^I expect r\.original\.gti\? to be true$/) do
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original.gti?).to eq true
 end
 Then(/^I expect r\.original\.gti to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original.gti).to eq [arg1.to_i,arg2.to_i]
 end
 Then(/^I expect r\.original\.gti\[(\d+)\] to be (\d+)$/) do |arg1, arg2|
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original.gti[arg1.to_i]).to eq arg2.to_i
 end
 Then(/^I expect r\.original\.gts\? to be true$/) do
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original.gts?).to eq true
 end
 Then(/^I expect r\.original\.gts to be \["(.*?)","(.*?)"\]$/) do |arg1, arg2|
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original.gts).to eq [arg1,arg2]
 end
 Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
-  pending # express the regexp above with the code you wish you had
+  expect(@rec1.original.gts[arg1.to_i]).to eq arg2
 end

data/features/step_definitions/sfilter.rb CHANGED

@@ -1,90 +1,139 @@
 Given(/^the VCF line$/) do |string|
-  @header = nil
+  @header = VcfHeader.new
+  @header.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample")
   @vcfline = string
 end
 When(/^I evaluate '([^']+)'$/) do |arg1|
+  # concat VCF line with sample (arg1)
   @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
   @rec = VcfRecord.new(@fields,@header)
   p @rec
+  @g = @rec.sample['Sample']
+  p @g
+  expect(@g).not_to be nil
+  @s = VcfSample::Sample.new(@rec,@g)
+  @ignore_missing = false
 end
 Then(/^I expect s\.empty\? to be false$/) do
-  p @rec.sample[0]
   expect(@s.empty?).to be false
+  expect(@s.sfilter("s.empty?",do_cache: false)).to be false
 end
+Then(/^I expect s\.dp\? to be true$/) do
+  p ['eval s.dp?',@s.eval("s.dp?",do_cache: false)]
+  p ['eval s.dp',@s.eval("s.dp",do_cache: false)]
+  p @g.dp
+  p @s.dp
+  p @s.sfilter("s.dp?",do_cache: false)
+  expect(@s.eval("s.dp?",do_cache: false)).to be true
+end
 Then(/^I expect s\.dp to be (\d+)$/) do |arg1|
-  pending # express the regexp above with the code you wish you had
+  # p @s.eval("s.dp")
+  p :now
+  p ['eval s.dp?',@s.eval("s.dp?",do_cache: false)]
+  p ['eval s.dp',@s.eval("s.dp",do_cache: false)]
+  expect(@s.eval("s.dp",do_cache: false)).to equal arg1.to_i
 end
 Then(/^sfilter 's\.dp>(\d+)' to be true$/) do |arg1|
-  pending # express the regexp above with the code you wish you had
+  expect(@s.sfilter("dp>#{arg1}",do_cache: false)).to be true
+end
+When(/^I evaluate missing '([^']+)'$/) do |arg1|
+  # concat VCF line with sample (arg1)
+  @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
+  @rec = VcfRecord.new(@fields,@header)
+  p @rec
+  @g = @rec.sample['Sample']
+  @s = VcfSample::Sample.new(@rec,@g)
+  p @s
+  expect(@s).not_to be nil
+  @ignore_missing = false
 end
-When(/^I evaluate missing '(\d+)\/(\d+):(\d+),(\d+):\.:(\d+):(\d+),(\d+),(\d+)'$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8|
-  pending # express the regexp above with the code you wish you had
+Then(/^I expect s\.dp\? to be false$/) do
+  expect(@s.eval("s.dp?",do_cache: false)).to be false
 end
 Then(/^I expect s\.dp to be nil$/) do
-  pending # express the regexp above with the code you wish you had
+  expect(@s.eval("s.dp",ignore_missing_data: @ignore_missing, do_cache: false)).to be nil
 end
-Then(/^sfilter 's\.dp>(\d+)' to be false$/) do |arg1|
-  pending # express the regexp above with the code you wish you had
+Then(/^sfilter 's\.dp>(\d+)' to throw an error$/) do |arg1|
+  expect { @s.eval("s.dp>#{arg1}",do_cache: false) }.to raise_error NoMethodError
 end
-When(/^I evaluate empty '\.\/\.'$/) do
-  pending # express the regexp above with the code you wish you had
+Then(/^sfilter 's\.dp>(\d+)' to be false$/) do |arg1|
+   expect(@s.sfilter("s.dp>#{arg1}",ignore_missing_data: @ignore_missing, do_cache: false)).to be false
 end
-Then(/^sfilter 's\.dp>(\d+)' to throw an error$/) do |arg1|
-  pending # express the regexp above with the code you wish you had
+When(/^I evaluate empty '\.\/\.'$/) do
+  # concat VCF line with sample (arg1)
+  @fields = VcfLine.parse((@vcfline.split(/\s+/)+['./.']).join("\t"))
+  @rec = VcfRecord.new(@fields,@header)
+  p @rec
+  @g = @rec.sample['Sample']
+  @s = VcfSample::Sample.new(@rec,@g)
+  p @s
+  expect(@s).not_to be nil
+  @ignore_missing = false
 end
-When(/^I evaluate missing '(\d+)\/(\d+):(\d+),(\d+):\.:(\d+):(\d+),(\d+),(\d+)' with ignore missing$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8|
-  pending # express the regexp above with the code you wish you had
+When(/^I evaluate missing '([^']+)' with ignore missing$/) do |arg1|
+  # concat VCF line with sample (arg1)
+  @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
+  @rec = VcfRecord.new(@fields,@header)
+  p @rec
+  @g = @rec.sample['Sample']
+  @s = VcfSample::Sample.new(@rec,@g)
+  p @s
+  expect(@s).not_to be nil
+  @ignore_missing = true
 end
 Then(/^I expect s\.empty\? to be true$/) do
-  pending # express the regexp above with the code you wish you had
+  expect(@s.sfilter("s.empty?",do_cache: false)).to be true
 end
 Then(/^I expect s\.dp to throw an error$/) do
-  pending # express the regexp above with the code you wish you had
+  # @s.instance_eval { undef :dp }
+  p @s.eval("s.dp",do_cache: false)
+  expect { @s.eval("s.dp",do_cache: false) }.to raise_error NoMethodError
 end
 When(/^I evaluate empty '\.\/\.' with ignore missing$/) do
-  pending # express the regexp above with the code you wish you had
-end
-Then(/^I expect s\.dp\? to be true$/) do
-  pending # express the regexp above with the code you wish you had
-end
-Then(/^I expect s\.dp\? to be false$/) do
-  pending # express the regexp above with the code you wish you had
+  # concat VCF line with sample (arg1)
+  @fields = VcfLine.parse((@vcfline.split(/\s+/)+['./.']).join("\t"))
+  @rec = VcfRecord.new(@fields,@header)
+  p @rec
+  @g = @rec.sample['Sample']
+  @s = VcfSample::Sample.new(@rec,@g)
+  p @s
+  expect(@s).not_to be nil
+  @ignore_missing = true
 end
 Then(/^I expect s\.what\? to throw an error$/) do
-  pending # express the regexp above with the code you wish you had
+  expect { @s.eval("s.what?",do_cache: false) }.to raise_error RuntimeError
 end
 Then(/^I expect s\.what to throw an error$/) do
-  pending # express the regexp above with the code you wish you had
+  expect { @s.eval("s.what",do_cache: false) }.to raise_error NoMethodError
 end
 Then(/^I expect r\.chrom to be "(.*?)"$/) do |arg1|
-  pending # express the regexp above with the code you wish you had
+  expect(@s.eval("r.chrom",do_cache: false)).to eq "1"
 end
 Then(/^I expect r\.alt to be \["(.*?)"\]$/) do |arg1|
-  pending # express the regexp above with the code you wish you had
+  expect(@s.eval("r.alt",do_cache: false)).to eq ["G"]
 end
 Then(/^I expect r\.info\.af to be (\d+)\.(\d+)$/) do |arg1, arg2|
-  pending # express the regexp above with the code you wish you had
+  expect(@s.eval("r.info.af",do_cache: false)).to eq 0.667
 end

data/lib/bio-vcf.rb CHANGED

@@ -16,3 +16,4 @@ require 'bio-vcf/vcfline'
 require 'bio-vcf/vcfgenotypefield'
 require 'bio-vcf/vcfrecord'
 require 'bio-vcf/variant'
+require 'bio-vcf/vcfstatistics'

data/lib/bio-vcf/vcfgenotypefield.rb CHANGED

@@ -2,6 +2,15 @@ module BioVcf
   MAXINT=100_000
+  class ValueError < Exception
+  end
+  module VcfValue
+    def VcfValue::empty? v
+      v == nil or v == '' or v == '.'
+    end
+  end
   # Helper class for a list of (variant) values, such as A,G.
   # The [] function does the hard work. You can pass in an index (integer)
   # or nucleotide which translates to an index.
@@ -95,11 +104,12 @@ module BioVcf
     attr_reader :format, :values, :header
-    def initialize s, format, header, alt
-      @is_empty = (s == '' or s == nil or s == './.')
+    def initialize s, format, header, ref, alt
+      @is_empty = VcfSample::empty?(s)
       @original_s = s
       @format = format
       @header = header
+      @ref = ref
       @alt = alt
     end
@@ -116,7 +126,7 @@ module BioVcf
     end
     def valid?
-      !@is_empty
+      !empty?
     end
     def dp4
@@ -141,14 +151,33 @@ module BioVcf
       VcfAltInfoList.new(@alt,values[fetch('AMQ')])
     end
+    def gti?
+      not VcfValue::empty?(fetch_value("GT"))
+    end
+    def gti
+      gt.split('/').map { |g| g.to_i }
+    end
+    def gts?
+      not VcfValue::empty?(fetch_value("GT"))
+    end
+    def gts
+      genotypes = [@ref] + @alt
+      gti.map { |i| genotypes[i] }
+    end
+    # Returns the value of a field
     def method_missing(m, *args, &block)
       return nil if @is_empty
       if m =~ /\?$/
-        # query if a value exists, e.g., r.info.dp?
+        # query if a value exists, e.g., r.info.dp? or s.dp?
         v = values[fetch(m.to_s.upcase.chop)]
-        v != nil
+        return (not VcfValue::empty?(v))
       else
         v = values[fetch(m.to_s.upcase)]
+        return nil if VcfValue::empty?(v)
         v = v.to_i if v =~ /^\d+$/
         v = v.to_f if v =~ /^\d+\.\d+$/
         v
@@ -157,13 +186,19 @@ module BioVcf
   private
+    # Fetch a value and throw an error if it does not exist
     def fetch name
       raise "ERROR: Field with name #{name} does not exist!" if !@format[name]
       @format[name]
     end
+    def fetch_value name
+      values[fetch(name)]
+    end
+    # Return an integer list
     def ilist name
-      v = values[fetch(name)]
+      v = fetch_value(name)
       return nil if not v
       v.split(',').map{|i| i.to_i}
     end
@@ -172,17 +207,18 @@ module BioVcf
   # Holds all samples
   class VcfGenotypeFields
-    def initialize fields, format, header, alt
+    def initialize fields, format, header, ref, alt
       @fields = fields
       @format = format
       @header = header
+      @ref = ref
       @alt = alt
       @samples = {} # lazy cache
       @sample_index = @header.sample_index()
     end
     def [] name
-      @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
+      @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
     end
     def method_missing(m, *args, &block)
@@ -191,7 +227,7 @@ module BioVcf
         # test for valid sample
         return !VcfSample::empty?(@fields[@sample_index[name.chop]])
       else
-        @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
+        @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
       end
     end