RubyGems - bio-vcf - Versions diffs - 0.8.1 → 0.9.5 - Mend

bio-vcf 0.8.1 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

checksums.yaml +5 -5
data/.travis.yml +1 -11
data/Gemfile +2 -8
data/LICENSE.txt +1 -1
data/README.md +467 -129
data/RELEASE_NOTES.md +27 -0
data/RELEASE_NOTES.md~ +11 -0
data/Rakefile +9 -42
data/TAGS +115 -0
data/VERSION +1 -1
data/bin/bio-vcf +156 -108
data/bio-vcf.gemspec +13 -75
data/features/cli.feature +22 -4
data/features/diff_count.feature +0 -1
data/features/filter.feature +12 -0
data/features/multisample.feature +12 -0
data/features/somaticsniper.feature +2 -0
data/features/step_definitions/cli-feature.rb +15 -6
data/features/step_definitions/diff_count.rb +1 -1
data/features/step_definitions/multisample.rb +19 -0
data/features/step_definitions/somaticsniper.rb +9 -1
data/features/step_definitions/vcf_header.rb +48 -0
data/features/support/env.rb +1 -11
data/features/vcf_header.feature +35 -0
data/lib/bio-vcf.rb +1 -0
data/lib/bio-vcf/pcows.rb +303 -0
data/lib/bio-vcf/vcffile.rb +46 -0
data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
data/lib/bio-vcf/vcfheader.rb +137 -5
data/lib/bio-vcf/vcfheader_line.rb +778 -0
data/lib/bio-vcf/vcfrecord.rb +56 -18
data/lib/bio-vcf/vcfsample.rb +26 -2
data/lib/regressiontest.rb +11 -0
data/lib/regressiontest/cli_exec.rb +101 -0
data/ragel/gen_vcfheaderline_parser.rl +165 -0
data/ragel/generate.sh +8 -0
data/template/vcf2json.erb +16 -16
data/template/vcf2json_full_header.erb +22 -0
data/template/vcf2json_use_meta.erb +41 -0
data/test/data/input/empty.vcf +2 -0
data/test/data/input/gatk_exome.vcf +237 -0
data/test/data/input/gatk_wgs.vcf +1000 -0
data/test/data/input/test.bed +632 -0
data/test/data/regression/empty-stderr.new +12 -0
data/test/data/regression/empty.new +2 -0
data/test/data/regression/empty.ref +2 -0
data/test/data/regression/eval_once-stderr.new +2 -0
data/test/data/regression/eval_once.new +1 -0
data/test/data/regression/eval_once.ref +1 -0
data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
data/test/data/regression/eval_r.info.dp.new +150 -0
data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
data/test/data/regression/ifilter_s.dp.new +31 -0
data/test/data/regression/pass1-stderr.new +10 -0
data/test/data/regression/pass1.new +88 -0
data/test/data/regression/pass1.ref +88 -0
data/test/data/regression/r.info.dp-stderr.new +4 -0
data/test/data/regression/r.info.dp.new +114 -0
data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
data/test/data/regression/rewrite.info.sample.new +150 -0
data/test/data/regression/s.dp-stderr.new +18 -0
data/test/data/regression/s.dp.new +145 -0
data/test/data/regression/seval_s.dp-stderr.new +10 -0
data/test/data/regression/seval_s.dp.new +36 -0
data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
data/test/data/regression/sfilter_seval_s.dp.new +31 -0
data/test/data/regression/thread4-stderr.new +10 -0
data/test/data/regression/thread4.new +150 -0
data/test/data/regression/thread4_4-stderr.new +25 -0
data/test/data/regression/thread4_4.new +130 -0
data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
data/test/data/regression/thread4_4_failed_filter.new +110 -0
data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
data/test/data/regression/vcf2json_full_header.new +225 -0
data/test/data/regression/vcf2json_full_header.ref +225 -0
data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
data/test/data/regression/vcf2json_use_meta.new +4697 -0
data/test/data/regression/vcf2json_use_meta.ref +4697 -0
data/test/performance/metrics.md +18 -1
data/test/stress/stress_test.sh +15 -0
data/test/tmp/test.vcf +12469 -0
metadata +63 -64
data/Gemfile.lock +0 -81

data/bio-vcf.gemspec CHANGED

@@ -1,18 +1,13 @@
-# Generated by jeweler
-# DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# No longer generated by jeweler
 # -*- encoding: utf-8 -*-
-# stub: bio-vcf 0.8.1 ruby lib
 Gem::Specification.new do |s|
   s.name = "bio-vcf"
-  s.version = "0.8.1"
+  s.version = File.read("VERSION")
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
-  s.require_paths = ["lib"]
   s.authors = ["Pjotr Prins"]
-  s.date = "2014-11-26"
-  s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
+  s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
   s.email = "pjotr.public01@thebird.nl"
   s.executables = ["bio-vcf"]
   s.extra_rdoc_files = [
@@ -22,82 +17,25 @@ Gem::Specification.new do |s|
   s.files = [
     ".travis.yml",
     "Gemfile",
-    "Gemfile.lock",
     "LICENSE.txt",
     "README.md",
     "Rakefile",
     "VERSION",
     "bin/bio-vcf",
     "bio-vcf.gemspec",
-    "features/cli.feature",
-    "features/diff_count.feature",
-    "features/multisample.feature",
-    "features/sfilter.feature",
-    "features/somaticsniper.feature",
-    "features/step_definitions/bio-vcf_steps.rb",
-    "features/step_definitions/cli-feature.rb",
-    "features/step_definitions/diff_count.rb",
-    "features/step_definitions/multisample.rb",
-    "features/step_definitions/sfilter.rb",
-    "features/step_definitions/somaticsniper.rb",
-    "features/support/env.rb",
-    "lib/bio-vcf.rb",
-    "lib/bio-vcf/bedfilter.rb",
-    "lib/bio-vcf/template.rb",
-    "lib/bio-vcf/utils.rb",
-    "lib/bio-vcf/variant.rb",
-    "lib/bio-vcf/vcf.rb",
-    "lib/bio-vcf/vcfgenotypefield.rb",
-    "lib/bio-vcf/vcfheader.rb",
-    "lib/bio-vcf/vcfline.rb",
-    "lib/bio-vcf/vcfrdf.rb",
-    "lib/bio-vcf/vcfrecord.rb",
-    "lib/bio-vcf/vcfsample.rb",
-    "lib/bio-vcf/vcfstatistics.rb",
-    "template/gatk_vcf2rdf.erb",
-    "template/vcf2json.erb",
-    "template/vcf2rdf.erb",
-    "template/vcf2rdf_header.erb",
-    "test/data/input/dbsnp.vcf",
-    "test/data/input/multisample.vcf",
-    "test/data/input/somaticsniper.vcf",
-    "test/data/regression/eval_r.info.dp.ref",
-    "test/data/regression/ifilter_s.dp.ref",
-    "test/data/regression/r.info.dp.ref",
-    "test/data/regression/rewrite.info.sample.ref",
-    "test/data/regression/s.dp.ref",
-    "test/data/regression/seval_s.dp.ref",
-    "test/data/regression/sfilter_seval_s.dp.ref",
-    "test/data/regression/thread4.ref",
-    "test/data/regression/thread4_4.ref",
-    "test/data/regression/thread4_4_failed_filter-stderr.ref",
-    "test/performance/metrics.md"
+    "ragel/gen_vcfheaderline_parser.rl",
+    "ragel/generate.sh",
   ]
-  s.homepage = "http://github.com/pjotrp/bioruby-vcf"
+  s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
+  s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
+             Dir['template/**/*']
+  s.homepage = "http://github.com/vcflib/bio-vcf"
   s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
   s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
-  s.rubygems_version = "2.2.2"
-  s.summary = "Fast multi-threaded VCF parser"
-  if s.respond_to? :specification_version then
-    s.specification_version = 4
+  # s.rubygems_version = "2.0.3"
+  s.summary = "Fast multi-purpose multi-threaded VCF parser"
-    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_development_dependency(%q<rspec>, [">= 0"])
-      s.add_development_dependency(%q<cucumber>, [">= 0"])
-      s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
-      s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
-    else
-      s.add_dependency(%q<rspec>, [">= 0"])
-      s.add_dependency(%q<cucumber>, [">= 0"])
-      s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
-      s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
-    end
-  else
-    s.add_dependency(%q<rspec>, [">= 0"])
-    s.add_dependency(%q<cucumber>, [">= 0"])
-    s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
-    s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
-  end
 end

data/features/cli.feature CHANGED

@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
   Scenario: Test the info filter using dp and threads
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
-    Then I expect the named output to match the named output "thread4"
+    Then I expect the named output to match the named output "thread4" in under 30 seconds
   Scenario: Test the info filter using dp and threads with lines
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
-    Then I expect the named output to match the named output "thread4_4"
+    Then I expect the named output to match the named output "thread4_4" in under 30 seconds
   Scenario: Test the sample filter using dp
     Given I have input file(s) named "test/data/input/multisample.vcf"
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
     When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
     Then I expect the named output to match the named output "sfilter_seval_s.dp"
   Scenario: Rewrite an info field
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
     Then I expect the named output to match the named output "rewrite.info.sample"
+  Scenario: Test eval-once
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
+    Then I expect the named output to match the named output "eval_once"
+  Scenario: Test JSON output with header meta data
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
+    Then I expect the named output to match the named output "vcf2json_full_header"
+  Scenario: Test JSON output with header meta data and query samples
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
+    Then I expect the named output to match the named output "vcf2json_use_meta"
   Scenario: Test deadlock on failed filter with threads
     Given I have input file(s) named "test/data/input/multisample.vcf"
-    When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
+    When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
     Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
+  Scenario: Test VCF with no records
+    Given I have input file(s) named "test/data/input/empty.vcf"
+    When I execute "./bin/bio-vcf --timeout=5"
+    Then I expect no errors

data/features/diff_count.feature CHANGED

@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
     Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
     When I look for the difference
     Then I expect the diff to be [0,15,0,11]
-    And the relative diff to be [0,0.23,0,0.85]
     And I expect the defining tumor nucleotide to be "T"
     And I expect the tumor count to be 12
     When I set an inclusion threshold for the reference

data/features/filter.feature ADDED

@@ -0,0 +1,12 @@
+@filter
+Feature: Adding filters
+  bio-vcf can add soft filters. Rather than removing failing items we can
+  inject filter state into the FILTER field. To add state such as PASS or
+  LowDepth simply use a filter and the --add-filter switch. If a filter already
+  has state the new one is appended with a semi-colon.
+  Scenario: Test the info filter using dp and threads
+    Given I have input file(s) named "test/data/input/somaticsniper.vcf"
+    When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
+    Then I expect the named output to match the named output "pass1"

data/features/multisample.feature CHANGED

@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
     And I expect rec.info.ac to be 5
     And I expect rec.info.af to be 0.357
     And I expect rec.info.dp to be 1537
+    And I expect rec.info['dp'] to be 1537
     And I expect rec.info.readposranksum to be 0.815
+    And I expect rec.info['ReadPosRankSum'] to be 0.815
+    And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
     And I expect rec.sample['Original'].ad to be [189,25]
     And I expect rec.sample['Original'].gt to be "0/1"
     And I expect rec.sample['s3t2'].ad to be [167,26]
@@ -76,3 +79,12 @@ Feature: Multi-sample VCF
     And I expect r.original.gts to be ["C","G"]
     And I expect r.original.gts[0] to be "C"
     And I expect r.original.gts[1] to be "G"
+    # INFO fields with matching tails
+    Given multisample vcf line
+    """
+1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL  0|1 ./. 1/1:2,2:4:6:66,6,0  1/1:4,1:5:3:36,3,0  ./. ./.  0/0:6,0:6:3:0,3,33
+    """
+    When I parse the record
+    Then I expect r.info.end to be 111
+    And I expect r.info.ciend to be 999

data/features/somaticsniper.feature CHANGED

@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
     And I expect rec.tumor.amq.to_ary to be [37,37]
     And I expect rec.tumor.mq to be 37
     And I expect rec.tumor.ss to be 2
+    And I expect rec.tumor.ssc to be 33
+    And I expect rec.normal.ssc to be nil
     # The following are additional functions
     And I expect rec.call_diff to be [-4,2,-2,0]
     And I expect rec.call_nuc to be "C"

data/features/step_definitions/cli-feature.rb CHANGED

@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
   @cmd = arg1 + ' < ' + @filenames[0]
 end
-Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
-  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
-end
+# Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
+#   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
+# end
-Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
-  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
-end
+# Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
+#   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
+# end
+# Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
+#   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
+# end
+# Then(/^I expect no errors$/) do
+#   RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
+# end

data/features/step_definitions/diff_count.rb CHANGED

@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
 end
 Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
-  res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
+  res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
   expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
 end

data/features/step_definitions/multisample.rb CHANGED

@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
   expect(@rec1.info.readposranksum).to eq 0.815
 end
+Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
+  expect(@rec1.info['dp']).to eq 1537
+end
+Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
+  expect(@rec1.info['ReadPosRankSum']).to eq 0.815
+end
+Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
+  expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
+end
 Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
   expect(@rec1.sample['Original'].gt).to eq "0/1"
 end
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
   expect(@rec1.original.gts[arg1.to_i]).to eq arg2
 end
+Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
+  expect(@rec1.info.end).to eq arg1.to_i
+end
+Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
+  expect(@rec1.info.ciend).to eq arg1.to_i
+end

data/features/step_definitions/somaticsniper.rb CHANGED

@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
 end
+Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
+  expect(@rec.tumor.ssc).to be 33
+end
+Then(/^I expect rec\.normal\.ssc to be nil$/) do
+  expect(@rec.normal.ssc).to be nil
+end
 Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
   expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
 end
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
 end
 Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
-  expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
+  expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
 end

data/features/step_definitions/vcf_header.rb ADDED

@@ -0,0 +1,48 @@
+Given(/^the VCF header lines$/) do |string|
+  header = VcfHeader.new
+  header.add string
+  @vcf = header
+end
+When(/^I parse the VCF header$/) do
+end
+Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
+  expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
+end
+Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
+  expect(@vcf.fileformat).to eq arg1
+end
+Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
+  expect(@vcf.fileDate).to eq arg1
+end
+Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
+  expect(@vcf.field['fileDate']).to eq arg1
+end
+Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
+  expect(@vcf.phasing).to eq arg1
+end
+Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
+  expect(@vcf.reference).to eq arg1
+end
+Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
+  expect(@vcf.format[arg1].to_s).to eq arg2
+end
+Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
+  expect(@vcf.info[arg1].to_s).to eq arg2
+end
+Then(/^I expect vcf\.meta to contain all header meta information$/) do
+  m = @vcf.meta
+  expect(m['fileformat']).to eq "VCFv4.1"
+  expect(m['FORMAT']['DP']['Number']).to eq "1"
+  expect(m.size).to be 9
+end

data/features/support/env.rb CHANGED

@@ -1,13 +1,3 @@
-require 'bundler'
-begin
-  Bundler.setup(:default, :development)
-rescue Bundler::BundlerError => e
-  $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
-end
-# require 'mini/test'
 $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
 require 'bio-vcf'
@@ -16,7 +6,7 @@ require 'rspec/expectations'
 # Add the regression module if in the path (it can also be a gem)
 rootdir = File.dirname(__FILE__) + '/../..'
-$LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
+$LOAD_PATH.unshift(rootdir+'/lib/regressiontest',rootdir+'/../regressiontest/lib')
 require 'regressiontest'
 include BioVcf

data/features/vcf_header.feature ADDED

@@ -0,0 +1,35 @@
+@meta
+Feature: Parsing VCF meta information from the header
+  Take a header and parse that information as defined by the VCF standard.
+  Scenario: When parsing a header line
+    Given the VCF header lines
+    """
+##fileformat=VCFv4.1
+##fileDate=20140121
+##phasing=none
+##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
+##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
+##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	TUMOR
+    """
+    When I parse the VCF header
+    Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
+    And I expect vcf.fileformat to be "VCFv4.1"
+    And I expect vcf.fileDate to be "20140121"
+    And I expect vcf.field['fileDate'] to be "20140121"
+    And I expect vcf.phasing to be "none"
+    And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
+    And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
+    And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
+    And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
+    And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}
+    And I expect vcf.meta to contain all header meta information
+  Scenario: When parsing the header of somatic_sniper.vcf
+    Do something

data/lib/bio-vcf.rb CHANGED

@@ -11,6 +11,7 @@
 require 'bio-vcf/utils'
 require 'bio-vcf/vcf'
 require 'bio-vcf/vcfsample'
+require 'bio-vcf/vcfheader_line'
 require 'bio-vcf/vcfheader'
 require 'bio-vcf/vcfline'
 require 'bio-vcf/vcfgenotypefield'

data/lib/bio-vcf/pcows.rb ADDED

@@ -0,0 +1,303 @@
+# Parallel copy-on-write streaming (PCOWS)
+require 'tempfile'
+class PCOWS
+  RUNNINGEXT = 'part' # file extension
+  def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false)
+    num_threads = cpu_count() if not num_threads # FIXME: set to cpu_num by default
+    # $stderr.print "Using ",num_threads,"threads \n"
+    @num_threads = num_threads
+    @chunk_size = chunk_size
+    @pid_list = []
+    @name = name
+    @timeout = timeout
+    @quiet = quiet
+    @debug = debug
+    if @debug
+      $stderr.print "PCOWS running in DEBUG MODE\n"
+    end
+    if multi_threaded
+      @tmpdir =  Dir::mktmpdir(@name+'_')
+    end
+    @last_output = 0 # counter
+    @output_locked = false
+  end
+  # Feed the worker 'func and state' to COWS. Note that func is a
+  # lambda closure so it can pick up surrounding scope at invocation
+  # in addition to the data captured in 'state'.
+  def submit_worker(func,state)
+    pid = nil
+    if multi_threaded
+      count = @pid_list.size+1
+      fn = mktmpfilename(count)
+      pid = fork do
+        # ---- This is running a new copy-on-write process
+        tempfn = fn+'.'+RUNNINGEXT
+        STDOUT.reopen(File.open(tempfn, 'w+'))
+        func.call(state).each { | line | print line }
+        STDOUT.flush
+        STDOUT.close
+        # sleep 0.1
+        # f.flush
+        # f.close
+        # sleep 0.2  # interval to make sure we are done writing,
+                   # otherwise there may be misses at the end of a
+                   # block (maybe the f.close fixed it)
+        FileUtils::mv(tempfn,fn)
+        exit(0)
+      end
+      Process.detach(pid)
+    else
+      # ---- Single threaded: call in main process and output immediately
+      func.call(state).each { | line | print line }
+    end
+    @pid_list << [ pid,count,fn ]
+    return true
+  end
+  def submit_final_worker(func,state)
+    @final_worker = true
+    submit_worker(func,state)
+  end
+  # Make sure no more than num_threads are running at the same time -
+  # this is achieved by checking the PID table and the running files
+  # in the tmpdir
+  def wait_for_worker_slot()
+    return if single_threaded
+    Timeout.timeout(@timeout) do
+      printed_timeout_message = false
+      while true
+        # ---- count running pids
+        running = @pid_list.reduce(0) do | sum, info |
+          (pid,count,fn) = info
+          if pid_or_file_running?(pid,fn)
+            sum+1
+          else
+            sum
+          end
+        end
+        return if running < @num_threads
+        if not printed_timeout_message
+          $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
+          printed_timeout_message = true
+        end
+        sleep 0.1
+      end
+    end
+  end
+  # ---- In this section the output gets collected and passed on to a
+  #      printer thread. This function makes sure the printing is
+  #      ordered and that no printers are running at the same
+  #      time. The printer thread should be doing as little processing
+  #      as possible.
+  #
+  #      In this implementation type==:by_line will call func for
+  #      each line. Otherwise it is called once with the filename.
+  def process_output(func=nil,type=:by_line, blocking=false)
+    return if single_threaded
+    output = lambda { |fn|
+      if type == :by_line
+        File.new(fn).each_line { |buf|
+          print buf
+        }
+      else
+        func.call(fn)
+      end
+    }
+    if @output_locked
+      # ---- is the other thread still running? We wait until it
+      #      is finished to start the next one
+      (pid,count,fn) = @output_locked
+      $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
+      return if File.exist?(fn)  # continue because thread still processing
+      # Now we should remove the .keep file
+      cleanup_keep_file(fn)
+      @last_output += 1          # get next one in line
+      @output_locked = false
+    end
+    # ---- process the next output chunk. After completion it
+    #      gets renamed to chunk.keep. This to avoid missing
+    #      output (if we unlink the file prematurely)
+    if info = @pid_list[@last_output]
+      (pid,count,fn) = info
+      $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
+      if File.exist?(fn)
+        # Yes! We have the next output, create outputter
+        @output_locked = info
+        $stderr.print "Set lock on ",[info],"\n" if not @quiet
+        if not blocking
+          $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
+          pid = fork do
+            output.call(fn)
+            # after finishing output move it to .keep
+            FileUtils::mv(fn,fn+'.keep')
+            exit(0)
+          end
+          Process.detach(pid)
+        else
+          $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
+          output.call(fn)
+          FileUtils::mv(fn,fn+'.keep')
+        end
+      else
+        sleep 0.2
+      end
+    end
+  end
+  # Wait for a worker slot to appear. When working the pid is writing
+  # a file with extension .part(ial). After completion the file is
+  # renamed without .part and a slot is free.
+  def wait_for_worker(info)
+    (pid,count,fn) = info
+    if pid_or_file_running?(pid,fn)
+      $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
+      begin
+        Timeout.timeout(@timeout) do
+          while not File.exist?(fn)  # wait for the result to appear
+            sleep 0.2
+            return if not pid_or_file_running?(pid,fn) # worker is gone
+          end
+        end
+        # Partial file should have been renamed:
+        raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
+        $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
+      rescue Timeout::Error
+        # Kill it to speed up exit
+        Process.kill 9, pid
+        Process.wait pid
+        $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
+        $stderr.print "Bailing out"
+        raise
+      end
+    end
+  end
+  # This is the final cleanup after the reader thread is done. All workers
+  # need to complete.
+  def wait_for_workers()
+    return if single_threaded
+    @pid_list.each do |info|
+      wait_for_worker(info)
+    end
+  end
+  def process_remaining_output()
+    return if single_threaded
+    $stderr.print "Processing remaining output...\n" if not @quiet
+    while @output_locked
+      sleep 0.2
+      process_output() # keep trying
+    end
+    @pid_list.each do |info|
+      (pid,count,fn) = info
+      while pid_or_file_running?(pid,fn) or File.exist?(fn)
+        $stderr.print "Trying: ",[info],"\n" if not @quiet
+        process_output(nil,:by_line,true)
+        sleep 0.2
+      end
+    end
+    while @output_locked
+      sleep 0.1
+      process_output(nil,:by_line,true)
+    end
+    cleanup_tmpdir()
+  end
+  def cleanup()
+    @pid_list.each do |info|
+      (pid,count,fn) = info
+      if pid_running?(pid)
+        $stderr.print "Killing child ",[info],"\n"
+        begin
+          Process.kill 9, pid
+          Process.wait pid
+        rescue Errno::ENOENT
+          $stdout.puts "INFO: #{pidfile} did not exist: Errno::ENOENT" if not @quiet
+        rescue Errno::ESRCH
+          $stdout.puts "INFO: The process #{opid} did not exist: Errno::ESRCH" if not @quiet
+        end
+      end
+      File.unlink(fn) if File.exist?(fn)
+      cleanup_keep_file(fn,wait: false)
+      tempfn = fn+'.'+RUNNINGEXT
+      File.unlink(tempfn) if File.exist?(tempfn)
+    end
+    cleanup_tmpdir()
+  end
+  private
+  def mktmpfilename(num,ext=nil)
+    @tmpdir+sprintf("/%0.6d-",num)+@name+(ext ? '.'+ext : '')
+  end
+  def pid_or_file_running?(pid,fn)
+    (pid && pid_running?(pid)) or File.exist?(fn+'.'+RUNNINGEXT)
+  end
+  def pid_running?(pid)
+    begin
+      fpid,status=Process.waitpid2(pid,Process::WNOHANG)
+    rescue Errno::ECHILD, Errno::ESRCH
+      return false
+    end
+    return true if nil == fpid && nil == status
+    return ! (status.exited? || status.signaled?)
+  end
+  def single_threaded
+    @num_threads == 1
+  end
+  def multi_threaded
+    @num_threads > 1
+  end
+  def cpu_count
+    begin
+      return File.read('/proc/cpuinfo').scan(/^processor\s*:/).size if File.exist? '/proc/cpuinfo'
+      # Actually, the JVM does not allow fork...
+      return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java
+    rescue LoadError
+      # Count on MAC
+      return Integer `sysctl -n hw.ncpu 2>/dev/null`
+    end
+    $stderr.print "Could not determine number of CPUs" if not @quiet
+    1
+  end
+  def cleanup_keep_file(fn, opts = { wait: true })
+    if not @debug
+      keep = fn+'.keep'
+      return if not opts[:wait] and !File.exist?(keep)
+      $stderr.print "Trying to remove #{keep}\n" if not @quiet
+      while true
+        if File.exist?(keep)
+          $stderr.print "Removing #{keep}\n" if not @quiet
+          File.unlink(keep)
+          break # forever loop
+        end
+        sleep 0.1
+      end #forever
+    end
+  end
+  def cleanup_tmpdir
+    if not @debug
+      $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
+      Dir.unlink(@tmpdir) if @tmpdir
+    end
+  end
+end