RubyGems - bio-vcf - Versions diffs - 0.8.2 → 0.9.0 - Mend

bio-vcf 0.8.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

checksums.yaml +4 -4
data/.travis.yml +8 -2
data/Gemfile +4 -6
data/README.md +92 -57
data/Rakefile +3 -41
data/TAGS +115 -0
data/VERSION +1 -1
data/bin/bio-vcf +58 -70
data/bio-vcf.gemspec +23 -75
data/features/cli.feature +6 -1
data/features/multisample.feature +12 -0
data/features/step_definitions/cli-feature.rb +2 -2
data/features/step_definitions/multisample.rb +19 -0
data/features/step_definitions/vcf_header.rb +1 -1
data/features/support/env.rb +0 -9
data/lib/bio-vcf/pcows.rb +210 -0
data/lib/bio-vcf/vcfheader.rb +28 -9
data/lib/bio-vcf/vcfheader_line.rb +455 -160
data/lib/bio-vcf/vcfrecord.rb +30 -15
data/ragel/gen_vcfheaderline_parser.rl +68 -25
data/ragel/generate.sh +4 -1
data/template/vcf2json.erb +16 -16
data/template/vcf2json_full_header.erb +16 -17
data/template/vcf2json_use_meta.erb +35 -35
data/test/data/input/gatk_exome.vcf +237 -0
data/test/data/input/gatk_wgs.vcf +1000 -0
data/test/data/input/test.bed +632 -0
data/test/data/regression/eval_once-stderr.new +1 -0
data/test/data/regression/eval_once.new +1 -0
data/test/data/regression/eval_once.ref +1 -0
data/test/data/regression/eval_r.info.dp-stderr.new +4 -0
data/test/data/regression/eval_r.info.dp.new +150 -0
data/test/data/regression/ifilter_s.dp-stderr.new +28 -0
data/test/data/regression/ifilter_s.dp.new +31 -0
data/test/data/regression/r.info.dp-stderr.new +4 -0
data/test/data/regression/r.info.dp.new +147 -0
data/test/data/regression/rewrite.info.sample-stderr.new +4 -0
data/test/data/regression/rewrite.info.sample.new +150 -0
data/test/data/regression/s.dp-stderr.new +12 -0
data/test/data/regression/s.dp.new +145 -0
data/test/data/regression/seval_s.dp-stderr.new +4 -0
data/test/data/regression/seval_s.dp.new +36 -0
data/test/data/regression/sfilter_seval_s.dp-stderr.new +12 -0
data/test/data/regression/sfilter_seval_s.dp.new +31 -0
data/test/data/regression/thread4-stderr.new +4 -0
data/test/data/regression/thread4.new +150 -0
data/test/data/regression/thread4_4-stderr.new +15 -0
data/test/data/regression/thread4_4.new +150 -0
data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
data/test/data/regression/thread4_4_failed_filter.new +110 -0
data/test/data/regression/vcf2json_full_header-stderr.new +4 -0
data/test/data/regression/vcf2json_full_header.new +225 -0
data/test/data/regression/vcf2json_full_header.ref +222 -258
data/test/data/regression/vcf2json_use_meta-stderr.new +4 -0
data/test/data/regression/vcf2json_use_meta.new +4697 -0
data/test/data/regression/vcf2json_use_meta.ref +4697 -0
data/test/performance/metrics.md +18 -1
data/test/tmp/test.vcf +12469 -0
metadata +38 -62
data/Gemfile.lock +0 -81
data/ragel/gen_vcfheaderline_parser.rb +0 -483

data/VERSION CHANGED

@@ -1 +1 @@
-0.8.2
+0.9.0

data/bin/bio-vcf CHANGED

@@ -4,7 +4,7 @@
 # Author:: Pjotr Prins
 # License:: MIT
 #
-# Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
+# Copyright (C) 2014-2015 Pjotr Prins <pjotr.prins@thebird.nl>
 USAGE = "Vcf parser"
@@ -15,6 +15,7 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
 version = File.new(VERSION_FILENAME).read.chomp
 require 'bio-vcf'
+require 'bio-vcf/pcows'
 require 'optparse'
 require 'timeout'
 require 'fileutils'
@@ -26,7 +27,7 @@ require 'fileutils'
 # Bio::Log::CLI.logger('stderr')
 # Bio::Log::CLI.trace('info')
-options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000 }
+options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
 opts = OptionParser.new do |o|
   o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g.  #{File.basename($0)} < test/data/input/somaticsniper.vcf"
@@ -68,6 +69,9 @@ opts = OptionParser.new do |o|
   o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
     options[:eval_once] = true
     options[:eval] = cmd
+    # options[:num_threads] = 1
+    # options[:thread_lines] = 1
+    options[:skip_header] = true
   end
   o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
     options[:seval] = cmd
@@ -112,7 +116,14 @@ opts = OptionParser.new do |o|
     options[:template] = s
     options[:skip_header] = true
   end
+  o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
+    options[:tag] = true
+  end
+  o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
+    options[:timeout] = i
+  end
   # Uncomment the following when using the bio-logger
   # o.separator ""
@@ -137,9 +148,10 @@ opts = OptionParser.new do |o|
     options[:verbose] = true
   end
-  # o.on("--debug", "Show debug messages") do |v|
-  #   Bio::Log::CLI.trace('debug')
-  # end
+  o.on("--debug", "Show debug messages") do |v|
+    # Bio::Log::CLI.trace('debug')
+    options[:debug] = true
+  end
   o.separator ""
   o.on_tail('-h', '--help', 'display this help and exit') do
@@ -150,8 +162,8 @@ end
 opts.parse!(ARGV)
 BIOVCF_VERSION=version
-BIOVCF_BANNER = "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
-$stderr.print BIOVCF_BANNER
+BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015\n"
+$stderr.print BIOVCF_BANNER if !options[:quiet]
 if options[:show_help]
   print opts
@@ -174,15 +186,6 @@ if options[:template]
   template = Bio::Template.new(fn)
 end
-if options[:num_threads] != 1
-  begin
-    require 'parallel'
-  rescue LoadError
-    $stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
-    options[:num_threads] = 1
-  end
-end
 stats = nil
 if options[:statistics]
   options[:num_threads] = nil
@@ -202,7 +205,7 @@ include BioVcf
 # Parse the header section of a VCF file (chomping STDIN)
 def parse_header line, samples, options
-  header = VcfHeader.new
+  header = VcfHeader.new(options[:debug])
   header.add(line)
   print line if not options[:skip_header]
   STDIN.each_line do | headerline |
@@ -214,7 +217,7 @@ def parse_header line, samples, options
     if not options[:skip_header]
       if headerline =~ /^#CHR/
         # The header before actual data contains the sample names, first inject the BioVcf meta information
-        print header.tag(options),"\n" if not options[:skip_header]
+        print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
         selected = header.column_names
         if samples
           newfields = selected[0..8]
@@ -234,7 +237,7 @@ def parse_header line, samples, options
   return header,line
 end
-# Parse a VCF line and return the result as a string
+# Parse a VCF line and return the (template) result as a string buffer
 def parse_line line,header,options,bedfilter,samples,template,stats=nil
   fields = VcfLine.parse(line)
   rec = VcfRecord.new(fields,header)
@@ -261,7 +264,7 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
     return if not bed
   end
-  return if filter and not rec.filter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
+  return if filter and not rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
   if sfilter
     rec.each_sample(options[:sfilter_samples]) do | sample |
@@ -320,13 +323,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
       exit 1
     end
     return results.to_s+"\n" if results
-    exit(1) if options[:eval_once]  # <--- can this be reached?
   else
     if options[:rdf]
       # Output Turtle RDF
       VcfRdf::record(options[:id],rec,options[:tags])
     elsif options[:template]
-      # Ruby ERB template
+      # Use ERB template
       begin
         template.body(binding)
       rescue Exception => e
@@ -347,13 +349,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
   end
 end
+pcows = PCOWS.new(options[:num_threads],'bio-vcf',options[:timeout])
 header = nil
 header_output_completed = false
-NUM_THREADS = options[:num_threads]
 CHUNK_SIZE = options[:thread_lines]
-CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
-chunks = []
-lines = []
+chunk_lines = []
 line_number=0
 if options[:bed]
@@ -361,6 +361,7 @@ if options[:bed]
 end
 begin
+  # Define linear parser function (going through one chunk)
   process = lambda { | lines |
     res = []
     lines.each do | line |
@@ -368,20 +369,17 @@ begin
     end
     res
   }
-  output = lambda { |collection|
-    collection.each do | result |
-      result.each { |line| print line }
-    end
-  } # end output
   # ---- Main loop
   STDIN.each_line do | line |
     line_number += 1
-    # ---- In this section header information is handled
     # ---- Skip embedded headers down the line...
     next if header_output_completed and line =~ /^#/
+    # ---- In the following section header information is handled -
+    #      this only happens once.
     # ---- Parse the header lines (chomps from STDIN)
     #      and returns header info and the current line
     if line =~ /^#/
@@ -399,50 +397,40 @@ begin
       header_output_completed = true
     end
-    # ---- In this section the VCF variant lines are parsed
-    lines << line
-    if NUM_THREADS == 1
-      $stderr.print '.' if line_number % CHUNK_SIZE == 0 and not options[:quiet]
-      if lines.size > CHUNK_SIZE
-        process.call(lines).each { | l | print l }
-        lines = []
-      end
-    else
-      if lines.size > CHUNK_SIZE
-        chunks << lines
-        if chunks.size > CHUNK_NUM
-          $stderr.print '.' if not options[:quiet]
-          out = Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
-            process.call(chunk)
-          }
-          chunks = []
-          # Output is forked to a separate process too
-          fork do
-            output.call out
-            STDOUT.flush
-            STDOUT.close
-            exit 0
-          end
-        end
-        lines = []
-      end
+    if options[:eval_once]
+      # this happens if we only want one line evaluated - say to get
+      # the number of samples
+      print parse_line(line,header,options,bedfilter,samples,template,stats)
+      exit 0
+    end
+    # ---- Lines are collected in one buffer and the lines buffer
+    #      is added to the chunks list (for the threads)
+    chunk_lines << line
+    # ---- In the following section the VCF lines are parsed by chunks
+    #      The chunks may go into different threads
+    if chunk_lines.size > CHUNK_SIZE
+      # ---- process one chunk
+      $stderr.print '.' if not options[:quiet]
+      pcows.wait_for_worker_slot()
+      pcows.submit_worker(process,chunk_lines)
+      pcows.process_output()
+      chunk_lines = []
     end
   end
-  $stderr.print '.' if not options[:quiet]
-  if NUM_THREADS == 1
-    process.call(lines).each { |l| print l}
-  else
-    chunks << lines
-    output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
-      process.call(chunk)
-    }
-  end
+  pcows.submit_worker(process,chunk_lines)
+  pcows.wait_for_workers()
+  pcows.process_remaining_output()
   print template.footer(binding) if template
   stats.print if stats
 rescue Exception => e
   # $stderr.print line
-  $stderr.print e.message,"\n"
+  $stderr.print e.message,"\n" if e.message != 'exit'
   raise if options[:verbose]
   exit 1
 end

data/bio-vcf.gemspec CHANGED

@@ -1,15 +1,13 @@
-# Generated by jeweler
-# DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# No longer generated by jeweler
 # -*- encoding: utf-8 -*-
 Gem::Specification.new do |s|
   s.name = "bio-vcf"
-  s.version = "0.8.2"
+  s.version = "0.9.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Pjotr Prins"]
-  s.date = "2014-12-28"
+  # s.date = "2015-12-28"
   s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
   s.email = "pjotr.public01@thebird.nl"
   s.executables = ["bio-vcf"]
@@ -20,66 +18,19 @@ Gem::Specification.new do |s|
   s.files = [
     ".travis.yml",
     "Gemfile",
-    "Gemfile.lock",
     "LICENSE.txt",
     "README.md",
     "Rakefile",
     "VERSION",
     "bin/bio-vcf",
     "bio-vcf.gemspec",
-    "features/cli.feature",
-    "features/diff_count.feature",
-    "features/multisample.feature",
-    "features/sfilter.feature",
-    "features/somaticsniper.feature",
-    "features/step_definitions/bio-vcf_steps.rb",
-    "features/step_definitions/cli-feature.rb",
-    "features/step_definitions/diff_count.rb",
-    "features/step_definitions/multisample.rb",
-    "features/step_definitions/sfilter.rb",
-    "features/step_definitions/somaticsniper.rb",
-    "features/step_definitions/vcf_header.rb",
-    "features/support/env.rb",
-    "features/vcf_header.feature",
-    "lib/bio-vcf.rb",
-    "lib/bio-vcf/bedfilter.rb",
-    "lib/bio-vcf/template.rb",
-    "lib/bio-vcf/utils.rb",
-    "lib/bio-vcf/variant.rb",
-    "lib/bio-vcf/vcf.rb",
-    "lib/bio-vcf/vcfgenotypefield.rb",
-    "lib/bio-vcf/vcfheader.rb",
-    "lib/bio-vcf/vcfheader_line.rb",
-    "lib/bio-vcf/vcfline.rb",
-    "lib/bio-vcf/vcfrdf.rb",
-    "lib/bio-vcf/vcfrecord.rb",
-    "lib/bio-vcf/vcfsample.rb",
-    "lib/bio-vcf/vcfstatistics.rb",
-    "ragel/gen_vcfheaderline_parser.rb",
     "ragel/gen_vcfheaderline_parser.rl",
     "ragel/generate.sh",
-    "template/gatk_vcf2rdf.erb",
-    "template/vcf2json.erb",
-    "template/vcf2json_full_header.erb",
-    "template/vcf2json_use_meta.erb",
-    "template/vcf2rdf.erb",
-    "template/vcf2rdf_header.erb",
-    "test/data/input/dbsnp.vcf",
-    "test/data/input/multisample.vcf",
-    "test/data/input/somaticsniper.vcf",
-    "test/data/regression/eval_r.info.dp.ref",
-    "test/data/regression/ifilter_s.dp.ref",
-    "test/data/regression/r.info.dp.ref",
-    "test/data/regression/rewrite.info.sample.ref",
-    "test/data/regression/s.dp.ref",
-    "test/data/regression/seval_s.dp.ref",
-    "test/data/regression/sfilter_seval_s.dp.ref",
-    "test/data/regression/thread4.ref",
-    "test/data/regression/thread4_4.ref",
-    "test/data/regression/thread4_4_failed_filter-stderr.ref",
-    "test/data/regression/vcf2json_full_header.ref",
-    "test/performance/metrics.md"
   ]
+  s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
+  s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
+             Dir['template/**/*']
   s.homepage = "http://github.com/pjotrp/bioruby-vcf"
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
@@ -87,25 +38,22 @@ Gem::Specification.new do |s|
   s.rubygems_version = "2.0.3"
   s.summary = "Fast multi-threaded VCF parser"
-  if s.respond_to? :specification_version then
-    s.specification_version = 4
+  # if s.respond_to? :specification_version then
+  #   s.specification_version = 4
-    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_development_dependency(%q<rspec>, [">= 2.14.0"])
-      s.add_development_dependency(%q<cucumber>, [">= 1.3.11"])
-      s.add_development_dependency(%q<jeweler>, [">= 2.0.1"])
-      s.add_development_dependency(%q<regressiontest>, [">= 0.0.3"])
-    else
-      s.add_dependency(%q<rspec>, [">= 2.14.0"])
-      s.add_dependency(%q<cucumber>, [">= 1.3.11"])
-      s.add_dependency(%q<jeweler>, [">= 2.0.1"])
-      s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
-    end
-  else
-    s.add_dependency(%q<rspec>, [">= 2.14.0"])
-    s.add_dependency(%q<cucumber>, [">= 1.3.11"])
-    s.add_dependency(%q<jeweler>, [">= 2.0.1"])
-    s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
-  end
+  #   if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+  #     s.add_development_dependency(%q<rspec>, [">= 2.14.0"])
+  #     s.add_development_dependency(%q<cucumber>, [">= 1.3.11"])
+  #     s.add_development_dependency(%q<regressiontest>, [">= 0.0.3"])
+  #   else
+  #     s.add_dependency(%q<rspec>, [">= 2.14.0"])
+  #     s.add_dependency(%q<cucumber>, [">= 1.3.11"])
+  #     s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
+  #   end
+  # else
+  #   s.add_dependency(%q<rspec>, [">= 2.14.0"])
+  #   s.add_dependency(%q<cucumber>, [">= 1.3.11"])
+  #   s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
+  # end
 end

data/features/cli.feature CHANGED

@@ -48,6 +48,11 @@ Feature: Command-line interface (CLI)
     When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
     Then I expect the named output to match the named output "rewrite.info.sample"
+  Scenario: Test eval-once
+    Given I have input file(s) named "test/data/input/multisample.vcf"
+    When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
+    Then I expect the named output to match the named output "eval_once"
   Scenario: Test JSON output with header meta data
     Given I have input file(s) named "test/data/input/multisample.vcf"
     When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
@@ -60,7 +65,7 @@ Feature: Command-line interface (CLI)
   Scenario: Test deadlock on failed filter with threads
     Given I have input file(s) named "test/data/input/multisample.vcf"
-    When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
+    When I execute "./bin/bio-vcf -q --timeout 2 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
     Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds

data/features/multisample.feature CHANGED

@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
     And I expect rec.info.ac to be 5
     And I expect rec.info.af to be 0.357
     And I expect rec.info.dp to be 1537
+    And I expect rec.info['dp'] to be 1537
     And I expect rec.info.readposranksum to be 0.815
+    And I expect rec.info['ReadPosRankSum'] to be 0.815
+    And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
     And I expect rec.sample['Original'].ad to be [189,25]
     And I expect rec.sample['Original'].gt to be "0/1"
     And I expect rec.sample['s3t2'].ad to be [167,26]
@@ -76,3 +79,12 @@ Feature: Multi-sample VCF
     And I expect r.original.gts to be ["C","G"]
     And I expect r.original.gts[0] to be "C"
     And I expect r.original.gts[1] to be "G"
+    # INFO fields with matching tails
+    Given multisample vcf line
+    """
+1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL  0|1 ./. 1/1:2,2:4:6:66,6,0  1/1:4,1:5:3:36,3,0  ./. ./.  0/0:6,0:6:3:0,3,33
+    """
+    When I parse the record
+    Then I expect r.info.end to be 111
+    And I expect r.info.ciend to be 999

data/features/step_definitions/cli-feature.rb CHANGED

@@ -8,9 +8,9 @@ When /^I execute "(.*?)"$/ do |arg1|
 end
 Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
-  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_true
+  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
 end
 Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
-  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
+  RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
 end