RubyGems - bio-vcf - Versions diffs - 0.9.0 → 0.9.2 - Mend

bio-vcf 0.9.0 → 0.9.2

Files changed (36) hide show

checksums.yaml +4 -4
data/.travis.yml +2 -3
data/Gemfile.lock +44 -0
data/README.md +151 -28
data/VERSION +1 -1
data/bin/bio-vcf +47 -15
data/bio-vcf.gemspec +4 -21
data/features/#cli.feature# +71 -0
data/features/cli.feature +3 -3
data/features/filter.feature +12 -0
data/features/filter.feature~ +35 -0
data/features/somaticsniper.feature +2 -0
data/features/step_definitions/cli-feature.rb +5 -0
data/features/step_definitions/somaticsniper.rb +8 -0
data/lib/bio-vcf/pcows.rb +123 -36
data/lib/bio-vcf/vcfgenotypefield.rb +1 -1
data/lib/bio-vcf/vcfrecord.rb +21 -0
data/lib/bio-vcf/vcfsample.rb +13 -0
data/test/data/regression/eval_once-stderr.new +2 -1
data/test/data/regression/eval_r.info.dp-stderr.new +8 -4
data/test/data/regression/ifilter_s.dp-stderr.new +8 -4
data/test/data/regression/pass1-stderr.new +8 -0
data/test/data/regression/pass1.new +88 -0
data/test/data/regression/pass1.ref +88 -0
data/test/data/regression/r.info.dp-stderr.new +8 -4
data/test/data/regression/rewrite.info.sample-stderr.new +8 -4
data/test/data/regression/s.dp-stderr.new +8 -4
data/test/data/regression/seval_s.dp-stderr.new +8 -4
data/test/data/regression/sfilter_seval_s.dp-stderr.new +8 -4
data/test/data/regression/thread4-stderr.new +8 -4
data/test/data/regression/thread4_4-stderr.new +44 -15
data/test/data/regression/vcf2json_full_header-stderr.new +8 -4
data/test/data/regression/vcf2json_use_meta-stderr.new +8 -4
data/test/stress/stress_test.sh +15 -0
data/test/stress/stress_test.sh~ +8 -0
metadata +14 -5

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1364a1b5bd401632c6c2ae2497a358c206d7a9e6
-  data.tar.gz: 2a8cad615012dcd175cc3bb614888b33db16483b
+  metadata.gz: a09729e3548751923f4b3c5ef81c8c9d7402b6b2
+  data.tar.gz: 4c525ad745c5486075e9a0f14fe5372a21c8f056
 SHA512:
-  metadata.gz: d8420c9926835ff632cd8dd93c057c08f37dc557c86d76fd1766a044dcc5a8b7c573553c2ef6efe8e348d868666c7fa6d62b34779e214b090cf3287e087e61c8
-  data.tar.gz: b666a3d97a63ac18ef7ff88e002b88dee8a81e703cadaacaffc968fe7f634591bcf5df40cd930b9493e293530047d8cc6a31df446ebf1e5cd7bfb2f6164e3e0a
+  metadata.gz: 343083ee8c055f534a840c8f668cb35a0c33fbccabe2b580edf859747aff8c8069266168ac66631bc3bbd2c8f58691847796eb00cef7784c7ebf966ec85e1d4f
+  data.tar.gz: f55292d0d744a496a5b39123285904a120f4c4ffae066dc33f244e09ae021618e94071cf8e587f56debfaf0c54233c3a5688887e21b21f5640bc8b271a3a00bb

data/.travis.yml CHANGED

@@ -3,9 +3,8 @@ sudo: false  # required for the new containers
 language: ruby
 rvm:
 #  - 1.9.3 <- No longer working
-# - 2.0.0
-# - 2.1.0
-  - 2.2.2
+  - 2.1.0
+  - 2.2.3
 # install:
 #   - gem install cucumber rspec regressiontest

data/Gemfile.lock ADDED

@@ -0,0 +1,44 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    builder (3.2.2)
+    cucumber (2.1.0)
+      builder (>= 2.1.2)
+      cucumber-core (~> 1.3.0)
+      diff-lcs (>= 1.1.3)
+      gherkin3 (~> 3.1.0)
+      multi_json (>= 1.7.5, < 2.0)
+      multi_test (>= 0.1.2)
+    cucumber-core (1.3.0)
+      gherkin3 (~> 3.1.0)
+    diff-lcs (1.2.5)
+    gherkin3 (3.1.1)
+    multi_json (1.11.2)
+    multi_test (0.1.2)
+    rake (10.4.2)
+    regressiontest (0.0.3)
+    rspec (3.3.0)
+      rspec-core (~> 3.3.0)
+      rspec-expectations (~> 3.3.0)
+      rspec-mocks (~> 3.3.0)
+    rspec-core (3.3.2)
+      rspec-support (~> 3.3.0)
+    rspec-expectations (3.3.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.3.0)
+    rspec-mocks (3.3.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.3.0)
+    rspec-support (3.3.0)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  cucumber
+  rake
+  regressiontest (>= 0.0.3)
+  rspec
+BUNDLED WITH
+   1.10.6

data/README.md CHANGED

@@ -4,6 +4,9 @@
 ## Updates
+* Getting ready for a 1.0 release
+* 0.9.1 removed a rare threading bug and cleanup on error
+* Added support for soft filters (request by Brad Chapman)
 * The outputter now writes (properly) in parallel with the parser
 * bio-vcf turns any VCF into JSON with header information, and
   allows you to pipe that JSON directly into any JSON supporting
@@ -23,18 +26,19 @@ So, why would you use bio-vcf over other parsers? Because
 3. Bio-vcf has great multi-sample support
 4. Bio-vcf has multiple global filters and sample filters
 5. Bio-vcf can access any VCF format
-6. Bio-vcf can parse and query the VCF header (META)
+6. Bio-vcf can parse and query the VCF header (META data)
 7. Bio-vcf can do calculations on fields
 8. Bio-vcf allows for genotype processing
 9. Bio-vcf has support for set analysis
 10. Bio-vcf has sane error handling
 11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
+12. Bio-vcf has soft filters
-Bio-vcf has better performance than other tools
-because of lazy parsing, multi-threading, and useful combinations of
-(fancy) command line filtering (who says Ruby is slow?). Adding
-cores, bio-vcf just does better. The more complicated the filters,
-the larger the gain. First the base line test to show IO performance
+Bio-vcf has better performance than other tools because of lazy
+parsing, multi-threading, and useful combinations of (fancy) command
+line filtering (who says Ruby is slow?). Adding cores, bio-vcf just
+does better. The more complicated the filters, the larger the
+gain. First a base line test to show IO performance
 ```sh
   time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
@@ -44,7 +48,7 @@ the larger the gain. First the base line test to show IO performance
   sys     0m2.972s
 ```
-Next run the 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
+Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
 ```sh
   time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
@@ -62,10 +66,11 @@ user    12m53.273s
 sys     0m9.913s
 ```
-This means that on this machine bio-vcf is 24x faster than SnpSift even for a simple filter.
-In fact, bio-vcf is perfect for complex filters and parsing large data files on powerful machines. Parsing a 650 Mb GATK
-Illumina Hiseq VCF file and evaluating the results into a BED format on
-a 16 core machine takes
+This means that on this machine bio-vcf is 24x faster than SnpSift
+even for a simple filter.  In fact, bio-vcf is perfect for complex
+filters and parsing large data files on powerful machines. Parsing a
+650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
+BED format on a 16 core machine takes
 ```sh
   time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
@@ -77,8 +82,11 @@ a 16 core machine takes
 which shows decent core utilisation (10x). Running
 gzip compressed VCF files of 30+ Gb has similar performance gains.
-Use zcat to
-pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
+To view some complex filters on an 80Gb SNP file check out a
+[GTEx exercise](https://github.com/pjotrp/bioruby-vcf/blob/master/doc/GTEx_reduce.md).
+Use zcat (or even better pigz which is multi-core itself) to pipe such
+gzipped (vcf.gz) files into bio-vcf, e.g.
 ```sh
   zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
@@ -86,11 +94,12 @@ pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
     --eval '[r.chrom,r.pos,r.pos+1]' > test.bed
 ```
-bio-vcf comes with a sensible parser definition language (interestingly it is 100%
-Ruby), an embedded Ragel parser for INFO and FORMAT header definitions, as well as primitives for set analysis. Few
+bio-vcf comes with a sensible parser definition language
+(interestingly it is 100% Ruby), an embedded Ragel parser for INFO and
+FORMAT header definitions, as well as primitives for set analysis. Few
 assumptions are made about the actual contents of the VCF file (field
-names are resolved on the fly), so bio-vcf should work with
-all VCF files.
+names are resolved on the fly), so bio-vcf should work with all VCF
+files.
 To fetch all entries where all samples have depth larger than 20 and
 filter set to PASS use a sample filter
@@ -168,7 +177,8 @@ bio-vcf -i --seval 's.ad[1]'
 1       10303   25      31      28      32      17      23      22
 ```
-To calculate alt frequencies from s.ad which is sample (alt dp)/(ref dp + alt dp)
+To calculate percentage non-reference (PNR) alt frequencies from s.ad
+which is sample (alt dp)/(ref dp + alt dp)
 ```ruby
 bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
@@ -262,12 +272,6 @@ gem install bio-vcf
 bio-vcf -h
 ```
-For multi-core also install the parallel gem
-```sh
-gem install parallel
-```
 ## Command line interface (CLI)
 Get the version of the VCF file
@@ -397,6 +401,23 @@ or for all
   bio-vcf --filter "rec.missing_samples?" < file.vcf
 ```
+To set a soft filter, i.e. the filter column is updated
+```sh
+bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
+```
+may render something like
+```
+1       46527674        4       LowQD
+1       108417572       4       LowQD
+1       155449089       4       LowQD
+1       169847826       4       LowQD
+1       203098164       3       LowQD
+2       39213209        4       LowQD
+```
 Likewise you can check for record validity
 ```sh
@@ -625,7 +646,7 @@ indexed value array
 and 'gts' as a nucleotide string array
 ```ruby
-  bio-vcf --seval 's.gts[0]'
+  bio-vcf --seval 's.gts'
     1       10665                   C       C               C       C
     1       10694                   G       G
     1       12783   G       G       G       G       G       G       G
@@ -634,6 +655,28 @@ and 'gts' as a nucleotide string array
 where gts represents the indexed genotype on [ref] + [alt].
+Converting combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
+1/1 -> 2, is useful for indexed fields giving information on, for
+example, significance. To do so use
+```ruby
+    bio-vcf --seval '!s.empty? and s.gtindex'
+    11      58949455        0       1
+    11      65481082        0       1
+    11      94180424        0       1
+    11      121036021       0       1
+```
+Now you can index other fields, e.g. GL
+```ruby
+    ./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
+    1       900057  1.0     1.0     0.994   1.0     1.0     -1      0.999   1.0     0.997   -1  0.994    0.989   -1      0.991   -1      0.972   0.992   1.0
+```
+shows a number of SNPs have been scored with high significance and a
+number are missing, here marked as -1.
 These values can also be used in filters and output allele depth, for
 example
@@ -655,6 +698,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
 1       13757   47      47      4       47      47      4       47
 ```
+## Sample counting
+Note, the use of lambda allows for sophisticated queries. You may need
+some expert advice here.
+To count valid genotype fields in samples you can do something like
+```ruby
+bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
+```
+A similar complex count would be
+```ruby
+    bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
+```
+which tests for perfect SNPs scored (for example).
+## Reorder filter with lambda
+Sometimes it pays to reorder the filter using a lambda. This is one
+example where the greedy sample counts are done only for those
+samples that match the other criteria:
+```ruby
+./bin/bio-vcf --num-threads=1 --filter '((r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false)'
+```
 ## Modify VCF files
@@ -904,7 +975,10 @@ Simple statistics are available for REF>ALT changes:
 ## Other examples
-For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
+For more exercises and examples see
+[doc](https://github.com/pjotrp/bioruby-vcf/tree/master/doc) directory
+and the feature
+[section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
 ## API
@@ -937,13 +1011,62 @@ what the command line interface uses (see ./bin/bio-vcf)
 ## Trouble shooting
+### MRI supports threading
 Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
 in single threaded mode (for now).
+### Set TMPDIR when running out of space
 The multi-threading creates temporary files using the system TMPDIR.
 This behaviour can be overridden by setting the environment variable.
-Also, for genome-wide sequencing it may be useful to increase
---thread-lines to a value larger than 1_000_000.
+### Reorder filter on time out
+Make sure to minimize expensive calculations by moving them
+backward. An 'and' statement is evaluated from left to right. With
+```ruby
+fast_check and slow_check
+```
+slow_check only gets executed if fast_check is true.
+For more complex filters use lambda inside a conditional
+```ruby
+    ( fast_check ? lambda { slow_check }.call : false )
+```
+where slow_check is the slow section of your query, as shown
+earlier in this document. Don't forget the .call!
+### Reduce thread lines on timeout
+Depending on your input data and the speed of the filters it may be useful to
+tweak the number of thread lines and/or to increase the timeout.
+On really fast file systems for genome-wide sequencing try increasing
+--thread-lines to a value larger than 100_000. On the other hand if
+the computations are intensive (per line) reduce the number of
+thread-lines (try 10_000 and 1_000).  If processes get killed, reducing
+thread-lines is the first thing to try.
+For larger files set the timeout to 600 or so, e.g. --timeout 600.
+Different values may show different core use on a machine.
+### Debugging
+To debug output use '-v --num-threads=1' for generating useful
+output. Also do not use the -i switch (ignore errors) when there
+are problems.
+### Tmpdir contains (old) bio-vcf directories
+Multi-threaded bio-vcf writes into a temporary directory during
+processing. When a process gets interrupted for some reason the
+temporary directory may remain.
 ## Project home page

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.9.0
1	+ 0.9.2

data/bin/bio-vcf CHANGED

@@ -27,7 +27,7 @@ require 'fileutils'
 # Bio::Log::CLI.logger('stderr')
 # Bio::Log::CLI.trace('info')
-options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
+options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
 opts = OptionParser.new do |o|
   o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g.  #{File.basename($0)} < test/data/input/somaticsniper.vcf"
@@ -58,6 +58,9 @@ opts = OptionParser.new do |o|
   o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
     options[:efilter_samples] = l
   end
+  o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
+    options[:add_filter] = name
+  end
   o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
     options[:bed] = bed
@@ -88,7 +91,7 @@ opts = OptionParser.new do |o|
     options[:rdf] = true
     options[:skip_header] = true
   end
-  o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
+  o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
     options[:num_threads] = i
   end
   o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
@@ -148,7 +151,7 @@ opts = OptionParser.new do |o|
     options[:verbose] = true
   end
-  o.on("--debug", "Show debug messages") do |v|
+  o.on("--debug", "Show debug messages and keep intermediate output") do |v|
     # Bio::Log::CLI.trace('debug')
     options[:debug] = true
   end
@@ -196,6 +199,8 @@ end
 raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
 raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
 raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
+# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
+# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
 if options[:samples]
   samples = options[:samples].map { |s| s.to_i }
@@ -218,6 +223,13 @@ def parse_header line, samples, options
       if headerline =~ /^#CHR/
         # The header before actual data contains the sample names, first inject the BioVcf meta information
         print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
+        # Then the additional filter(s)
+        # ##FILTER=<ID=LowQual,Description="Low quality">
+        add_filter = options[:add_filter]
+        if add_filter
+          print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
+        end
         selected = header.column_names
         if samples
           newfields = selected[0..8]
@@ -247,9 +259,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
   sfilter = options[:sfilter]
   efilter = options[:efilter]
   ifilter = options[:ifilter]
+  add_filter = options[:add_filter] # contains a filter name (soft filter)
   seval = options[:seval]
   ignore_missing = options[:ignore_missing]
   quiet = options[:quiet]
+  set_filter_field = nil
   if sfilter or efilter or ifilter or seval
     # check for samples
@@ -264,15 +278,28 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
     return if not bed
   end
-  return if filter and not rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
+  skip = lambda { |&m|
+    matched = m.call
+    if add_filter
+      set_filter_field = true if matched
+      false  # always continue processing with an add-filter
+    else
+      not matched
+    end
+  }
+  if filter
+    return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
+  end
-  if sfilter
+  if sfilter # sample 'or' filter
     rec.each_sample(options[:sfilter_samples]) do | sample |
-      return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
+      # return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
+      return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
     end
   end
-  if ifilter
+  if ifilter # include sample filter
     found = false
     rec.each_sample(options[:ifilter_samples]) do | sample |
       if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
@@ -281,12 +308,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
       end
     end
     # Skip if there are no matches
-    return if not found
+    return if skip.call {found}
   end
-  if efilter
+  if efilter # exclude sample filter
     rec.each_sample(options[:efilter_samples]) do | sample |
-      return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
+      return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
     end
   end
@@ -294,6 +321,9 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
   # -----------------------------
   # From here on decide on output
+  rec.add_to_filter_field(add_filter) if set_filter_field
   if samples
     # Select certain samples for output
     newfields = fields[0..8]
@@ -349,10 +379,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
   end
 end
-pcows = PCOWS.new(options[:num_threads],'bio-vcf',options[:timeout])
+CHUNK_SIZE = options[:thread_lines]
+pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
+                  options[:quiet],options[:debug])
 header = nil
 header_output_completed = false
-CHUNK_SIZE = options[:thread_lines]
 chunk_lines = []
 line_number=0
@@ -411,7 +443,7 @@ begin
     # ---- In the following section the VCF lines are parsed by chunks
     #      The chunks may go into different threads
-    if chunk_lines.size > CHUNK_SIZE
+    if chunk_lines.size >= CHUNK_SIZE
       # ---- process one chunk
       $stderr.print '.' if not options[:quiet]
       pcows.wait_for_worker_slot()
@@ -421,7 +453,7 @@ begin
       chunk_lines = []
     end
   end
-  pcows.submit_worker(process,chunk_lines)
+  pcows.submit_final_worker(process,chunk_lines)
   pcows.wait_for_workers()
   pcows.process_remaining_output()
@@ -429,8 +461,8 @@ begin
   stats.print if stats
 rescue Exception => e
-  # $stderr.print line
   $stderr.print e.message,"\n" if e.message != 'exit'
+  pcows.cleanup()
   raise if options[:verbose]
   exit 1
 end