RubyGems - bio-vcf - Versions diffs - 0.9.0 → 0.9.2 - Mend

bio-vcf 0.9.0 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +4 -4
data/.travis.yml +2 -3
data/Gemfile.lock +44 -0
data/README.md +151 -28
data/VERSION +1 -1
data/bin/bio-vcf +47 -15
data/bio-vcf.gemspec +4 -21
data/features/#cli.feature# +71 -0
data/features/cli.feature +3 -3
data/features/filter.feature +12 -0
data/features/filter.feature~ +35 -0
data/features/somaticsniper.feature +2 -0
data/features/step_definitions/cli-feature.rb +5 -0
data/features/step_definitions/somaticsniper.rb +8 -0
data/lib/bio-vcf/pcows.rb +123 -36
data/lib/bio-vcf/vcfgenotypefield.rb +1 -1
data/lib/bio-vcf/vcfrecord.rb +21 -0
data/lib/bio-vcf/vcfsample.rb +13 -0
data/test/data/regression/eval_once-stderr.new +2 -1
data/test/data/regression/eval_r.info.dp-stderr.new +8 -4
data/test/data/regression/ifilter_s.dp-stderr.new +8 -4
data/test/data/regression/pass1-stderr.new +8 -0
data/test/data/regression/pass1.new +88 -0
data/test/data/regression/pass1.ref +88 -0
data/test/data/regression/r.info.dp-stderr.new +8 -4
data/test/data/regression/rewrite.info.sample-stderr.new +8 -4
data/test/data/regression/s.dp-stderr.new +8 -4
data/test/data/regression/seval_s.dp-stderr.new +8 -4
data/test/data/regression/sfilter_seval_s.dp-stderr.new +8 -4
data/test/data/regression/thread4-stderr.new +8 -4
data/test/data/regression/thread4_4-stderr.new +44 -15
data/test/data/regression/vcf2json_full_header-stderr.new +8 -4
data/test/data/regression/vcf2json_use_meta-stderr.new +8 -4
data/test/stress/stress_test.sh +15 -0
data/test/stress/stress_test.sh~ +8 -0
metadata +14 -5

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1364a1b5bd401632c6c2ae2497a358c206d7a9e6
-  data.tar.gz: 2a8cad615012dcd175cc3bb614888b33db16483b
+  metadata.gz: a09729e3548751923f4b3c5ef81c8c9d7402b6b2
+  data.tar.gz: 4c525ad745c5486075e9a0f14fe5372a21c8f056
 SHA512:
-  metadata.gz: d8420c9926835ff632cd8dd93c057c08f37dc557c86d76fd1766a044dcc5a8b7c573553c2ef6efe8e348d868666c7fa6d62b34779e214b090cf3287e087e61c8
-  data.tar.gz: b666a3d97a63ac18ef7ff88e002b88dee8a81e703cadaacaffc968fe7f634591bcf5df40cd930b9493e293530047d8cc6a31df446ebf1e5cd7bfb2f6164e3e0a
+  metadata.gz: 343083ee8c055f534a840c8f668cb35a0c33fbccabe2b580edf859747aff8c8069266168ac66631bc3bbd2c8f58691847796eb00cef7784c7ebf966ec85e1d4f
+  data.tar.gz: f55292d0d744a496a5b39123285904a120f4c4ffae066dc33f244e09ae021618e94071cf8e587f56debfaf0c54233c3a5688887e21b21f5640bc8b271a3a00bb

data/.travis.yml CHANGED

@@ -3,9 +3,8 @@ sudo: false  # required for the new containers
 language: ruby
 rvm:
 #  - 1.9.3 <- No longer working
-# - 2.0.0
-# - 2.1.0
-  - 2.2.2
+  - 2.1.0
+  - 2.2.3
 # install:
 #   - gem install cucumber rspec regressiontest

data/Gemfile.lock ADDED

@@ -0,0 +1,44 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    builder (3.2.2)
+    cucumber (2.1.0)
+      builder (>= 2.1.2)
+      cucumber-core (~> 1.3.0)
+      diff-lcs (>= 1.1.3)
+      gherkin3 (~> 3.1.0)
+      multi_json (>= 1.7.5, < 2.0)
+      multi_test (>= 0.1.2)
+    cucumber-core (1.3.0)
+      gherkin3 (~> 3.1.0)
+    diff-lcs (1.2.5)
+    gherkin3 (3.1.1)
+    multi_json (1.11.2)
+    multi_test (0.1.2)
+    rake (10.4.2)
+    regressiontest (0.0.3)
+    rspec (3.3.0)
+      rspec-core (~> 3.3.0)
+      rspec-expectations (~> 3.3.0)
+      rspec-mocks (~> 3.3.0)
+    rspec-core (3.3.2)
+      rspec-support (~> 3.3.0)
+    rspec-expectations (3.3.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.3.0)
+    rspec-mocks (3.3.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.3.0)
+    rspec-support (3.3.0)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  cucumber
+  rake
+  regressiontest (>= 0.0.3)
+  rspec
+BUNDLED WITH
+   1.10.6

data/README.md CHANGED

@@ -4,6 +4,9 @@
 ## Updates
+* Getting ready for a 1.0 release
+* 0.9.1 removed a rare threading bug and cleanup on error
+* Added support for soft filters (request by Brad Chapman)
 * The outputter now writes (properly) in parallel with the parser
 * bio-vcf turns any VCF into JSON with header information, and
   allows you to pipe that JSON directly into any JSON supporting
@@ -23,18 +26,19 @@ So, why would you use bio-vcf over other parsers? Because
 3. Bio-vcf has great multi-sample support
 4. Bio-vcf has multiple global filters and sample filters
 5. Bio-vcf can access any VCF format
-6. Bio-vcf can parse and query the VCF header (META)
+6. Bio-vcf can parse and query the VCF header (META data)
 7. Bio-vcf can do calculations on fields
 8. Bio-vcf allows for genotype processing
 9. Bio-vcf has support for set analysis
 10. Bio-vcf has sane error handling
 11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
+12. Bio-vcf has soft filters
-Bio-vcf has better performance than other tools
-because of lazy parsing, multi-threading, and useful combinations of
-(fancy) command line filtering (who says Ruby is slow?). Adding
-cores, bio-vcf just does better. The more complicated the filters,
-the larger the gain. First the base line test to show IO performance
+Bio-vcf has better performance than other tools because of lazy
+parsing, multi-threading, and useful combinations of (fancy) command
+line filtering (who says Ruby is slow?). Adding cores, bio-vcf just
+does better. The more complicated the filters, the larger the
+gain. First a base line test to show IO performance
 ```sh
   time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
@@ -44,7 +48,7 @@ the larger the gain. First the base line test to show IO performance
   sys     0m2.972s
 ```
-Next run the 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
+Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
 ```sh
   time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
@@ -62,10 +66,11 @@ user    12m53.273s
 sys     0m9.913s
 ```
-This means that on this machine bio-vcf is 24x faster than SnpSift even for a simple filter.
-In fact, bio-vcf is perfect for complex filters and parsing large data files on powerful machines. Parsing a 650 Mb GATK
-Illumina Hiseq VCF file and evaluating the results into a BED format on
-a 16 core machine takes
+This means that on this machine bio-vcf is 24x faster than SnpSift
+even for a simple filter.  In fact, bio-vcf is perfect for complex
+filters and parsing large data files on powerful machines. Parsing a
+650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
+BED format on a 16 core machine takes
 ```sh
   time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
@@ -77,8 +82,11 @@ a 16 core machine takes
 which shows decent core utilisation (10x). Running
 gzip compressed VCF files of 30+ Gb has similar performance gains.
-Use zcat to
-pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
+To view some complex filters on an 80Gb SNP file check out a
+[GTEx exercise](https://github.com/pjotrp/bioruby-vcf/blob/master/doc/GTEx_reduce.md).
+Use zcat (or even better pigz which is multi-core itself) to pipe such
+gzipped (vcf.gz) files into bio-vcf, e.g.
 ```sh
   zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
@@ -86,11 +94,12 @@ pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
     --eval '[r.chrom,r.pos,r.pos+1]' > test.bed
 ```
-bio-vcf comes with a sensible parser definition language (interestingly it is 100%
-Ruby), an embedded Ragel parser for INFO and FORMAT header definitions, as well as primitives for set analysis. Few
+bio-vcf comes with a sensible parser definition language
+(interestingly it is 100% Ruby), an embedded Ragel parser for INFO and
+FORMAT header definitions, as well as primitives for set analysis. Few
 assumptions are made about the actual contents of the VCF file (field
-names are resolved on the fly), so bio-vcf should work with
-all VCF files.
+names are resolved on the fly), so bio-vcf should work with all VCF
+files.
 To fetch all entries where all samples have depth larger than 20 and
 filter set to PASS use a sample filter
@@ -168,7 +177,8 @@ bio-vcf -i --seval 's.ad[1]'
 1       10303   25      31      28      32      17      23      22
 ```
-To calculate alt frequencies from s.ad which is sample (alt dp)/(ref dp + alt dp)
+To calculate percentage non-reference (PNR) alt frequencies from s.ad
+which is sample (alt dp)/(ref dp + alt dp)
 ```ruby
 bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
@@ -262,12 +272,6 @@ gem install bio-vcf
 bio-vcf -h
 ```
-For multi-core also install the parallel gem
-```sh
-gem install parallel
-```
 ## Command line interface (CLI)
 Get the version of the VCF file
@@ -397,6 +401,23 @@ or for all
   bio-vcf --filter "rec.missing_samples?" < file.vcf
 ```
+To set a soft filter, i.e. the filter column is updated
+```sh
+bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
+```
+may render something like
+```
+1       46527674        4       LowQD
+1       108417572       4       LowQD
+1       155449089       4       LowQD
+1       169847826       4       LowQD
+1       203098164       3       LowQD
+2       39213209        4       LowQD
+```
 Likewise you can check for record validity
 ```sh
@@ -625,7 +646,7 @@ indexed value array
 and 'gts' as a nucleotide string array
 ```ruby
-  bio-vcf --seval 's.gts[0]'
+  bio-vcf --seval 's.gts'
     1       10665                   C       C               C       C
     1       10694                   G       G
     1       12783   G       G       G       G       G       G       G
@@ -634,6 +655,28 @@ and 'gts' as a nucleotide string array
 where gts represents the indexed genotype on [ref] + [alt].
+Converting combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
+1/1 -> 2, is useful for indexing fields that give information on, for
+example, significance. To do so use
+```ruby
+    bio-vcf --seval '!s.empty? and s.gtindex'
+    11      58949455        0       1
+    11      65481082        0       1
+    11      94180424        0       1
+    11      121036021       0       1
+```
+Now you can index other fields, e.g. GL
+```ruby
+    ./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
+    1       900057  1.0     1.0     0.994   1.0     1.0     -1      0.999   1.0     0.997   -1  0.994    0.989   -1      0.991   -1      0.972   0.992   1.0
+```
+shows a number of SNPs have been scored with high significance and a
+number are missing, here marked as -1.
 These values can also be used in filters and output allele depth, for
 example
@@ -655,6 +698,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
 1       13757   47      47      4       47      47      4       47
 ```
+## Sample counting
+Note, the use of lambda allows for sophisticated queries. You may need
+some expert advice here.
+To count valid genotype fields in samples you can do something like
+```ruby
+bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
+```
+A similar complex count would be
+```ruby
+    bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
+```
+which tests for perfect SNPs scored (for example).
+## Reorder filter with lambda
+Sometimes it pays to reorder the filter using a lambda. This is one
+example where the greedy sample counts are done only for those
+samples that match the other criteria:
+```ruby
+./bin/bio-vcf --num-threads=1 --filter '((r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false)'
+```
 ## Modify VCF files
@@ -904,7 +975,10 @@ Simple statistics are available for REF>ALT changes:
 ## Other examples
-For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
+For more exercises and examples see the
+[doc](https://github.com/pjotrp/bioruby-vcf/tree/master/doc) directory
+and the feature
+[section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
 ## API
@@ -937,13 +1011,62 @@ what the command line interface uses (see ./bin/bio-vcf)
 ## Trouble shooting
+### MRI supports threading
 Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
 in single threaded mode (for now).
+### Set TMPDIR when running out of space
 The multi-threading creates temporary files using the system TMPDIR.
 This behaviour can be overridden by setting the environment variable.
-Also, for genome-wide sequencing it may be useful to increase
---thread-lines to a value larger than 1_000_000.
+### Reorder filter on time out
+Make sure to minimize expensive calculations by moving them
+backward. An 'and' statement is evaluated from left to right. With
+```ruby
+fast_check and slow_check
+```
+slow_check only gets executed if fast_check is true.
+For more complex filters use lambda inside a conditional
+```ruby
+    ( fast_check ? lambda { slow_check }.call : false )
+```
+where slow_check is the slow section of your query, as is shown
+earlier in this document. Don't forget the .call!
+### Reduce thread lines on timeout
+Depending on your input data and the speed of the filters it may be useful to
+tweak the number of thread lines and/or to increase the timeout.
+On really fast file systems for genome-wide sequencing try increasing
+--thread-lines to a value larger than 100_000. On the other hand if
+the computations are intensive (per line) reduce the number of
+thread-lines (try 10_000 and 1_000).  If processes get killed, reducing
+thread-lines is the first thing to try.
+For larger files set the timeout to 600 or so, i.e. --timeout 600.
+Different values may show different core use on a machine.
+### Debugging
+To debug output use '-v --num-threads=1' for generating useful
+output. Also do not use the -i switch (ignore errors) when there
+are problems.
+### Tmpdir contains (old) bio-vcf directories
+Multi-threaded bio-vcf writes into a temporary directory during
+processing. When a process gets interrupted for some reason the
+temporary directory may remain.
 ## Project home page

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.9.0
1	+ 0.9.2

data/bin/bio-vcf CHANGED

@@ -27,7 +27,7 @@ require 'fileutils'
 # Bio::Log::CLI.logger('stderr')
 # Bio::Log::CLI.trace('info')
-options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
+options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
 opts = OptionParser.new do |o|
   o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g.  #{File.basename($0)} < test/data/input/somaticsniper.vcf"
@@ -58,6 +58,9 @@ opts = OptionParser.new do |o|
   o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
     options[:efilter_samples] = l
   end
+  o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
+    options[:add_filter] = name
+  end
   o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
     options[:bed] = bed
@@ -88,7 +91,7 @@ opts = OptionParser.new do |o|
     options[:rdf] = true
     options[:skip_header] = true
   end
-  o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
+  o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
     options[:num_threads] = i
   end
   o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
@@ -148,7 +151,7 @@ opts = OptionParser.new do |o|
     options[:verbose] = true
   end
-  o.on("--debug", "Show debug messages") do |v|
+  o.on("--debug", "Show debug messages and keep intermediate output") do |v|
     # Bio::Log::CLI.trace('debug')
     options[:debug] = true
   end
@@ -196,6 +199,8 @@ end
 raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
 raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
 raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
+# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
+# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
 if options[:samples]
   samples = options[:samples].map { |s| s.to_i }
@@ -218,6 +223,13 @@ def parse_header line, samples, options
       if headerline =~ /^#CHR/
         # The header before actual data contains the sample names, first inject the BioVcf meta information
         print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
+        # Then the additional filter(s)
+        # ##FILTER=<ID=LowQual,Description="Low quality">
+        add_filter = options[:add_filter]
+        if add_filter
+          print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
+        end
         selected = header.column_names
         if samples
           newfields = selected[0..8]
@@ -247,9 +259,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
   sfilter = options[:sfilter]
   efilter = options[:efilter]
   ifilter = options[:ifilter]
+  add_filter = options[:add_filter] # contains a filter name (soft filter)
   seval = options[:seval]
   ignore_missing = options[:ignore_missing]
   quiet = options[:quiet]
+  set_filter_field = nil
   if sfilter or efilter or ifilter or seval
     # check for samples
@@ -264,15 +278,28 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
     return if not bed
   end
-  return if filter and not rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
+  skip = lambda { |&m|
+    matched = m.call
+    if add_filter
+      set_filter_field = true if matched
+      false  # always continue processing with an add-filter
+    else
+      not matched
+    end
+  }
+  if filter
+    return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
+  end
-  if sfilter
+  if sfilter # sample 'or' filter
     rec.each_sample(options[:sfilter_samples]) do | sample |
-      return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
+      # return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
+      return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
     end
   end
-  if ifilter
+  if ifilter # include sample filter
     found = false
     rec.each_sample(options[:ifilter_samples]) do | sample |
       if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
@@ -281,12 +308,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
       end
     end
     # Skip if there are no matches
-    return if not found
+    return if skip.call {found}
   end
-  if efilter
+  if efilter # exclude sample filter
     rec.each_sample(options[:efilter_samples]) do | sample |
-      return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
+      return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
     end
   end
@@ -294,6 +321,9 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
   # -----------------------------
   # From here on decide on output
+  rec.add_to_filter_field(add_filter) if set_filter_field
   if samples
     # Select certain samples for output
     newfields = fields[0..8]
@@ -349,10 +379,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
   end
 end
-pcows = PCOWS.new(options[:num_threads],'bio-vcf',options[:timeout])
+CHUNK_SIZE = options[:thread_lines]
+pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
+                  options[:quiet],options[:debug])
 header = nil
 header_output_completed = false
-CHUNK_SIZE = options[:thread_lines]
 chunk_lines = []
 line_number=0
@@ -411,7 +443,7 @@ begin
     # ---- In the following section the VCF lines are parsed by chunks
     #      The chunks may go into different threads
-    if chunk_lines.size > CHUNK_SIZE
+    if chunk_lines.size >= CHUNK_SIZE
       # ---- process one chunk
       $stderr.print '.' if not options[:quiet]
       pcows.wait_for_worker_slot()
@@ -421,7 +453,7 @@ begin
       chunk_lines = []
     end
   end
-  pcows.submit_worker(process,chunk_lines)
+  pcows.submit_final_worker(process,chunk_lines)
   pcows.wait_for_workers()
   pcows.process_remaining_output()
@@ -429,8 +461,8 @@ begin
   stats.print if stats
 rescue Exception => e
-  # $stderr.print line
   $stderr.print e.message,"\n" if e.message != 'exit'
+  pcows.cleanup()
   raise if options[:verbose]
   exit 1
 end