RubyGems - bio-vcf - Versions diffs - 0.8.1 → 0.9.5 - Mend

bio-vcf 0.8.1 → 0.9.5

Files changed (84) hide show

checksums.yaml +5 -5
data/.travis.yml +1 -11
data/Gemfile +2 -8
data/LICENSE.txt +1 -1
data/README.md +467 -129
data/RELEASE_NOTES.md +27 -0
data/RELEASE_NOTES.md~ +11 -0
data/Rakefile +9 -42
data/TAGS +115 -0
data/VERSION +1 -1
data/bin/bio-vcf +156 -108
data/bio-vcf.gemspec +13 -75
data/features/cli.feature +22 -4
data/features/diff_count.feature +0 -1
data/features/filter.feature +12 -0
data/features/multisample.feature +12 -0
data/features/somaticsniper.feature +2 -0
data/features/step_definitions/cli-feature.rb +15 -6
data/features/step_definitions/diff_count.rb +1 -1
data/features/step_definitions/multisample.rb +19 -0
data/features/step_definitions/somaticsniper.rb +9 -1
data/features/step_definitions/vcf_header.rb +48 -0
data/features/support/env.rb +1 -11
data/features/vcf_header.feature +35 -0
data/lib/bio-vcf.rb +1 -0
data/lib/bio-vcf/pcows.rb +303 -0
data/lib/bio-vcf/vcffile.rb +46 -0
data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
data/lib/bio-vcf/vcfheader.rb +137 -5
data/lib/bio-vcf/vcfheader_line.rb +778 -0
data/lib/bio-vcf/vcfrecord.rb +56 -18
data/lib/bio-vcf/vcfsample.rb +26 -2
data/lib/regressiontest.rb +11 -0
data/lib/regressiontest/cli_exec.rb +101 -0
data/ragel/gen_vcfheaderline_parser.rl +165 -0
data/ragel/generate.sh +8 -0
data/template/vcf2json.erb +16 -16
data/template/vcf2json_full_header.erb +22 -0
data/template/vcf2json_use_meta.erb +41 -0
data/test/data/input/empty.vcf +2 -0
data/test/data/input/gatk_exome.vcf +237 -0
data/test/data/input/gatk_wgs.vcf +1000 -0
data/test/data/input/test.bed +632 -0
data/test/data/regression/empty-stderr.new +12 -0
data/test/data/regression/empty.new +2 -0
data/test/data/regression/empty.ref +2 -0
data/test/data/regression/eval_once-stderr.new +2 -0
data/test/data/regression/eval_once.new +1 -0
data/test/data/regression/eval_once.ref +1 -0
data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
data/test/data/regression/eval_r.info.dp.new +150 -0
data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
data/test/data/regression/ifilter_s.dp.new +31 -0
data/test/data/regression/pass1-stderr.new +10 -0
data/test/data/regression/pass1.new +88 -0
data/test/data/regression/pass1.ref +88 -0
data/test/data/regression/r.info.dp-stderr.new +4 -0
data/test/data/regression/r.info.dp.new +114 -0
data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
data/test/data/regression/rewrite.info.sample.new +150 -0
data/test/data/regression/s.dp-stderr.new +18 -0
data/test/data/regression/s.dp.new +145 -0
data/test/data/regression/seval_s.dp-stderr.new +10 -0
data/test/data/regression/seval_s.dp.new +36 -0
data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
data/test/data/regression/sfilter_seval_s.dp.new +31 -0
data/test/data/regression/thread4-stderr.new +10 -0
data/test/data/regression/thread4.new +150 -0
data/test/data/regression/thread4_4-stderr.new +25 -0
data/test/data/regression/thread4_4.new +130 -0
data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
data/test/data/regression/thread4_4_failed_filter.new +110 -0
data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
data/test/data/regression/vcf2json_full_header.new +225 -0
data/test/data/regression/vcf2json_full_header.ref +225 -0
data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
data/test/data/regression/vcf2json_use_meta.new +4697 -0
data/test/data/regression/vcf2json_use_meta.ref +4697 -0
data/test/performance/metrics.md +18 -1
data/test/stress/stress_test.sh +15 -0
data/test/tmp/test.vcf +12469 -0
metadata +63 -64
data/Gemfile.lock +0 -81

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 90a933c33c683c1f0886a202fa5a9ee5ed2ad8ff
-  data.tar.gz: 1f769a89fcb3e3b44e22864ddf729ea3ac040260
+SHA256:
+  metadata.gz: 814f6cb6c8bc237fd08ab4f22f2bfea514525fc6c2fd1b9081cd314bfa3c2fd2
+  data.tar.gz: 194aa006ac5c46c157360e37e98e42dbf88d121f9c64983792e8a7cfe43d0142
 SHA512:
-  metadata.gz: 308d93ca1bcb142fa9cd4be63d929edb0ad92b7ac0da0d4f2d51f4b363de6ef0b87ac3d2688af8c36257317e203f69355fdc3348c6a330adb7d997af7ab6714d
-  data.tar.gz: d7d328a13d90b209a6068f9d3f09d56e8a00262ccf7fba9d9d67ffe4935993b2ccbb2ddc2f9a4831dd8928258a9c5d468f050d9edfbb95737616f4bfaf184bb0
+  metadata.gz: 4cad19fa108652d42aaaff95296176a21781813848a1dacae879cd44fcb324b9f38f763fc1e516ce41cb492433457c17d27e4aca0f64308758594fb68c0abadf
+  data.tar.gz: 5290a23fe85fe063b6fb8606c77cca3abd3df3af624ac9da2ac3348336bbfafa7e8b3d70d2c5c3ba57a55da75a052f1fa2c1d20212ba7b4e656ac7263f5f1af0

data/.travis.yml CHANGED

@@ -1,13 +1,3 @@
 language: ruby
-rvm:
-#  - 1.9.3 <- No longer working
-  - 2.0.0
-  - 2.1.0
-#  - jruby-head
-#  - jruby-19mode # JRuby in 1.9 mode
-#  - 1.8.7
-#  - jruby-18mode # JRuby in 1.8 mode
-#  - rbx-18mode
-# uncomment this line if your project needs to run something other than `rake`:
-# script: bundle exec rspec spec
+arch: arm64

data/Gemfile CHANGED

@@ -1,15 +1,9 @@
 source "http://rubygems.org"
-# Add dependencies required to use your gem here.
-# Example:
-#   gem "activesupport", ">= 2.3.5"
-# Add dependencies to develop your gem here.
-# Include everything needed to run rake, tests, features, etc.
 group :development do
-  # gem "minitest"
+  gem "rake"
   gem "rspec"
   gem "cucumber"
-  gem "jeweler", "~> 2.0.1" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
-  gem "regressiontest", "~> 0.0.3"
 end

data/LICENSE.txt CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2013 Pjotr Prins
+Copyright (c) 2013-2020 Pjotr Prins <pjotr.public68@thebird.nl>
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED

@@ -1,49 +1,146 @@
 # bio-vcf
-[![Build Status](https://secure.travis-ci.org/pjotrp/bioruby-vcf.png)](http://travis-ci.org/pjotrp/bioruby-vcf)
+[![Build Status](https://secure.travis-ci.org/vcflib/bio-vcf.png)](http://travis-ci.org/vcflib/bio-vcf) [![rubygem](https://img.shields.io/gem/v/bio-vcf.svg?style=flat)](http://rubygems.org/gems/bio-vcf "Install with Rubygems") [![AnacondaBadge](https://anaconda.org/bioconda/bio-vcf/badges/installer/conda.svg)](https://anaconda.org/bioconda/bio-vcf) [![DL](https://anaconda.org/bioconda/bio-vcf/badges/downloads.svg)](https://anaconda.org/bioconda/bio-vcf)
+[![DebianBadge](https://badges.debian.net/badges/debian/testing/bio-vcf/version.svg)](https://packages.debian.org/testing/bio-vcf)
-A new generation VCF parser. Bio-vcf is not only fast for genome-wide
-(WGS) data, it also comes with a really nice filtering, evaluation and
-rewrite language and it can output any type of textual data, including
-RDF and JSON. Why would you use bio-vcf over other parsers?
+Quick index:
+- [INSTALL](#Install)
+- [Command line interface (CLI)](#command-line-interface-cli)
+  + [Set analysis](#set-analysis)
+  + [Genotype processing](#genotype-processing)
+  + [Sample counting](#sample-counting)
+  + [Filter with lambda](#reorder-filter-with-lambda)
+  + [Modify VCF files](#modify-vcf-files)
+  + [RDF output](#rdf-output)
+- [Templates](#templates)
+- [Metadata](#metadata)
+- [Statistics](#statistics)
+- [API](#api)
+- [Cite](#cite)
+## Bio-vcf
+Bio-vcf provides a domain specific language (DSL) for processing the
+VCF format. Record named fields can be queried with regular
+expressions, e.g.
+```ruby
+sample.dp>20 and rec.filter !~ /LowQD/ and rec.tumor.bcount[rec.alt]>4
+```
+Bio-vcf is a new generation VCF parser, filter and converter. Bio-vcf
+is not only very fast for genome-wide (WGS) data, it also comes with a
+really nice filtering, evaluation and rewrite language and it can
+output any type of textual data, including VCF header and contents in
+RDF and JSON.
+So, why would you use bio-vcf over other parsers? Because
 1. Bio-vcf is fast and scales on multi-core computers
 2. Bio-vcf has an expressive filtering and evaluation language
 3. Bio-vcf has great multi-sample support
 4. Bio-vcf has multiple global filters and sample filters
 5. Bio-vcf can access any VCF format
-6. Bio-vcf can do calculations on fields
-7. Bio-vcf allows for genotype processing
-8. Bio-vcf has support for set analysis
-9. Bio-vcf has sane error handling
-10. Bio-vcf can output tabular data, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs using (erb) templates
+6. Bio-vcf can parse and query the VCF header (META data)
+7. Bio-vcf can do calculations on fields
+8. Bio-vcf allows for genotype processing
+9. Bio-vcf has support for set analysis
+10. Bio-vcf has sane error handling
+11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
+12. Bio-vcf has soft filters
+Some examples are documented for [reducing GTEx](doc/GTEx_reduce.md),
+[comparing GATK](doc/GATK_comparison.md), [comparing
+VCFs](doc/Compare_VCFs.md), JSON [loading Mongo
+database](doc/Using_Mongo.md), and [generating RDF](doc/Using_RDF.md).
+## Options
+In true Unix fashion files can be piped in or passed on the command
+line:
+    bio-vcf --help
+```
+bio-vcf (biogem with pcows) by Pjotr Prins 2015-2020
+Usage: bio-vcf [options] filename
+e.g.  bio-vcf < test/data/input/somaticsniper.vcf
+    -i, --ignore-missing             Ignore missing data
+        --filter cmd                 Evaluate filter on each record
+        --sfilter cmd                Evaluate filter on each sample
+        --sfilter-samples list       Filter on selected samples (e.g., 0,1
+        --ifilter, --if cmd          Include filter
+        --ifilter-samples list       Include set - implicitely defines exclude set
+        --efilter, --ef cmd          Exclude filter
+        --efilter-samples list       Exclude set - overrides exclude set
+        --add-filter name            Set/add filter field to name
+        --bed bedfile                Filter on BED elements
+    -e, --eval cmd                   Evaluate command on each record
+        --eval-once cmd              Evaluate command once (usually for header info)
+        --seval cmd                  Evaluate command on each sample
+        --rewrite eval               Rewrite INFO
+        --samples list               Output selected samples
+        --rdf                        Generate Turtle RDF (also check out --template!)
+        --num-threads [num]          Multi-core version (default ALL)
+        --thread-lines num           Fork thread on num lines (default 40000)
+        --skip-header                Do not output VCF header info
+        --set-header list            Set a special tab delimited output header (#samples expands to sample names)
+    -t, --template erb               Use ERB template for output
+        --add-header-tag             Add bio-vcf status tag to header output
+        --timeout [num]              Timeout waiting for thread to complete (default 180)
+        --names                      Output sample names
+        --statistics                 Output statistics
+    -q, --quiet                      Run quietly
+    -v, --verbose                    Run verbosely
+        --debug                      Show debug messages and keep intermediate output
+        --id name                    Identifier
+        --tags list                  Add tags
+    -h, --help                       display this help and exit
+```
+## Performance
+Bio-vcf has better performance than other tools because of lazy
+parsing, multi-threading, and useful combinations of (fancy) command
+line filtering. Adding cores, bio-vcf just
+does better. The more complicated the filters, the larger the
+gain. First a base line test to show IO performance
-Bio-vcf has better performance than other tools
-because of lazy parsing, multi-threading, and useful combinations of
-(fancy) command line filtering. For example on an 2 core machine
-bio-vcf is typically 50% faster than JVM based SnpSift. On an 8 core machine
-bio-vcf is at least 3x faster than SnpSift. Parsing a 1 Gb ESP
-VCF with 8 cores with bio-vcf takes
+```sh
+  time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
+  1987143 15897724 1003214613
+  real    0m7.823s
+  user    0m7.002s
+  sys     0m2.972s
+```
+Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
 ```sh
-  time ./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp>0.3' < ESP6500SI_V2_SSA137.vcf > test1.vcf
-  real    0m21.095s
-  user    1m41.101s
-  sys     0m7.852s
+  time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
+  real    0m32.491s
+  user    2m34.767s
+  sys     0m12.733s
 ```
-while parsing with SnpSift takes
+The same with SnpSift v4.0 takes
 ```sh
-  time cat ESP6500SI_V2_SSA137.vcf |java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > test.vcf
-  real    1m4.913s
-  user    0m58.071s
-  sys     0m7.982s
+time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > /dev/null
+real    12m36.121s
+user    12m53.273s
+sys     0m9.913s
 ```
-Bio-vcf is perfect for parsing large data files. Parsing a 650 Mb GATK
-Illumina Hiseq VCF file and evaluating the results into a BED format on
-a 16 core machine takes
+This means that on this machine bio-vcf is 24x faster than SnpSift
+even for a simple filter.  In fact, bio-vcf is perfect for complex
+filters and parsing large data files on powerful machines. Parsing a
+650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
+BED format on a 16 core machine takes
 ```sh
   time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
@@ -52,29 +149,38 @@ a 16 core machine takes
   sys     0m5.039s
 ```
-which shows decent core utilisation (10x). We are running
-gzip compressed VCF files of 30+ Gb with similar performance gains.
+which shows decent core utilisation (10x). Running
+gzip compressed VCF files of 30+ Gb has similar performance gains.
-Use zcat to
-pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
+To view some complex filters on an 80Gb SNP file check out a
+[GTEx exercise](https://github.com/vcflib/bio-vcf/blob/master/doc/GTEx_reduce.md).
+Use zcat (or even better pigz which is multi-core itself) to pipe such
+gzipped (vcf.gz) files into bio-vcf, e.g.
 ```sh
   zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
-    --sfilter '!s.empty? and s.dp>20'
+    --sfilter '!s.empty? and s.dp>20'
     --eval '[r.chrom,r.pos,r.pos+1]' > test.bed
 ```
-bio-vcf comes with a sensible parser definition language (it is 100%
-Ruby), as well as primitives for set analysis. Few
+bio-vcf comes with a sensible parser definition language, an embedded Ragel parser for INFO and
+FORMAT header definitions, as well as primitives for set analysis. Few
 assumptions are made about the actual contents of the VCF file (field
-names are resolved on the fly), so bio-vcf should practically work with
-all VCF files.
+names are resolved on the fly), so bio-vcf should work with all VCF
+files.
-To fetch all entries where all samples have depth larger than 20 use
-a sample filter
+To fetch all entries where all samples have depth larger than 20 and
+filter set to PASS use a sample filter
 ```ruby
-  bio-vcf --sfilter 'sample.dp>20' < file.vcf
+  bio-vcf --sfilter 'sample.dp>20 and rec.filter=="PASS"' < file.vcf
+```
+or with a regex
+```ruby
+  bio-vcf --sfilter 'sample.dp>20 and rec.filter !~ /LowQD/' < file.vcf
 ```
 To filter only on selected samples, e.g. numbers 0 and 3:
@@ -87,7 +193,7 @@ Where 's.dp' is the shorter name for 'sample.dp'.
 It is also possible to specify sample names, or info fields:
-For example, to filter somatic data
+For example, to filter somatic data
 ```ruby
   bio-vcf --filter 'rec.info.dp>5 and rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
@@ -140,7 +246,8 @@ bio-vcf -i --seval 's.ad[1]'
 1       10303   25      31      28      32      17      23      22
 ```
-To calculate alt frequencies from s.ad which is sample (alt dp)/(ref dp + alt dp)
+To calculate percentage non-reference (PNR) alt frequencies from s.ad
+which is sample (alt dp)/(ref dp + alt dp)
 ```ruby
 bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
@@ -214,30 +321,64 @@ The VCF format is commonly used for variant calling between NGS
 samples. The fast parser needs to carry some state, recorded for each
 file in VcfHeader, which contains the VCF file header. Individual
 lines (variant calls) first go through a raw parser returning an array
-of fields. Further (lazy) parsing is handled through VcfRecord.
+of fields. Further (lazy) parsing is handled through VcfRecord.
 At this point the filter is pretty generic with multi-sample support.
 If something is not working, check out the feature descriptions and
 the source code. It is not hard to add features. Otherwise, send a short
 example of a VCF statement you need to work on.
-## Installation
+## Install
+Requirements:
-Note that you need Ruby 2.x or later. The 2.x Ruby series also give
-a performance improvement. Bio-vcf will show the Ruby version when
-typing the command 'bio-vcf -h'.
+* ruby
-To intall bio-vcf with gem:
+To install bio-vcf with Ruby gems, install Ruby first, e.g. on Debian
+(as root)
+```sh
+apt-get install ruby
+```
+Installing ruby includes the `gem` command to install bio-vcf:
 ```sh
 gem install bio-vcf
+export PATH=/usr/local/bin:$PATH
 bio-vcf -h
 ```
-For multi-core also install the parallel gem
+displays the help
+```
+bio-vcf x.x (biogem Ruby with pcows) by Pjotr Prins 2015-2020
+Usage: bio-vcf [options] filename
+e.g.  bio-vcf < test/data/input/somaticsniper.vcf
+    -i, --ignore-missing             Ignore missing data
+        --filter cmd                 Evaluate filter on each record
+(etc.)
+```
+To install without root you may install a gem locally with
 ```sh
-gem install parallel
+gem install --install-dir ~/bio-vcf bio-vcf
+```
+and run it with something like
+```sh
+~/bio-vcf/gems/bio-vcf-0.9.4/bin/bio-vcf -h
+```
+Finally, it is possible to check out the git repository and simply
+run the tool with
+```sh
+git clone https://github.com/vcflib/bio-vcf.git
+cd bio-vcf
+ruby ./bin/bio-vcf -h
 ```
 ## Command line interface (CLI)
@@ -263,56 +404,72 @@ Get the sample names
   NORMAL,TUMOR
 ```
+Alternatively use the command line switch for --names, e.g.
+```ruby
+  bio-vcf --names < file.vcf
+  NORMAL,TUMOR
+```
+Get information from the header (META)
+```ruby
+  bio-vcf -q --skip-header --eval-once 'header.meta["GATKCommandLine"]' < gatk_exome.vcf
+```
 The 'fields' array contains unprocessed data (strings).  Print first
 five raw fields
 ```ruby
-  bio-vcf --eval 'fields[0..4]' < file.vcf
+  bio-vcf --eval 'fields[0..4]' < file.vcf
 ```
 Add a filter to display the fields on chromosome 12
 ```ruby
-  bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
+  bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
 ```
 It gets better when we start using processed data, represented by an
 object named 'rec'. Position is a value, so we can filter a range
 ```ruby
-  bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
+  bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
 ```
 The shorter name for 'rec.chrom' is 'r.chrom', so you may write
 ```ruby
-  bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
+  bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
 ```
 To ignore and continue parsing on missing data use the
 --ignore-missing (-i) and/or --quiet (-q) switches
 ```ruby
-  bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
+  bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
 ```
 Info fields are referenced by
 ```ruby
-  bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
+  bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
 ```
-With subfields defined by rec.format
+(alternatively you can use the indexed rec.info['DP'] and list INFO fields with
+rec.info.fields).
+Subfields defined by rec.format:
 ```ruby
-  bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
+  bio-vcf --filter 'rec.tumor.ss != 2' < file.vcf
 ```
 Output
 ```ruby
-  bio-vcf --filter 'rec.tumor.gq>30'
-    --eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
+  bio-vcf --filter 'rec.tumor.gq>30'
+    --eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
     < file.vcf
 ```
@@ -326,26 +483,26 @@ Show the count of the bases that were scored as somatic
 Actually, we have a convenience implementation for bcount, so this is the same
 ```ruby
-  bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
+  bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
     < file.vcf
 ```
 Filter on the somatic results that were scored at least 4 times
 ```ruby
-  bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
+  bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bcount[rec.alt]>4' < test.vcf
 ```
 Similar for base quality scores
 ```ruby
-  bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
+  bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
 ```
 Filter out on sample values
 ```ruby
-  bio-vcf --sfilter 's.dp>20' < test.vcf
+  bio-vcf --sfilter 's.dp>20' < test.vcf
 ```
 To filter missing on samples:
@@ -360,6 +517,23 @@ or for all
   bio-vcf --filter "rec.missing_samples?" < file.vcf
 ```
+To set a soft filter, i.e. the filter column is updated
+```sh
+bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
+```
+may render something like
+```
+1       46527674        4       LowQD
+1       108417572       4       LowQD
+1       155449089       4       LowQD
+1       169847826       4       LowQD
+1       203098164       3       LowQD
+2       39213209        4       LowQD
+```
 Likewise you can check for record validity
 ```sh
@@ -410,17 +584,17 @@ Even shorter r is an alias for rec
 Note: special functions are not yet implemented! Look below
 for genotype processing which has indexing in 'gti'.
-Sometime you want to use a special function in a filter. For
-example percentage variant reads can be defined as [a,c,g,t]
-with frequencies against sample read depth (dp) as
-[0,0.03,0.47,0.50]. Filtering would with a special function,
+Sometimes you want to use a special function in a filter. For
+example percentage variant reads can be defined as [a,c,g,t]
+with frequencies against sample read depth (dp) as
+[0,0.03,0.47,0.50]. Filtering would work with a special function,
 which we named freq
 ```sh
   bio-vcf --sfilter "s.freq(2)>0.30" < file.vcf
 ```
-which is equal to
+which is equal to
 ```sh
   bio-vcf --sfilter "s.freq.g>0.30" < file.vcf
@@ -440,7 +614,7 @@ ref should always be identical across samples.
 ## DbSNP
-One clinical variant DbSNP example
+One clinical variant DbSNP example
 ```sh
     bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]' < clinvar_20140303.vcf
@@ -465,16 +639,16 @@ renders
 bio-vcf allows for set analysis. With the complement filter, for
 example, samples are selected that evaluate to true, all others should
-evaluate to false. For this we create three filters, one for all
+evaluate to false. For this we create three filters, one for all
 samples that are included (the --ifilter or -if), for all samples that
 are excluded (the --efilter or -ef) and for any sample (the --sfilter
 or -sf). So i=include (OR filter), e=exclude and s=any sample (AND
-filter).
+filter).
 The equivalent of the union filter is by using the --sfilter, so
 ```sh
-  bio-vcf --sfilter 's.dp>20'
+  bio-vcf --sfilter 's.dp>20'
 ```
 Filters DP on all samples and is true if all samples match the
@@ -482,7 +656,7 @@ criterium (AND). To filter on a subset you can add a
 selector
 ```sh
-  bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
+  bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
 ```
 For set analysis there are the additional ifilter (include) and
@@ -502,7 +676,7 @@ values
 The equivalent of the complement filter is by specifying what samples
 to include, here with a regex and define filters on the included
- and excluded samples (the ones not in ifilter-samples) and the
+ and excluded samples (the ones not in ifilter-samples) and the
 ```sh
   ./bin/bio-vcf -i --sfilter 's.dp>20' --ifilter-samples 2,4 --ifilter 's.gt==r.s1t1.gt'
@@ -523,7 +697,7 @@ To print out the GT's add --seval
 To set an additional filter on the excluded samples:
 ```sh
-  bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
+  bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
 ```
 Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
@@ -536,15 +710,15 @@ In the near future it is also possible to select samples on a regex (here
 select all samples where the name starts with s3)
 ```sh
-  bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
+  bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
 ```
 ```sh
-  bio-vcf --include /s3.+/ --sfilter 'dp>20'  --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
+  bio-vcf --include /s3.+/ --sfilter 'dp>20'  --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
 --set-intersect  include=true
-  bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20'  --ifilter 'gt==s3t1.gt'
+  bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20'  --ifilter 'gt==s3t1.gt'
 --set-catesian   one in include=true, rest=false
-  bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
+  bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
 ```
 With the filter commands you can use --ignore-missing to skip errors.
@@ -567,7 +741,7 @@ results in a string value
 to access components of the genotype field we can use standard Ruby
 ```ruby
-  bio-vcf --seval 's.gt.split(/\//)[0]'
+  bio-vcf --seval 's.gt.split(/\//)[0]'
     1       10665   .     .     0     0     .     0     0
     1       10694   .     .     1     1     .     .     .
     1       12783   0     0     0     0     0     0     0
@@ -578,7 +752,7 @@ or special functions, such as 'gti' which gives the genotype as an
 indexed value array
 ```ruby
-  bio-vcf --seval 's.gti[0]'
+  bio-vcf --seval 's.gti[0]'
     1       10665                   0       0               0       0
     1       10694                   1       1
     1       12783   0       0       0       0       0       0       0
@@ -588,7 +762,7 @@ indexed value array
 and 'gts' as a nucleotide string array
 ```ruby
-  bio-vcf --seval 's.gts[0]'
+  bio-vcf --seval 's.gts'
     1       10665                   C       C               C       C
     1       10694                   G       G
     1       12783   G       G       G       G       G       G       G
@@ -597,6 +771,28 @@ and 'gts' as a nucleotide string array
 where gts represents the indexed genotype on [ref] + [alt].
+Converting combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
+1/1 -> 2, is useful for indexed fields giving information on, for
+example, significance. Use
+```ruby
+    bio-vcf --seval '!s.empty? and s.gtindex'
+    11      58949455        0       1
+    11      65481082        0       1
+    11      94180424        0       1
+    11      121036021       0       1
+```
+Now you can index other fields, e.g. GL
+```ruby
+    ./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
+    1       900057  1.0     1.0     0.994   1.0     1.0     -1      0.999   1.0     0.997   -1  0.994    0.989   -1      0.991   -1      0.972   0.992   1.0
+```
+shows a number of SNPs have been scored with high significance and a
+number are missing, here marked as -1.
 These values can also be used in filters and output allele depth, for
 example
@@ -618,6 +814,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
 1       13757   47      47      4       47      47      4       47
 ```
+## Sample counting
+Note, the use of lambda allows for sophisticated queries. You may need
+some expert advice here.
+To count valid genotype fields in samples you can do something like
+```ruby
+bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
+```
+A similar complex count would be
+```ruby
+    bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
+```
+which tests for perfect SNPs scored (for example).
+## Reorder filter with lambda
+Sometimes it pays to reorder the filter using a lambda. This is one
+example where the greedy sample counts are done only for those
+samples that match the other criteria:
+```ruby
+./bin/bio-vcf --num-threads=1 --filter '(r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false'
+```
 ## Modify VCF files
@@ -633,6 +857,17 @@ To remove/select 3 samples:
   bio-vcf --samples 0,1,3 < mytest.vcf
 ```
+You can also select samples by name (as long as they do not contain
+spaces)
+```sh
+  bio-vcf --names < mytest.vcf
+    Original        s1t1    s2t1    s3t1    s1t2    s2t2    s3t2
+  bio-vcf --samples "Original,s1t1,s3t1" < mytest.vcf
+```
 Filter on a BED file and annotate the gene name in the resulting VCF
 ```sh
@@ -679,11 +914,11 @@ Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert t
 ## Templates
-To have more output options blastxmlparser can use an [ERB
+To have more output options bio-vcf can use an [ERB
 template](http://www.stuartellis.eu/articles/erb/) for every match. This is a
 very flexible option that can output textual formats such as JSON, YAML, HTML
 and RDF. Examples are provided in
-[./templates](https://github.com/pjotrp/bioruby-vcf/templates/). A JSON
+[./templates](https://github.com/vcflib/bio-vcf/templates/). A JSON
 template could be
 ```Javascript
@@ -693,11 +928,11 @@ template could be
   "seq:ref": "<%= rec.ref %>" ,
   "seq:alt": "<%= rec.alt[0] %>" ,
   "seq:maf": <%= rec.info.maf[0] %> ,
-  "dp":      <%= rec.info.dp %> ,
+  "dp":      <%= rec.info.dp %>
 };
 ```
-To get JSON, run with something like (combining
+To get JSON, run with something like (combining
 with a filter)
 ```sh
@@ -713,7 +948,7 @@ which renders
   "seq:ref": "C" ,
   "seq:alt": "T" ,
   "seq:maf": 0.0151 ,
-  "dp":      86 ,
+  "dp":      86
 };
 ```
@@ -723,11 +958,11 @@ Likewise for RDF output:
   bio-vcf --template template/vcf2rdf.erb --filter 'r.info.sao==1' < dbsnp.vcf
 ```
-renders the ERB template
+renders the ERB template
 ```ruby
 <%
-  id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
+  id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
 %>
 :<%= id %>
   :query_id    "<%= id %>",
@@ -740,7 +975,7 @@ renders the ERB template
   db:vcf       true .
 ```
-into
+into
 ```
 :ch13_33703698_A
@@ -765,50 +1000,56 @@ can be
 ```Javascript
 =HEADER
 <% require 'json' %>
-[
-  { "HEADER": {
+{ "HEADER": {
     "options":  <%= options.to_h.to_json %>,
     "files":    <%= ARGV %>,
     "version":  "<%= BIOVCF_VERSION %>"
   },
+  "BODY":[
 =BODY
-{
-  "seq:chr": "<%= rec.chrom %>" ,
-  "seq:pos": <%= rec.pos %> ,
-  "seq:ref": "<%= rec.ref %>" ,
-  "seq:alt": "<%= rec.alt[0] %>" ,
-  "dp":      <%= rec.info.dp %> ,
-},
+    {
+      "seq:chr": "<%= rec.chrom %>" ,
+      "seq:pos": <%= rec.pos %> ,
+      "seq:ref": "<%= rec.ref %>" ,
+      "seq:alt": "<%= rec.alt[0] %>" ,
+      "dp":      <%= rec.info.dp %>
+    },
 =FOOTER
-]
+  ]
+}
+```
+with
+```sh
+  bio-vcf --template template/vcf2json.erb < dbsnp.vcf
 ```
 may generate something like
 ```Javascript
-[
-  { "HEADER": {
+{ "HEADER": {
     "options":  {"show_help":false,"source":"https://github.com/CuppenResearch/bioruby-vcf","version":"0.8.1-pre3 (Pjotr Prins)","date":"2014-11-26 12:51:36 +0000","thread_lines":40000,"template":"template/vcf2json.erb","skip_header":true},
     "files":    [],
     "version":  "0.8.1-pre3"
   },
-{
-  "seq:chr": "1" ,
-  "seq:pos": 883516 ,
-  "seq:ref": "G" ,
-  "seq:alt": "A" ,
-  "dp":       ,
-},
-{
-  "seq:chr": "1" ,
-  "seq:pos": 891344 ,
-  "seq:ref": "G" ,
-  "seq:alt": "A" ,
-  "dp":       ,
-},
-]
+  "BODY":[
+    {
+      "seq:chr": "1" ,
+      "seq:pos": 883516 ,
+      "seq:ref": "G" ,
+      "seq:alt": "A" ,
+      "dp":
+    },
+    {
+      "seq:chr": "1" ,
+      "seq:pos": 891344 ,
+      "seq:ref": "G" ,
+      "seq:alt": "A" ,
+      "dp": ,
+    },
+  ]
+}
 ```
 Note that the template is not smart enough to remove the final comma
@@ -816,6 +1057,19 @@ from the last BODY element. To make it valid JSON that needs to be
 removed. A future version may add a parameter to the BODY element or a
 global rewrite function for this purpose. YAML and RDF have no such issue.
+### Using full VCF header (meta) info
+To get and put the full information from the header, simply use
+vcf.meta.to_json.  See ./template/vcf2json_full_header.erb for an
+example. This meta information can also be used to output info fields
+and sample values on the fly! For an example, see the template at
+[./template/vcf2json_use_meta.erb](https://github.com/vcflib/bio-vcf/tree/master/template/vcf2json_use_meta.erb)
+and the generated output at
+[./test/data/regression/vcf2json_use_meta.ref](https://github.com/vcflib/bio-vcf/tree/master/test/data/regression/vcf2json_use_meta.ref).
+This way, it is possible to write templates that can convert the content of
+*any* VCF file without prior knowledge to JSON, RDF, etc.
 ## Statistics
 Simple statistics are available for REF>ALT changes:
@@ -828,7 +1082,7 @@ Simple statistics are available for REF>ALT changes:
       G>A             59      45%
       C>T             30      23%
       A>G              5       4%
-      C>G              5       4%
+      C>G              5       4%
       C>A              5       4%
       G>T              4       3%
       T>C              4       3%
@@ -848,7 +1102,10 @@ Simple statistics are available for REF>ALT changes:
 ## Other examples
-For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
+For more exercises and examples see
+[doc](https://github.com/vcflib/bio-vcf/tree/master/doc) directory
+and the feature
+[section](https://github.com/vcflib/bio-vcf/tree/master/features).
 ## API
@@ -879,27 +1136,109 @@ what the command line interface uses (see ./bin/bio-vcf)
   end
 ```
+### VCFFile
+The class ```BioVcf::VCFfile``` wraps a file and provides an ```enum``` with the
+method each, that can be used as an iterator.
+```ruby
+vcf_file = "dbsnp.vcf"
+vcf  = BioVcf::VCFfile.new(file: vcf_file, is_gz: false )
+it = vcf.each
+puts it.peek
+vcf_file = "dbsnp.vcf.gz"
+vcf  = BioVcf::VCFfile.new(file: vcf_file, is_gz: true )
+it = vcf.each
+puts it.peek
+```
 ## Trouble shooting
+### MRI supports threading
 Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
 in single threaded mode (for now).
+### Set TMPDIR when running out of space
 The multi-threading creates temporary files using the system TMPDIR.
 This behaviour can be overridden by setting the environment variable.
-Also, for genome-wide sequencing it may be useful to increase
---thread-lines to a value larger than 1_000_000.
+### Reorder filter on time out
+Make sure to minimize expensive calculations by moving them
+backward. An 'and' statement is evaluated from left to right. With
+```ruby
+fast_check and slow_check
+```
+slow_check only gets executed if fast_check is true.
+For more complex filters use lambda inside a conditional
+```ruby
+    ( fast_check ? lambda { slow_check }.call : false )
+```
+where slow_check is the slow section of your query, as is shown
+earlier in this document. Don't forget the .call!
+### Reduce thread lines on timeout
+Depending on your input data and the speed of your filters it may be useful to
+tweak the number of thread lines and/or to increase the timeout.
+On really fast file systems for genome-wide sequencing try increasing
+--thread-lines to a value larger than 100_000. On the other hand if
+the computations are intensive (per line) reduce the number of
+thread-lines (try 10_000 and 1_000).  If processes get killed that is
+the one to try.
+For larger files set the timeout to 600 or so, i.e., use --timeout 600.
+Different values may show different core use on a machine.
+### Development
+To run the tests from source
+```sh
+bundle install --path vendor/bundle
+bundle exec rake
+```
+Note: we develop in a GNU Guix environment, see the header of
+[guix.scm](guix.scm) which does not use bundler.
+### Debugging
+To debug output use '-v --num-threads=1' for generating useful
+output. Also do not use the -i switch (ignore errors) when there
+are problems.
+### Could not find rake-10.4.2 in any of the sources
+Remove Gemfile.lock before running other tools.
+### Tmpdir contains (old) bio-vcf directories
+Multi-threaded bio-vcf writes into a temporary directory during
+processing. When a process gets interrupted for some reason the
+temporary directory may remain.
 ## Project home page
 Information on the source tree, documentation, examples, issues and
 how to contribute, see
-  http://github.com/pjotrp/bioruby-vcf
+  http://github.com/vcflib/bio-vcf
 ## Cite
 If you use this software, please cite one of
 * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
 * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
@@ -909,5 +1248,4 @@ This Biogem is published at (http://biogems.info/index.html#bio-vcf)
 ## Copyright
-Copyright (c) 2014 Pjotr Prins. See LICENSE.txt for further details.
+Copyright (c) 2014-2020 Pjotr Prins. See LICENSE.txt for further details.