bio-vcf 0.9.0 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -3
- data/Gemfile.lock +44 -0
- data/README.md +151 -28
- data/VERSION +1 -1
- data/bin/bio-vcf +47 -15
- data/bio-vcf.gemspec +4 -21
- data/features/#cli.feature# +71 -0
- data/features/cli.feature +3 -3
- data/features/filter.feature +12 -0
- data/features/filter.feature~ +35 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +5 -0
- data/features/step_definitions/somaticsniper.rb +8 -0
- data/lib/bio-vcf/pcows.rb +123 -36
- data/lib/bio-vcf/vcfgenotypefield.rb +1 -1
- data/lib/bio-vcf/vcfrecord.rb +21 -0
- data/lib/bio-vcf/vcfsample.rb +13 -0
- data/test/data/regression/eval_once-stderr.new +2 -1
- data/test/data/regression/eval_r.info.dp-stderr.new +8 -4
- data/test/data/regression/ifilter_s.dp-stderr.new +8 -4
- data/test/data/regression/pass1-stderr.new +8 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +8 -4
- data/test/data/regression/rewrite.info.sample-stderr.new +8 -4
- data/test/data/regression/s.dp-stderr.new +8 -4
- data/test/data/regression/seval_s.dp-stderr.new +8 -4
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +8 -4
- data/test/data/regression/thread4-stderr.new +8 -4
- data/test/data/regression/thread4_4-stderr.new +44 -15
- data/test/data/regression/vcf2json_full_header-stderr.new +8 -4
- data/test/data/regression/vcf2json_use_meta-stderr.new +8 -4
- data/test/stress/stress_test.sh +15 -0
- data/test/stress/stress_test.sh~ +8 -0
- metadata +14 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a09729e3548751923f4b3c5ef81c8c9d7402b6b2
|
4
|
+
data.tar.gz: 4c525ad745c5486075e9a0f14fe5372a21c8f056
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 343083ee8c055f534a840c8f668cb35a0c33fbccabe2b580edf859747aff8c8069266168ac66631bc3bbd2c8f58691847796eb00cef7784c7ebf966ec85e1d4f
|
7
|
+
data.tar.gz: f55292d0d744a496a5b39123285904a120f4c4ffae066dc33f244e09ae021618e94071cf8e587f56debfaf0c54233c3a5688887e21b21f5640bc8b271a3a00bb
|
data/.travis.yml
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
builder (3.2.2)
|
5
|
+
cucumber (2.1.0)
|
6
|
+
builder (>= 2.1.2)
|
7
|
+
cucumber-core (~> 1.3.0)
|
8
|
+
diff-lcs (>= 1.1.3)
|
9
|
+
gherkin3 (~> 3.1.0)
|
10
|
+
multi_json (>= 1.7.5, < 2.0)
|
11
|
+
multi_test (>= 0.1.2)
|
12
|
+
cucumber-core (1.3.0)
|
13
|
+
gherkin3 (~> 3.1.0)
|
14
|
+
diff-lcs (1.2.5)
|
15
|
+
gherkin3 (3.1.1)
|
16
|
+
multi_json (1.11.2)
|
17
|
+
multi_test (0.1.2)
|
18
|
+
rake (10.4.2)
|
19
|
+
regressiontest (0.0.3)
|
20
|
+
rspec (3.3.0)
|
21
|
+
rspec-core (~> 3.3.0)
|
22
|
+
rspec-expectations (~> 3.3.0)
|
23
|
+
rspec-mocks (~> 3.3.0)
|
24
|
+
rspec-core (3.3.2)
|
25
|
+
rspec-support (~> 3.3.0)
|
26
|
+
rspec-expectations (3.3.1)
|
27
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
28
|
+
rspec-support (~> 3.3.0)
|
29
|
+
rspec-mocks (3.3.2)
|
30
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
31
|
+
rspec-support (~> 3.3.0)
|
32
|
+
rspec-support (3.3.0)
|
33
|
+
|
34
|
+
PLATFORMS
|
35
|
+
ruby
|
36
|
+
|
37
|
+
DEPENDENCIES
|
38
|
+
cucumber
|
39
|
+
rake
|
40
|
+
regressiontest (>= 0.0.3)
|
41
|
+
rspec
|
42
|
+
|
43
|
+
BUNDLED WITH
|
44
|
+
1.10.6
|
data/README.md
CHANGED
@@ -4,6 +4,9 @@
|
|
4
4
|
|
5
5
|
## Updates
|
6
6
|
|
7
|
+
* Getting ready for a 1.0 release
|
8
|
+
* 0.9.1 removed a rare threading bug and cleanup on error
|
9
|
+
* Added support for soft filters (request by Brad Chapman)
|
7
10
|
* The outputter now writes (properly) in parallel with the parser
|
8
11
|
* bio-vcf turns any VCF into JSON with header information, and
|
9
12
|
allows you to pipe that JSON directly into any JSON supporting
|
@@ -23,18 +26,19 @@ So, why would you use bio-vcf over other parsers? Because
|
|
23
26
|
3. Bio-vcf has great multi-sample support
|
24
27
|
4. Bio-vcf has multiple global filters and sample filters
|
25
28
|
5. Bio-vcf can access any VCF format
|
26
|
-
6. Bio-vcf can parse and query the VCF header (META)
|
29
|
+
6. Bio-vcf can parse and query the VCF header (META data)
|
27
30
|
7. Bio-vcf can do calculations on fields
|
28
31
|
8. Bio-vcf allows for genotype processing
|
29
32
|
9. Bio-vcf has support for set analysis
|
30
33
|
10. Bio-vcf has sane error handling
|
31
34
|
11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
|
35
|
+
12. Bio-vcf has soft filters
|
32
36
|
|
33
|
-
Bio-vcf has better performance than other tools
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
37
|
+
Bio-vcf has better performance than other tools because of lazy
|
38
|
+
parsing, multi-threading, and useful combinations of (fancy) command
|
39
|
+
line filtering (who says Ruby is slow?). Adding cores, bio-vcf just
|
40
|
+
does better. The more complicated the filters, the larger the
|
41
|
+
gain. First a base line test to show IO performance
|
38
42
|
|
39
43
|
```sh
|
40
44
|
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
|
@@ -44,7 +48,7 @@ the larger the gain. First the base line test to show IO performance
|
|
44
48
|
sys 0m2.972s
|
45
49
|
```
|
46
50
|
|
47
|
-
Next run
|
51
|
+
Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
|
48
52
|
|
49
53
|
```sh
|
50
54
|
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
|
@@ -62,10 +66,11 @@ user 12m53.273s
|
|
62
66
|
sys 0m9.913s
|
63
67
|
```
|
64
68
|
|
65
|
-
This means that on this machine bio-vcf is 24x faster than SnpSift
|
66
|
-
In fact, bio-vcf is perfect for complex
|
67
|
-
|
68
|
-
|
69
|
+
This means that on this machine bio-vcf is 24x faster than SnpSift
|
70
|
+
even for a simple filter. In fact, bio-vcf is perfect for complex
|
71
|
+
filters and parsing large data files on powerful machines. Parsing a
|
72
|
+
650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
|
73
|
+
BED format on a 16 core machine takes
|
69
74
|
|
70
75
|
```sh
|
71
76
|
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
@@ -77,8 +82,11 @@ a 16 core machine takes
|
|
77
82
|
which shows decent core utilisation (10x). Running
|
78
83
|
gzip compressed VCF files of 30+ Gb has similar performance gains.
|
79
84
|
|
80
|
-
|
81
|
-
|
85
|
+
To view some complex filters on an 80Gb SNP file check out a
|
86
|
+
[GTEx exercise](https://github.com/pjotrp/bioruby-vcf/blob/master/doc/GTEx_reduce.md).
|
87
|
+
|
88
|
+
Use zcat (or even better pigz which is multi-core itself) to pipe such
|
89
|
+
gzipped (vcf.gz) files into bio-vcf, e.g.
|
82
90
|
|
83
91
|
```sh
|
84
92
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
@@ -86,11 +94,12 @@ pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
|
|
86
94
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
87
95
|
```
|
88
96
|
|
89
|
-
bio-vcf comes with a sensible parser definition language
|
90
|
-
Ruby), an embedded Ragel parser for INFO and
|
97
|
+
bio-vcf comes with a sensible parser definition language
|
98
|
+
(interestingly it is 100% Ruby), an embedded Ragel parser for INFO and
|
99
|
+
FORMAT header definitions, as well as primitives for set analysis. Few
|
91
100
|
assumptions are made about the actual contents of the VCF file (field
|
92
|
-
names are resolved on the fly), so bio-vcf should work with
|
93
|
-
|
101
|
+
names are resolved on the fly), so bio-vcf should work with all VCF
|
102
|
+
files.
|
94
103
|
|
95
104
|
To fetch all entries where all samples have depth larger than 20 and
|
96
105
|
filter set to PASS use a sample filter
|
@@ -168,7 +177,8 @@ bio-vcf -i --seval 's.ad[1]'
|
|
168
177
|
1 10303 25 31 28 32 17 23 22
|
169
178
|
```
|
170
179
|
|
171
|
-
To calculate alt frequencies from s.ad
|
180
|
+
To calculate percentage non-reference (PNR) alt frequencies from s.ad
|
181
|
+
which is sample (alt dp)/(ref dp + alt dp)
|
172
182
|
|
173
183
|
```ruby
|
174
184
|
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
@@ -262,12 +272,6 @@ gem install bio-vcf
|
|
262
272
|
bio-vcf -h
|
263
273
|
```
|
264
274
|
|
265
|
-
For multi-core also install the parallel gem
|
266
|
-
|
267
|
-
```sh
|
268
|
-
gem install parallel
|
269
|
-
```
|
270
|
-
|
271
275
|
## Command line interface (CLI)
|
272
276
|
|
273
277
|
Get the version of the VCF file
|
@@ -397,6 +401,23 @@ or for all
|
|
397
401
|
bio-vcf --filter "rec.missing_samples?" < file.vcf
|
398
402
|
```
|
399
403
|
|
404
|
+
To set a soft filter, i.e. the filter column is updated
|
405
|
+
|
406
|
+
```sh
|
407
|
+
bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
|
408
|
+
```
|
409
|
+
|
410
|
+
may render something like
|
411
|
+
|
412
|
+
```
|
413
|
+
1 46527674 4 LowQD
|
414
|
+
1 108417572 4 LowQD
|
415
|
+
1 155449089 4 LowQD
|
416
|
+
1 169847826 4 LowQD
|
417
|
+
1 203098164 3 LowQD
|
418
|
+
2 39213209 4 LowQD
|
419
|
+
```
|
420
|
+
|
400
421
|
Likewise you can check for record validity
|
401
422
|
|
402
423
|
```sh
|
@@ -625,7 +646,7 @@ indexed value array
|
|
625
646
|
and 'gts' as a nucleotide string array
|
626
647
|
|
627
648
|
```ruby
|
628
|
-
bio-vcf --seval 's.gts
|
649
|
+
bio-vcf --seval 's.gts'
|
629
650
|
1 10665 C C C C
|
630
651
|
1 10694 G G
|
631
652
|
1 12783 G G G G G G G
|
@@ -634,6 +655,28 @@ and 'gts' as a nucleotide string array
|
|
634
655
|
|
635
656
|
where gts represents the indexed genotype on [ref] + [alt].
|
636
657
|
|
658
|
+
To convert combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
|
659
|
+
1/1 -> 2, is useful for indexed fields giving information on, for
|
660
|
+
example significance, use
|
661
|
+
|
662
|
+
```ruby
|
663
|
+
bio-vcf --seval '!s.empty? and s.gtindex'
|
664
|
+
11 58949455 0 1
|
665
|
+
11 65481082 0 1
|
666
|
+
11 94180424 0 1
|
667
|
+
11 121036021 0 1
|
668
|
+
```
|
669
|
+
|
670
|
+
Now you can index other fields, e.g. GL
|
671
|
+
|
672
|
+
```ruby
|
673
|
+
./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
|
674
|
+
1 900057 1.0 1.0 0.994 1.0 1.0 -1 0.999 1.0 0.997 -1 0.994 0.989 -1 0.991 -1 0.972 0.992 1.0
|
675
|
+
```
|
676
|
+
|
677
|
+
shows a number of SNPs have been scored with high significance and a
|
678
|
+
number are missing, here marked as -1.
|
679
|
+
|
637
680
|
These values can also be used in filters and output allele depth, for
|
638
681
|
example
|
639
682
|
|
@@ -655,6 +698,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
|
|
655
698
|
1 13757 47 47 4 47 47 4 47
|
656
699
|
```
|
657
700
|
|
701
|
+
## Sample counting
|
702
|
+
|
703
|
+
Note, the use of lambda allows for sophisticated queries. You may need
|
704
|
+
some expert advice here.
|
705
|
+
|
706
|
+
To count valid genotype fields in samples you can do something like
|
707
|
+
|
708
|
+
```ruby
|
709
|
+
bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
|
710
|
+
```
|
711
|
+
|
712
|
+
A similar complex count would be
|
713
|
+
|
714
|
+
```ruby
|
715
|
+
bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
|
716
|
+
```
|
717
|
+
|
718
|
+
which tests for perfect SNPs scored (for example).
|
719
|
+
|
720
|
+
## Reorder filter with lambda
|
721
|
+
|
722
|
+
Sometimes it pays to reorder the filter using a lambda. This is one
|
723
|
+
example where the greedy sample counts are done only for those
|
724
|
+
samples that match the other criteria:
|
725
|
+
|
726
|
+
```ruby
|
727
|
+
./bin/bio-vcf --num-threads=1 --filter '((r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false)'
|
728
|
+
```
|
658
729
|
|
659
730
|
## Modify VCF files
|
660
731
|
|
@@ -904,7 +975,10 @@ Simple statistics are available for REF>ALT changes:
|
|
904
975
|
|
905
976
|
## Other examples
|
906
977
|
|
907
|
-
For more examples see
|
978
|
+
For more exercises and examples see
|
979
|
+
[doc](https://github.com/pjotrp/bioruby-vcf/tree/master/doc) directory
|
980
|
+
and the feature
|
981
|
+
[section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
|
908
982
|
|
909
983
|
## API
|
910
984
|
|
@@ -937,13 +1011,62 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
937
1011
|
|
938
1012
|
## Trouble shooting
|
939
1013
|
|
1014
|
+
### MRI supports threading
|
1015
|
+
|
940
1016
|
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
941
1017
|
in single threaded mode (for now).
|
942
1018
|
|
1019
|
+
### Set TMPDIR when running out of space
|
1020
|
+
|
943
1021
|
The multi-threading creates temporary files using the system TMPDIR.
|
944
1022
|
This behaviour can be overridden by setting the environment variable.
|
945
|
-
|
946
|
-
|
1023
|
+
|
1024
|
+
### Reorder filter on time out
|
1025
|
+
|
1026
|
+
Make sure to minimize expensive calculations by moving them
|
1027
|
+
backward. An 'and' statement is evaluated from left to right. With
|
1028
|
+
|
1029
|
+
```ruby
|
1030
|
+
fast_check and slow_check
|
1031
|
+
```
|
1032
|
+
|
1033
|
+
slow_check only gets executed if fast_check is true.
|
1034
|
+
|
1035
|
+
For more complex filters use lambda inside a conditional
|
1036
|
+
|
1037
|
+
```ruby
|
1038
|
+
( fast_check ? lambda { slow_check }.call : false )
|
1039
|
+
```
|
1040
|
+
|
1041
|
+
where slow_check is the slow section of your query, as is shown
|
1042
|
+
earlier in this document. Don't forget the .call!
|
1043
|
+
|
1044
|
+
### Reduce thread lines on timeout
|
1045
|
+
|
1046
|
+
Depending on your input data and the speed of the filters it may be useful to
|
1047
|
+
tweak the number of thread lines and/or to increase the timeout.
|
1048
|
+
|
1049
|
+
On really fast file systems for genome-wide sequencing try increasing
|
1050
|
+
--thread-lines to a value larger than 100_000. On the other hand if
|
1051
|
+
the computations are intensive (per line) reduce the number of
|
1052
|
+
thread-lines (try 10_000 and 1_000). If processes get killed that is
|
1053
|
+
the one to try.
|
1054
|
+
|
1055
|
+
For larger files set the timeout to 600 or so, e.g. --timeout 600.
|
1056
|
+
|
1057
|
+
Different values may show different core use on a machine.
|
1058
|
+
|
1059
|
+
### Debugging
|
1060
|
+
|
1061
|
+
To debug output use '-v --num-threads=1' for generating useful
|
1062
|
+
output. Also do not use the -i switch (ignore errors) when there
|
1063
|
+
are problems.
|
1064
|
+
|
1065
|
+
### Tmpdir contains (old) bio-vcf directories
|
1066
|
+
|
1067
|
+
Multi-threaded bio-vcf writes into a temporary directory during
|
1068
|
+
processing. When a process gets interrupted for some reason the
|
1069
|
+
temporary directory may remain.
|
947
1070
|
|
948
1071
|
## Project home page
|
949
1072
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.9.
|
1
|
+
0.9.2
|
data/bin/bio-vcf
CHANGED
@@ -27,7 +27,7 @@ require 'fileutils'
|
|
27
27
|
# Bio::Log::CLI.logger('stderr')
|
28
28
|
# Bio::Log::CLI.trace('info')
|
29
29
|
|
30
|
-
options = { show_help: false, source: 'https://github.com/
|
30
|
+
options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
|
31
31
|
opts = OptionParser.new do |o|
|
32
32
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
33
33
|
|
@@ -58,6 +58,9 @@ opts = OptionParser.new do |o|
|
|
58
58
|
o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
|
59
59
|
options[:efilter_samples] = l
|
60
60
|
end
|
61
|
+
o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
|
62
|
+
options[:add_filter] = name
|
63
|
+
end
|
61
64
|
|
62
65
|
o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
|
63
66
|
options[:bed] = bed
|
@@ -88,7 +91,7 @@ opts = OptionParser.new do |o|
|
|
88
91
|
options[:rdf] = true
|
89
92
|
options[:skip_header] = true
|
90
93
|
end
|
91
|
-
o.on("--num-threads [num]", Integer, "Multi-core version (default
|
94
|
+
o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
|
92
95
|
options[:num_threads] = i
|
93
96
|
end
|
94
97
|
o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
|
@@ -148,7 +151,7 @@ opts = OptionParser.new do |o|
|
|
148
151
|
options[:verbose] = true
|
149
152
|
end
|
150
153
|
|
151
|
-
o.on("--debug", "Show debug messages") do |v|
|
154
|
+
o.on("--debug", "Show debug messages and keep intermediate output") do |v|
|
152
155
|
# Bio::Log::CLI.trace('debug')
|
153
156
|
options[:debug] = true
|
154
157
|
end
|
@@ -196,6 +199,8 @@ end
|
|
196
199
|
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
197
200
|
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
198
201
|
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
202
|
+
# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
|
203
|
+
# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
|
199
204
|
|
200
205
|
if options[:samples]
|
201
206
|
samples = options[:samples].map { |s| s.to_i }
|
@@ -218,6 +223,13 @@ def parse_header line, samples, options
|
|
218
223
|
if headerline =~ /^#CHR/
|
219
224
|
# The header before actual data contains the sample names, first inject the BioVcf meta information
|
220
225
|
print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
|
226
|
+
# Then the additional filter(s)
|
227
|
+
# ##FILTER=<ID=LowQual,Description="Low quality">
|
228
|
+
add_filter = options[:add_filter]
|
229
|
+
if add_filter
|
230
|
+
print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
|
231
|
+
end
|
232
|
+
|
221
233
|
selected = header.column_names
|
222
234
|
if samples
|
223
235
|
newfields = selected[0..8]
|
@@ -247,9 +259,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
247
259
|
sfilter = options[:sfilter]
|
248
260
|
efilter = options[:efilter]
|
249
261
|
ifilter = options[:ifilter]
|
262
|
+
add_filter = options[:add_filter] # contains a filter name (soft filter)
|
250
263
|
seval = options[:seval]
|
251
264
|
ignore_missing = options[:ignore_missing]
|
252
265
|
quiet = options[:quiet]
|
266
|
+
set_filter_field = nil
|
253
267
|
|
254
268
|
if sfilter or efilter or ifilter or seval
|
255
269
|
# check for samples
|
@@ -264,15 +278,28 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
264
278
|
return if not bed
|
265
279
|
end
|
266
280
|
|
267
|
-
|
281
|
+
skip = lambda { |&m|
|
282
|
+
matched = m.call
|
283
|
+
if add_filter
|
284
|
+
set_filter_field = true if matched
|
285
|
+
false # always continue processing with an add-filter
|
286
|
+
else
|
287
|
+
not matched
|
288
|
+
end
|
289
|
+
}
|
290
|
+
|
291
|
+
if filter
|
292
|
+
return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
293
|
+
end
|
268
294
|
|
269
|
-
if sfilter
|
295
|
+
if sfilter # sample 'or' filter
|
270
296
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
271
|
-
return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
297
|
+
# return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
298
|
+
return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
272
299
|
end
|
273
300
|
end
|
274
301
|
|
275
|
-
if ifilter
|
302
|
+
if ifilter # include sample filter
|
276
303
|
found = false
|
277
304
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
278
305
|
if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
@@ -281,12 +308,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
281
308
|
end
|
282
309
|
end
|
283
310
|
# Skip if there are no matches
|
284
|
-
return if
|
311
|
+
return if skip.call {found}
|
285
312
|
end
|
286
313
|
|
287
|
-
if efilter
|
314
|
+
if efilter # exclude sample filter
|
288
315
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
289
|
-
return if
|
316
|
+
return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
290
317
|
end
|
291
318
|
end
|
292
319
|
|
@@ -294,6 +321,9 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
294
321
|
|
295
322
|
# -----------------------------
|
296
323
|
# From here on decide on output
|
324
|
+
|
325
|
+
rec.add_to_filter_field(add_filter) if set_filter_field
|
326
|
+
|
297
327
|
if samples
|
298
328
|
# Select certain samples for output
|
299
329
|
newfields = fields[0..8]
|
@@ -349,10 +379,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
349
379
|
end
|
350
380
|
end
|
351
381
|
|
352
|
-
|
382
|
+
CHUNK_SIZE = options[:thread_lines]
|
383
|
+
|
384
|
+
pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
|
385
|
+
options[:quiet],options[:debug])
|
353
386
|
header = nil
|
354
387
|
header_output_completed = false
|
355
|
-
CHUNK_SIZE = options[:thread_lines]
|
356
388
|
chunk_lines = []
|
357
389
|
line_number=0
|
358
390
|
|
@@ -411,7 +443,7 @@ begin
|
|
411
443
|
# ---- In the following section the VCF lines are parsed by chunks
|
412
444
|
# The chunks may go into different threads
|
413
445
|
|
414
|
-
if chunk_lines.size
|
446
|
+
if chunk_lines.size >= CHUNK_SIZE
|
415
447
|
# ---- process one chunk
|
416
448
|
$stderr.print '.' if not options[:quiet]
|
417
449
|
pcows.wait_for_worker_slot()
|
@@ -421,7 +453,7 @@ begin
|
|
421
453
|
chunk_lines = []
|
422
454
|
end
|
423
455
|
end
|
424
|
-
pcows.
|
456
|
+
pcows.submit_final_worker(process,chunk_lines)
|
425
457
|
pcows.wait_for_workers()
|
426
458
|
pcows.process_remaining_output()
|
427
459
|
|
@@ -429,8 +461,8 @@ begin
|
|
429
461
|
stats.print if stats
|
430
462
|
|
431
463
|
rescue Exception => e
|
432
|
-
# $stderr.print line
|
433
464
|
$stderr.print e.message,"\n" if e.message != 'exit'
|
465
|
+
pcows.cleanup()
|
434
466
|
raise if options[:verbose]
|
435
467
|
exit 1
|
436
468
|
end
|