bio-vcf 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -3
- data/Gemfile.lock +44 -0
- data/README.md +151 -28
- data/VERSION +1 -1
- data/bin/bio-vcf +47 -15
- data/bio-vcf.gemspec +4 -21
- data/features/#cli.feature# +71 -0
- data/features/cli.feature +3 -3
- data/features/filter.feature +12 -0
- data/features/filter.feature~ +35 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +5 -0
- data/features/step_definitions/somaticsniper.rb +8 -0
- data/lib/bio-vcf/pcows.rb +123 -36
- data/lib/bio-vcf/vcfgenotypefield.rb +1 -1
- data/lib/bio-vcf/vcfrecord.rb +21 -0
- data/lib/bio-vcf/vcfsample.rb +13 -0
- data/test/data/regression/eval_once-stderr.new +2 -1
- data/test/data/regression/eval_r.info.dp-stderr.new +8 -4
- data/test/data/regression/ifilter_s.dp-stderr.new +8 -4
- data/test/data/regression/pass1-stderr.new +8 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +8 -4
- data/test/data/regression/rewrite.info.sample-stderr.new +8 -4
- data/test/data/regression/s.dp-stderr.new +8 -4
- data/test/data/regression/seval_s.dp-stderr.new +8 -4
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +8 -4
- data/test/data/regression/thread4-stderr.new +8 -4
- data/test/data/regression/thread4_4-stderr.new +44 -15
- data/test/data/regression/vcf2json_full_header-stderr.new +8 -4
- data/test/data/regression/vcf2json_use_meta-stderr.new +8 -4
- data/test/stress/stress_test.sh +15 -0
- data/test/stress/stress_test.sh~ +8 -0
- metadata +14 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a09729e3548751923f4b3c5ef81c8c9d7402b6b2
|
4
|
+
data.tar.gz: 4c525ad745c5486075e9a0f14fe5372a21c8f056
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 343083ee8c055f534a840c8f668cb35a0c33fbccabe2b580edf859747aff8c8069266168ac66631bc3bbd2c8f58691847796eb00cef7784c7ebf966ec85e1d4f
|
7
|
+
data.tar.gz: f55292d0d744a496a5b39123285904a120f4c4ffae066dc33f244e09ae021618e94071cf8e587f56debfaf0c54233c3a5688887e21b21f5640bc8b271a3a00bb
|
data/.travis.yml
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
builder (3.2.2)
|
5
|
+
cucumber (2.1.0)
|
6
|
+
builder (>= 2.1.2)
|
7
|
+
cucumber-core (~> 1.3.0)
|
8
|
+
diff-lcs (>= 1.1.3)
|
9
|
+
gherkin3 (~> 3.1.0)
|
10
|
+
multi_json (>= 1.7.5, < 2.0)
|
11
|
+
multi_test (>= 0.1.2)
|
12
|
+
cucumber-core (1.3.0)
|
13
|
+
gherkin3 (~> 3.1.0)
|
14
|
+
diff-lcs (1.2.5)
|
15
|
+
gherkin3 (3.1.1)
|
16
|
+
multi_json (1.11.2)
|
17
|
+
multi_test (0.1.2)
|
18
|
+
rake (10.4.2)
|
19
|
+
regressiontest (0.0.3)
|
20
|
+
rspec (3.3.0)
|
21
|
+
rspec-core (~> 3.3.0)
|
22
|
+
rspec-expectations (~> 3.3.0)
|
23
|
+
rspec-mocks (~> 3.3.0)
|
24
|
+
rspec-core (3.3.2)
|
25
|
+
rspec-support (~> 3.3.0)
|
26
|
+
rspec-expectations (3.3.1)
|
27
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
28
|
+
rspec-support (~> 3.3.0)
|
29
|
+
rspec-mocks (3.3.2)
|
30
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
31
|
+
rspec-support (~> 3.3.0)
|
32
|
+
rspec-support (3.3.0)
|
33
|
+
|
34
|
+
PLATFORMS
|
35
|
+
ruby
|
36
|
+
|
37
|
+
DEPENDENCIES
|
38
|
+
cucumber
|
39
|
+
rake
|
40
|
+
regressiontest (>= 0.0.3)
|
41
|
+
rspec
|
42
|
+
|
43
|
+
BUNDLED WITH
|
44
|
+
1.10.6
|
data/README.md
CHANGED
@@ -4,6 +4,9 @@
|
|
4
4
|
|
5
5
|
## Updates
|
6
6
|
|
7
|
+
* Getting ready for a 1.0 release
|
8
|
+
* 0.9.1 removed a rare threading bug and cleanup on error
|
9
|
+
* Added support for soft filters (request by Brad Chapman)
|
7
10
|
* The outputter now writes (properly) in parallel with the parser
|
8
11
|
* bio-vcf turns any VCF into JSON with header information, and
|
9
12
|
allows you to pipe that JSON directly into any JSON supporting
|
@@ -23,18 +26,19 @@ So, why would you use bio-vcf over other parsers? Because
|
|
23
26
|
3. Bio-vcf has great multi-sample support
|
24
27
|
4. Bio-vcf has multiple global filters and sample filters
|
25
28
|
5. Bio-vcf can access any VCF format
|
26
|
-
6. Bio-vcf can parse and query the VCF header (META)
|
29
|
+
6. Bio-vcf can parse and query the VCF header (META data)
|
27
30
|
7. Bio-vcf can do calculations on fields
|
28
31
|
8. Bio-vcf allows for genotype processing
|
29
32
|
9. Bio-vcf has support for set analysis
|
30
33
|
10. Bio-vcf has sane error handling
|
31
34
|
11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
|
35
|
+
12. Bio-vcf has soft filters
|
32
36
|
|
33
|
-
Bio-vcf has better performance than other tools
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
37
|
+
Bio-vcf has better performance than other tools because of lazy
|
38
|
+
parsing, multi-threading, and useful combinations of (fancy) command
|
39
|
+
line filtering (who says Ruby is slow?). Adding cores, bio-vcf just
|
40
|
+
does better. The more complicated the filters, the larger the
|
41
|
+
gain. First a base line test to show IO performance
|
38
42
|
|
39
43
|
```sh
|
40
44
|
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
|
@@ -44,7 +48,7 @@ the larger the gain. First the base line test to show IO performance
|
|
44
48
|
sys 0m2.972s
|
45
49
|
```
|
46
50
|
|
47
|
-
Next run
|
51
|
+
Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
|
48
52
|
|
49
53
|
```sh
|
50
54
|
time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
|
@@ -62,10 +66,11 @@ user 12m53.273s
|
|
62
66
|
sys 0m9.913s
|
63
67
|
```
|
64
68
|
|
65
|
-
This means that on this machine bio-vcf is 24x faster than SnpSift
|
66
|
-
In fact, bio-vcf is perfect for complex
|
67
|
-
|
68
|
-
|
69
|
+
This means that on this machine bio-vcf is 24x faster than SnpSift
|
70
|
+
even for a simple filter. In fact, bio-vcf is perfect for complex
|
71
|
+
filters and parsing large data files on powerful machines. Parsing a
|
72
|
+
650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
|
73
|
+
BED format on a 16 core machine takes
|
69
74
|
|
70
75
|
```sh
|
71
76
|
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
@@ -77,8 +82,11 @@ a 16 core machine takes
|
|
77
82
|
which shows decent core utilisation (10x). Running
|
78
83
|
gzip compressed VCF files of 30+ Gb has similar performance gains.
|
79
84
|
|
80
|
-
|
81
|
-
|
85
|
+
To view some complex filters on an 80Gb SNP file check out a
|
86
|
+
[GTEx exercise](https://github.com/pjotrp/bioruby-vcf/blob/master/doc/GTEx_reduce.md).
|
87
|
+
|
88
|
+
Use zcat (or even better pigz which is multi-core itself) to pipe such
|
89
|
+
gzipped (vcf.gz) files into bio-vcf, e.g.
|
82
90
|
|
83
91
|
```sh
|
84
92
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
@@ -86,11 +94,12 @@ pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
|
|
86
94
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
87
95
|
```
|
88
96
|
|
89
|
-
bio-vcf comes with a sensible parser definition language
|
90
|
-
Ruby), an embedded Ragel parser for INFO and
|
97
|
+
bio-vcf comes with a sensible parser definition language
|
98
|
+
(interestingly it is 100% Ruby), an embedded Ragel parser for INFO and
|
99
|
+
FORMAT header definitions, as well as primitives for set analysis. Few
|
91
100
|
assumptions are made about the actual contents of the VCF file (field
|
92
|
-
names are resolved on the fly), so bio-vcf should work with
|
93
|
-
|
101
|
+
names are resolved on the fly), so bio-vcf should work with all VCF
|
102
|
+
files.
|
94
103
|
|
95
104
|
To fetch all entries where all samples have depth larger than 20 and
|
96
105
|
filter set to PASS use a sample filter
|
@@ -168,7 +177,8 @@ bio-vcf -i --seval 's.ad[1]'
|
|
168
177
|
1 10303 25 31 28 32 17 23 22
|
169
178
|
```
|
170
179
|
|
171
|
-
To calculate alt frequencies from s.ad
|
180
|
+
To calculate percentage non-reference (PNR) alt frequencies from s.ad
|
181
|
+
which is sample (alt dp)/(ref dp + alt dp)
|
172
182
|
|
173
183
|
```ruby
|
174
184
|
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
@@ -262,12 +272,6 @@ gem install bio-vcf
|
|
262
272
|
bio-vcf -h
|
263
273
|
```
|
264
274
|
|
265
|
-
For multi-core also install the parallel gem
|
266
|
-
|
267
|
-
```sh
|
268
|
-
gem install parallel
|
269
|
-
```
|
270
|
-
|
271
275
|
## Command line interface (CLI)
|
272
276
|
|
273
277
|
Get the version of the VCF file
|
@@ -397,6 +401,23 @@ or for all
|
|
397
401
|
bio-vcf --filter "rec.missing_samples?" < file.vcf
|
398
402
|
```
|
399
403
|
|
404
|
+
To set a soft filter, i.e. the filter column is updated
|
405
|
+
|
406
|
+
```sh
|
407
|
+
bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
|
408
|
+
```
|
409
|
+
|
410
|
+
may render something like
|
411
|
+
|
412
|
+
```
|
413
|
+
1 46527674 4 LowQD
|
414
|
+
1 108417572 4 LowQD
|
415
|
+
1 155449089 4 LowQD
|
416
|
+
1 169847826 4 LowQD
|
417
|
+
1 203098164 3 LowQD
|
418
|
+
2 39213209 4 LowQD
|
419
|
+
```
|
420
|
+
|
400
421
|
Likewise you can check for record validity
|
401
422
|
|
402
423
|
```sh
|
@@ -625,7 +646,7 @@ indexed value array
|
|
625
646
|
and 'gts' as a nucleotide string array
|
626
647
|
|
627
648
|
```ruby
|
628
|
-
bio-vcf --seval 's.gts
|
649
|
+
bio-vcf --seval 's.gts'
|
629
650
|
1 10665 C C C C
|
630
651
|
1 10694 G G
|
631
652
|
1 12783 G G G G G G G
|
@@ -634,6 +655,28 @@ and 'gts' as a nucleotide string array
|
|
634
655
|
|
635
656
|
where gts represents the indexed genotype on [ref] + [alt].
|
636
657
|
|
658
|
+
To convert combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
|
659
|
+
1/1 -> 2, which is useful for indexed fields giving information on, for
|
660
|
+
example significance, use
|
661
|
+
|
662
|
+
```ruby
|
663
|
+
bio-vcf --seval '!s.empty? and s.gtindex'
|
664
|
+
11 58949455 0 1
|
665
|
+
11 65481082 0 1
|
666
|
+
11 94180424 0 1
|
667
|
+
11 121036021 0 1
|
668
|
+
```
|
669
|
+
|
670
|
+
Now you can index other fields, e.g. GL
|
671
|
+
|
672
|
+
```ruby
|
673
|
+
./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
|
674
|
+
1 900057 1.0 1.0 0.994 1.0 1.0 -1 0.999 1.0 0.997 -1 0.994 0.989 -1 0.991 -1 0.972 0.992 1.0
|
675
|
+
```
|
676
|
+
|
677
|
+
shows a number of SNPs have been scored with high significance and a
|
678
|
+
number are missing, here marked as -1.
|
679
|
+
|
637
680
|
These values can also be used in filters and output allele depth, for
|
638
681
|
example
|
639
682
|
|
@@ -655,6 +698,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
|
|
655
698
|
1 13757 47 47 4 47 47 4 47
|
656
699
|
```
|
657
700
|
|
701
|
+
## Sample counting
|
702
|
+
|
703
|
+
Note, the use of lambda allows for sophisticated queries. You may need
|
704
|
+
some expert advice here.
|
705
|
+
|
706
|
+
To count valid genotype fields in samples you can do something like
|
707
|
+
|
708
|
+
```ruby
|
709
|
+
bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
|
710
|
+
```
|
711
|
+
|
712
|
+
A similar complex count would be
|
713
|
+
|
714
|
+
```ruby
|
715
|
+
bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
|
716
|
+
```
|
717
|
+
|
718
|
+
which tests for perfect SNPs scored (for example).
|
719
|
+
|
720
|
+
## Reorder filter with lambda
|
721
|
+
|
722
|
+
Sometimes it pays to reorder the filter using a lambda. This is one
|
723
|
+
example where the greedy sample counts are done only for those
|
724
|
+
samples that match the other criteria:
|
725
|
+
|
726
|
+
```ruby
|
727
|
+
./bin/bio-vcf --num-threads=1 --filter '((r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false)'
|
728
|
+
```
|
658
729
|
|
659
730
|
## Modify VCF files
|
660
731
|
|
@@ -904,7 +975,10 @@ Simple statistics are available for REF>ALT changes:
|
|
904
975
|
|
905
976
|
## Other examples
|
906
977
|
|
907
|
-
For more examples see
|
978
|
+
For more exercises and examples see
|
979
|
+
[doc](https://github.com/pjotrp/bioruby-vcf/tree/master/doc) directory
|
980
|
+
and the feature
|
981
|
+
[section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
|
908
982
|
|
909
983
|
## API
|
910
984
|
|
@@ -937,13 +1011,62 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
937
1011
|
|
938
1012
|
## Trouble shooting
|
939
1013
|
|
1014
|
+
### MRI supports threading
|
1015
|
+
|
940
1016
|
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
941
1017
|
in single threaded mode (for now).
|
942
1018
|
|
1019
|
+
### Set TMPDIR when running out of space
|
1020
|
+
|
943
1021
|
The multi-threading creates temporary files using the system TMPDIR.
|
944
1022
|
This behaviour can be overridden by setting the environment variable.
|
945
|
-
|
946
|
-
|
1023
|
+
|
1024
|
+
### Reorder filter on time out
|
1025
|
+
|
1026
|
+
Make sure to minimize expensive calculations by moving them
|
1027
|
+
backward. An 'and' statement is evaluated from left to right. With
|
1028
|
+
|
1029
|
+
```ruby
|
1030
|
+
fast_check and slow_check
|
1031
|
+
```
|
1032
|
+
|
1033
|
+
slow_check only gets executed if fast_check is true.
|
1034
|
+
|
1035
|
+
For more complex filters use lambda inside a conditional
|
1036
|
+
|
1037
|
+
```ruby
|
1038
|
+
( fast_check ? lambda { slow_check }.call : false )
|
1039
|
+
```
|
1040
|
+
|
1041
|
+
where slow_check is the slow section of your query, as is shown
|
1042
|
+
earlier in this document. Don't forget the .call!
|
1043
|
+
|
1044
|
+
### Reduce thread lines on timeout
|
1045
|
+
|
1046
|
+
Depending on your input data and the speed of your filters it may be useful to
|
1047
|
+
tweak the number of thread lines and/or to increase the timeout.
|
1048
|
+
|
1049
|
+
On really fast file systems for genome-wide sequencing try increasing
|
1050
|
+
--thread-lines to a value larger than 100_000. On the other hand if
|
1051
|
+
the computations are intensive (per line) reduce the number of
|
1052
|
+
thread-lines (try 10_000 and 1_000). If processes get killed that is
|
1053
|
+
the one to try.
|
1054
|
+
|
1055
|
+
For larger files set the timeout to 600 or so, e.g. --timeout 600.
|
1056
|
+
|
1057
|
+
Different values may show different core use on a machine.
|
1058
|
+
|
1059
|
+
### Debugging
|
1060
|
+
|
1061
|
+
To debug output use '-v --num-threads=1' for generating useful
|
1062
|
+
output. Also do not use the -i switch (ignore errors) when there
|
1063
|
+
are problems.
|
1064
|
+
|
1065
|
+
### Tmpdir contains (old) bio-vcf directories
|
1066
|
+
|
1067
|
+
Multi-threaded bio-vcf writes into a temporary directory during
|
1068
|
+
processing. When a process gets interrupted for some reason the
|
1069
|
+
temporary directory may remain.
|
947
1070
|
|
948
1071
|
## Project home page
|
949
1072
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.9.
|
1
|
+
0.9.2
|
data/bin/bio-vcf
CHANGED
@@ -27,7 +27,7 @@ require 'fileutils'
|
|
27
27
|
# Bio::Log::CLI.logger('stderr')
|
28
28
|
# Bio::Log::CLI.trace('info')
|
29
29
|
|
30
|
-
options = { show_help: false, source: 'https://github.com/
|
30
|
+
options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
|
31
31
|
opts = OptionParser.new do |o|
|
32
32
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
33
33
|
|
@@ -58,6 +58,9 @@ opts = OptionParser.new do |o|
|
|
58
58
|
o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
|
59
59
|
options[:efilter_samples] = l
|
60
60
|
end
|
61
|
+
o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
|
62
|
+
options[:add_filter] = name
|
63
|
+
end
|
61
64
|
|
62
65
|
o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
|
63
66
|
options[:bed] = bed
|
@@ -88,7 +91,7 @@ opts = OptionParser.new do |o|
|
|
88
91
|
options[:rdf] = true
|
89
92
|
options[:skip_header] = true
|
90
93
|
end
|
91
|
-
o.on("--num-threads [num]", Integer, "Multi-core version (default
|
94
|
+
o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
|
92
95
|
options[:num_threads] = i
|
93
96
|
end
|
94
97
|
o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
|
@@ -148,7 +151,7 @@ opts = OptionParser.new do |o|
|
|
148
151
|
options[:verbose] = true
|
149
152
|
end
|
150
153
|
|
151
|
-
o.on("--debug", "Show debug messages") do |v|
|
154
|
+
o.on("--debug", "Show debug messages and keep intermediate output") do |v|
|
152
155
|
# Bio::Log::CLI.trace('debug')
|
153
156
|
options[:debug] = true
|
154
157
|
end
|
@@ -196,6 +199,8 @@ end
|
|
196
199
|
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
197
200
|
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
198
201
|
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
202
|
+
# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
|
203
|
+
# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
|
199
204
|
|
200
205
|
if options[:samples]
|
201
206
|
samples = options[:samples].map { |s| s.to_i }
|
@@ -218,6 +223,13 @@ def parse_header line, samples, options
|
|
218
223
|
if headerline =~ /^#CHR/
|
219
224
|
# The header before actual data contains the sample names, first inject the BioVcf meta information
|
220
225
|
print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
|
226
|
+
# Then the additional filter(s)
|
227
|
+
# ##FILTER=<ID=LowQual,Description="Low quality">
|
228
|
+
add_filter = options[:add_filter]
|
229
|
+
if add_filter
|
230
|
+
print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
|
231
|
+
end
|
232
|
+
|
221
233
|
selected = header.column_names
|
222
234
|
if samples
|
223
235
|
newfields = selected[0..8]
|
@@ -247,9 +259,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
247
259
|
sfilter = options[:sfilter]
|
248
260
|
efilter = options[:efilter]
|
249
261
|
ifilter = options[:ifilter]
|
262
|
+
add_filter = options[:add_filter] # contains a filter name (soft filter)
|
250
263
|
seval = options[:seval]
|
251
264
|
ignore_missing = options[:ignore_missing]
|
252
265
|
quiet = options[:quiet]
|
266
|
+
set_filter_field = nil
|
253
267
|
|
254
268
|
if sfilter or efilter or ifilter or seval
|
255
269
|
# check for samples
|
@@ -264,15 +278,28 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
264
278
|
return if not bed
|
265
279
|
end
|
266
280
|
|
267
|
-
|
281
|
+
skip = lambda { |&m|
|
282
|
+
matched = m.call
|
283
|
+
if add_filter
|
284
|
+
set_filter_field = true if matched
|
285
|
+
false # always continue processing with an add-filter
|
286
|
+
else
|
287
|
+
not matched
|
288
|
+
end
|
289
|
+
}
|
290
|
+
|
291
|
+
if filter
|
292
|
+
return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
293
|
+
end
|
268
294
|
|
269
|
-
if sfilter
|
295
|
+
if sfilter # sample 'or' filter
|
270
296
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
271
|
-
return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
297
|
+
# return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
298
|
+
return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
272
299
|
end
|
273
300
|
end
|
274
301
|
|
275
|
-
if ifilter
|
302
|
+
if ifilter # include sample filter
|
276
303
|
found = false
|
277
304
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
278
305
|
if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
@@ -281,12 +308,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
281
308
|
end
|
282
309
|
end
|
283
310
|
# Skip if there are no matches
|
284
|
-
return if
|
311
|
+
return if skip.call {found}
|
285
312
|
end
|
286
313
|
|
287
|
-
if efilter
|
314
|
+
if efilter # exclude sample filter
|
288
315
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
289
|
-
return if
|
316
|
+
return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
290
317
|
end
|
291
318
|
end
|
292
319
|
|
@@ -294,6 +321,9 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
294
321
|
|
295
322
|
# -----------------------------
|
296
323
|
# From here on decide on output
|
324
|
+
|
325
|
+
rec.add_to_filter_field(add_filter) if set_filter_field
|
326
|
+
|
297
327
|
if samples
|
298
328
|
# Select certain samples for output
|
299
329
|
newfields = fields[0..8]
|
@@ -349,10 +379,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
349
379
|
end
|
350
380
|
end
|
351
381
|
|
352
|
-
|
382
|
+
CHUNK_SIZE = options[:thread_lines]
|
383
|
+
|
384
|
+
pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
|
385
|
+
options[:quiet],options[:debug])
|
353
386
|
header = nil
|
354
387
|
header_output_completed = false
|
355
|
-
CHUNK_SIZE = options[:thread_lines]
|
356
388
|
chunk_lines = []
|
357
389
|
line_number=0
|
358
390
|
|
@@ -411,7 +443,7 @@ begin
|
|
411
443
|
# ---- In the following section the VCF lines are parsed by chunks
|
412
444
|
# The chunks may go into different threads
|
413
445
|
|
414
|
-
if chunk_lines.size
|
446
|
+
if chunk_lines.size >= CHUNK_SIZE
|
415
447
|
# ---- process one chunk
|
416
448
|
$stderr.print '.' if not options[:quiet]
|
417
449
|
pcows.wait_for_worker_slot()
|
@@ -421,7 +453,7 @@ begin
|
|
421
453
|
chunk_lines = []
|
422
454
|
end
|
423
455
|
end
|
424
|
-
pcows.
|
456
|
+
pcows.submit_final_worker(process,chunk_lines)
|
425
457
|
pcows.wait_for_workers()
|
426
458
|
pcows.process_remaining_output()
|
427
459
|
|
@@ -429,8 +461,8 @@ begin
|
|
429
461
|
stats.print if stats
|
430
462
|
|
431
463
|
rescue Exception => e
|
432
|
-
# $stderr.print line
|
433
464
|
$stderr.print e.message,"\n" if e.message != 'exit'
|
465
|
+
pcows.cleanup()
|
434
466
|
raise if options[:verbose]
|
435
467
|
exit 1
|
436
468
|
end
|