bio-vcf 0.9.0 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -3
  3. data/Gemfile.lock +44 -0
  4. data/README.md +151 -28
  5. data/VERSION +1 -1
  6. data/bin/bio-vcf +47 -15
  7. data/bio-vcf.gemspec +4 -21
  8. data/features/#cli.feature# +71 -0
  9. data/features/cli.feature +3 -3
  10. data/features/filter.feature +12 -0
  11. data/features/filter.feature~ +35 -0
  12. data/features/somaticsniper.feature +2 -0
  13. data/features/step_definitions/cli-feature.rb +5 -0
  14. data/features/step_definitions/somaticsniper.rb +8 -0
  15. data/lib/bio-vcf/pcows.rb +123 -36
  16. data/lib/bio-vcf/vcfgenotypefield.rb +1 -1
  17. data/lib/bio-vcf/vcfrecord.rb +21 -0
  18. data/lib/bio-vcf/vcfsample.rb +13 -0
  19. data/test/data/regression/eval_once-stderr.new +2 -1
  20. data/test/data/regression/eval_r.info.dp-stderr.new +8 -4
  21. data/test/data/regression/ifilter_s.dp-stderr.new +8 -4
  22. data/test/data/regression/pass1-stderr.new +8 -0
  23. data/test/data/regression/pass1.new +88 -0
  24. data/test/data/regression/pass1.ref +88 -0
  25. data/test/data/regression/r.info.dp-stderr.new +8 -4
  26. data/test/data/regression/rewrite.info.sample-stderr.new +8 -4
  27. data/test/data/regression/s.dp-stderr.new +8 -4
  28. data/test/data/regression/seval_s.dp-stderr.new +8 -4
  29. data/test/data/regression/sfilter_seval_s.dp-stderr.new +8 -4
  30. data/test/data/regression/thread4-stderr.new +8 -4
  31. data/test/data/regression/thread4_4-stderr.new +44 -15
  32. data/test/data/regression/vcf2json_full_header-stderr.new +8 -4
  33. data/test/data/regression/vcf2json_use_meta-stderr.new +8 -4
  34. data/test/stress/stress_test.sh +15 -0
  35. data/test/stress/stress_test.sh~ +8 -0
  36. metadata +14 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1364a1b5bd401632c6c2ae2497a358c206d7a9e6
4
- data.tar.gz: 2a8cad615012dcd175cc3bb614888b33db16483b
3
+ metadata.gz: a09729e3548751923f4b3c5ef81c8c9d7402b6b2
4
+ data.tar.gz: 4c525ad745c5486075e9a0f14fe5372a21c8f056
5
5
  SHA512:
6
- metadata.gz: d8420c9926835ff632cd8dd93c057c08f37dc557c86d76fd1766a044dcc5a8b7c573553c2ef6efe8e348d868666c7fa6d62b34779e214b090cf3287e087e61c8
7
- data.tar.gz: b666a3d97a63ac18ef7ff88e002b88dee8a81e703cadaacaffc968fe7f634591bcf5df40cd930b9493e293530047d8cc6a31df446ebf1e5cd7bfb2f6164e3e0a
6
+ metadata.gz: 343083ee8c055f534a840c8f668cb35a0c33fbccabe2b580edf859747aff8c8069266168ac66631bc3bbd2c8f58691847796eb00cef7784c7ebf966ec85e1d4f
7
+ data.tar.gz: f55292d0d744a496a5b39123285904a120f4c4ffae066dc33f244e09ae021618e94071cf8e587f56debfaf0c54233c3a5688887e21b21f5640bc8b271a3a00bb
@@ -3,9 +3,8 @@ sudo: false # required for the new containers
3
3
  language: ruby
4
4
  rvm:
5
5
  # - 1.9.3 <- No longer working
6
- # - 2.0.0
7
- # - 2.1.0
8
- - 2.2.2
6
+ - 2.1.0
7
+ - 2.2.3
9
8
 
10
9
  # install:
11
10
  # - gem install cucumber rspec regressiontest
@@ -0,0 +1,44 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ builder (3.2.2)
5
+ cucumber (2.1.0)
6
+ builder (>= 2.1.2)
7
+ cucumber-core (~> 1.3.0)
8
+ diff-lcs (>= 1.1.3)
9
+ gherkin3 (~> 3.1.0)
10
+ multi_json (>= 1.7.5, < 2.0)
11
+ multi_test (>= 0.1.2)
12
+ cucumber-core (1.3.0)
13
+ gherkin3 (~> 3.1.0)
14
+ diff-lcs (1.2.5)
15
+ gherkin3 (3.1.1)
16
+ multi_json (1.11.2)
17
+ multi_test (0.1.2)
18
+ rake (10.4.2)
19
+ regressiontest (0.0.3)
20
+ rspec (3.3.0)
21
+ rspec-core (~> 3.3.0)
22
+ rspec-expectations (~> 3.3.0)
23
+ rspec-mocks (~> 3.3.0)
24
+ rspec-core (3.3.2)
25
+ rspec-support (~> 3.3.0)
26
+ rspec-expectations (3.3.1)
27
+ diff-lcs (>= 1.2.0, < 2.0)
28
+ rspec-support (~> 3.3.0)
29
+ rspec-mocks (3.3.2)
30
+ diff-lcs (>= 1.2.0, < 2.0)
31
+ rspec-support (~> 3.3.0)
32
+ rspec-support (3.3.0)
33
+
34
+ PLATFORMS
35
+ ruby
36
+
37
+ DEPENDENCIES
38
+ cucumber
39
+ rake
40
+ regressiontest (>= 0.0.3)
41
+ rspec
42
+
43
+ BUNDLED WITH
44
+ 1.10.6
data/README.md CHANGED
@@ -4,6 +4,9 @@
4
4
 
5
5
  ## Updates
6
6
 
7
+ * Getting ready for a 1.0 release
8
+ * 0.9.1 removed a rare threading bug and cleanup on error
9
+ * Added support for soft filters (request by Brad Chapman)
7
10
  * The outputter now writes (properly) in parallel with the parser
8
11
  * bio-vcf turns any VCF into JSON with header information, and
9
12
  allows you to pipe that JSON directly into any JSON supporting
@@ -23,18 +26,19 @@ So, why would you use bio-vcf over other parsers? Because
23
26
  3. Bio-vcf has great multi-sample support
24
27
  4. Bio-vcf has multiple global filters and sample filters
25
28
  5. Bio-vcf can access any VCF format
26
- 6. Bio-vcf can parse and query the VCF header (META)
29
+ 6. Bio-vcf can parse and query the VCF header (META data)
27
30
  7. Bio-vcf can do calculations on fields
28
31
  8. Bio-vcf allows for genotype processing
29
32
  9. Bio-vcf has support for set analysis
30
33
  10. Bio-vcf has sane error handling
31
34
  11. Bio-vcf can convert *any* VCF to *any* output, including tabular data, BED, HTML, LaTeX, RDF, JSON and JSON-LD and even other VCFs by using (erb) templates
35
+ 12. Bio-vcf has soft filters
32
36
 
33
- Bio-vcf has better performance than other tools
34
- because of lazy parsing, multi-threading, and useful combinations of
35
- (fancy) command line filtering (who says Ruby is slow?). Adding
36
- cores, bio-vcf just does better. The more complicated the filters,
37
- the larger the gain. First the base line test to show IO performance
37
+ Bio-vcf has better performance than other tools because of lazy
38
+ parsing, multi-threading, and useful combinations of (fancy) command
39
+ line filtering (who says Ruby is slow?). Adding cores, bio-vcf just
40
+ does better. The more complicated the filters, the larger the
41
+ gain. First a base line test to show IO performance
38
42
 
39
43
  ```sh
40
44
  time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|wc
@@ -44,7 +48,7 @@ the larger the gain. First the base line test to show IO performance
44
48
  sys 0m2.972s
45
49
  ```
46
50
 
47
- Next run the 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
51
+ Next run this 1Gb data with bio-vcf effectively using 5 cores on AMD Opteron(tm) Processor 6174 using Linux
48
52
 
49
53
  ```sh
50
54
  time cat ESP6500SI-V2-SSA137.GRCh38-liftover.*.vcf|./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp.to_f>0.3' > /dev/null
@@ -62,10 +66,11 @@ user 12m53.273s
62
66
  sys 0m9.913s
63
67
  ```
64
68
 
65
- This means that on this machine bio-vcf is 24x faster than SnpSift even for a simple filter.
66
- In fact, bio-vcf is perfect for complex filters and parsing large data files on powerful machines. Parsing a 650 Mb GATK
67
- Illumina Hiseq VCF file and evaluating the results into a BED format on
68
- a 16 core machine takes
69
+ This means that on this machine bio-vcf is 24x faster than SnpSift
70
+ even for a simple filter. In fact, bio-vcf is perfect for complex
71
+ filters and parsing large data files on powerful machines. Parsing a
72
+ 650 Mb GATK Illumina Hiseq VCF file and evaluating the results into a
73
+ BED format on a 16 core machine takes
69
74
 
70
75
  ```sh
71
76
  time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
@@ -77,8 +82,11 @@ a 16 core machine takes
77
82
  which shows decent core utilisation (10x). Running
78
83
  gzip compressed VCF files of 30+ Gb has similar performance gains.
79
84
 
80
- Use zcat to
81
- pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
85
+ To view some complex filters on an 80Gb SNP file check out a
86
+ [GTEx exercise](https://github.com/pjotrp/bioruby-vcf/blob/master/doc/GTEx_reduce.md).
87
+
88
+ Use zcat (or even better pigz which is multi-core itself) to pipe such
89
+ gzipped (vcf.gz) files into bio-vcf, e.g.
82
90
 
83
91
  ```sh
84
92
  zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
@@ -86,11 +94,12 @@ pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
86
94
  --eval '[r.chrom,r.pos,r.pos+1]' > test.bed
87
95
  ```
88
96
 
89
- bio-vcf comes with a sensible parser definition language (interestingly it is 100%
90
- Ruby), an embedded Ragel parser for INFO and FORMAT header definitions, as well as primitives for set analysis. Few
97
+ bio-vcf comes with a sensible parser definition language
98
+ (interestingly it is 100% Ruby), an embedded Ragel parser for INFO and
99
+ FORMAT header definitions, as well as primitives for set analysis. Few
91
100
  assumptions are made about the actual contents of the VCF file (field
92
- names are resolved on the fly), so bio-vcf should work with
93
- all VCF files.
101
+ names are resolved on the fly), so bio-vcf should work with all VCF
102
+ files.
94
103
 
95
104
  To fetch all entries where all samples have depth larger than 20 and
96
105
  filter set to PASS use a sample filter
@@ -168,7 +177,8 @@ bio-vcf -i --seval 's.ad[1]'
168
177
  1 10303 25 31 28 32 17 23 22
169
178
  ```
170
179
 
171
- To calculate alt frequencies from s.ad which is sample (alt dp)/(ref dp + alt dp)
180
+ To calculate percentage non-reference (PNR) alt frequencies from s.ad
181
+ which is sample (alt dp)/(ref dp + alt dp)
172
182
 
173
183
  ```ruby
174
184
  bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
@@ -262,12 +272,6 @@ gem install bio-vcf
262
272
  bio-vcf -h
263
273
  ```
264
274
 
265
- For multi-core also install the parallel gem
266
-
267
- ```sh
268
- gem install parallel
269
- ```
270
-
271
275
  ## Command line interface (CLI)
272
276
 
273
277
  Get the version of the VCF file
@@ -397,6 +401,23 @@ or for all
397
401
  bio-vcf --filter "rec.missing_samples?" < file.vcf
398
402
  ```
399
403
 
404
+ To set a soft filter, i.e. the filter column is updated
405
+
406
+ ```sh
407
+ bio-vcf --add-filter LowQD --filter 'r.tumor.dp<5' < test/data/input/somaticsniper.vcf |bio-vcf --eval '[r.chr,r.pos,r.tumor.dp,r.filter]' --filter 'r.filter.index("LowQD")'
408
+ ```
409
+
410
+ may render something like
411
+
412
+ ```
413
+ 1 46527674 4 LowQD
414
+ 1 108417572 4 LowQD
415
+ 1 155449089 4 LowQD
416
+ 1 169847826 4 LowQD
417
+ 1 203098164 3 LowQD
418
+ 2 39213209 4 LowQD
419
+ ```
420
+
400
421
  Likewise you can check for record validity
401
422
 
402
423
  ```sh
@@ -625,7 +646,7 @@ indexed value array
625
646
  and 'gts' as a nucleotide string array
626
647
 
627
648
  ```ruby
628
- bio-vcf --seval 's.gts[0]'
649
+ bio-vcf --seval 's.gts'
629
650
  1 10665 C C C C
630
651
  1 10694 G G
631
652
  1 12783 G G G G G G G
@@ -634,6 +655,28 @@ and 'gts' as a nucleotide string array
634
655
 
635
656
  where gts represents the indexed genotype on [ref] + [alt].
636
657
 
658
+ To convert combined genotypes into numbers, i.e., 0/0 -> 0, 0/1 -> 1,
659
+ 1/1 -> 2, is useful for indexed fields giving information on, for
660
+ example significance, use
661
+
662
+ ```ruby
663
+ bio-vcf --seval '!s.empty? and s.gtindex'
664
+ 11 58949455 0 1
665
+ 11 65481082 0 1
666
+ 11 94180424 0 1
667
+ 11 121036021 0 1
668
+ ```
669
+
670
+ Now you can index other fields, e.g. GL
671
+
672
+ ```ruby
673
+ ./bin/bio-vcf --seval '[(!s.empty? ? s.gl[s.gtindex]:-1)]'
674
+ 1 900057 1.0 1.0 0.994 1.0 1.0 -1 0.999 1.0 0.997 -1 0.994 0.989 -1 0.991 -1 0.972 0.992 1.0
675
+ ```
676
+
677
+ shows a number of SNPs have been scored with high significance and a
678
+ number are missing, here marked as -1.
679
+
637
680
  These values can also be used in filters and output allele depth, for
638
681
  example
639
682
 
@@ -655,6 +698,34 @@ bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0
655
698
  1 13757 47 47 4 47 47 4 47
656
699
  ```
657
700
 
701
+ ## Sample counting
702
+
703
+ Note, the use of lambda allows for sophisticated queries. You may need
704
+ some expert advice here.
705
+
706
+ To count valid genotype fields in samples you can do something like
707
+
708
+ ```ruby
709
+ bio-vcf --eval 'r.samples.count {|s| s.gt!="./."}'
710
+ ```
711
+
712
+ A similar complex count would be
713
+
714
+ ```ruby
715
+ bio-vcf --eval '[r.chr,r.pos,r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }]'
716
+ ```
717
+
718
+ which tests for perfect SNPs scored (for example).
719
+
720
+ ## Reorder filter with lambda
721
+
722
+ Sometimes it pays to reorder the filter using a lambda. This is one
723
+ example where the greedy sample counts are done only for those
724
+ samples that match the other criteria:
725
+
726
+ ```ruby
727
+ ./bin/bio-vcf --num-threads=1 --filter '((r.info.miss<0.05 and r.info.exp_freq_a1>0.05 and r.info.exp_freq_a1<0.95 and r.info.impinfo>0.7 and r.info.hw<1.0) ? lambda { found=r.samples.count { |s| (!s.empty? && s.gl[s.gtindex]==1.0) }.to_f; total=r.samples.count{|s| s.gt!="./."} ; found/total>0.7 and total-found<30 }.call : false)'
728
+ ```
658
729
 
659
730
  ## Modify VCF files
660
731
 
@@ -904,7 +975,10 @@ Simple statistics are available for REF>ALT changes:
904
975
 
905
976
  ## Other examples
906
977
 
907
- For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
978
+ For more exercises and examples see
979
+ [doc](https://github.com/pjotrp/bioruby-vcf/tree/master/doc) directory
980
+ and the feature
981
+ [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
908
982
 
909
983
  ## API
910
984
 
@@ -937,13 +1011,62 @@ what the command line interface uses (see ./bin/bio-vcf)
937
1011
 
938
1012
  ## Trouble shooting
939
1013
 
1014
+ ### MRI supports threading
1015
+
940
1016
  Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
941
1017
  in single threaded mode (for now).
942
1018
 
1019
+ ### Set TMPDIR when running out of space
1020
+
943
1021
  The multi-threading creates temporary files using the system TMPDIR.
944
1022
  This behaviour can be overridden by setting the environment variable.
945
- Also, for genome-wide sequencing it may be useful to increase
946
- --thread-lines to a value larger than 1_000_000.
1023
+
1024
+ ### Reorder filter on time out
1025
+
1026
+ Make sure to minimize expensive calculations by moving them
1027
+ backward. An 'and' statement is evaluated from left to right. With
1028
+
1029
+ ```ruby
1030
+ fast_check and slow_check
1031
+ ```
1032
+
1033
+ slow_check only gets executed if fast_check is true.
1034
+
1035
+ For more complex filters use lambda inside a conditional
1036
+
1037
+ ```ruby
1038
+ ( fast_check ? lambda { slow_check }.call : false )
1039
+ ```
1040
+
1041
+ where slow_check is the slow section of your query, as is shown
1042
+ earlier in this document. Don't forget the .call!
1043
+
1044
+ ### Reduce thread lines on timeout
1045
+
1046
+ Depending on your input data and the speed of the filters it may be useful to
1047
+ tweak the number of thread lines and/or to increase the timeout.
1048
+
1049
+ On really fast file systems for genome-wide sequencing try increasing
1050
+ --thread-lines to a value larger than 100_000. On the other hand if
1051
+ the computations are intensive (per line) reduce the number of
1052
+ thread-lines (try 10_000 and 1_000). If processes get killed that is
1053
+ the one to try.
1054
+
1055
+ For larger files set the timeout to 600 or so, e.g. --timeout 600.
1056
+
1057
+ Different values may show different core use on a machine.
1058
+
1059
+ ### Debugging
1060
+
1061
+ To debug output use '-v --num-threads=1' for generating useful
1062
+ output. Also do not use the -i switch (ignore errors) when there
1063
+ are problems.
1064
+
1065
+ ### Tmpdir contains (old) bio-vcf directories
1066
+
1067
+ Multi-threaded bio-vcf writes into a temporary directory during
1068
+ processing. When a process gets interrupted for some reason the
1069
+ temporary directory may remain.
947
1070
 
948
1071
  ## Project home page
949
1072
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.9.0
1
+ 0.9.2
@@ -27,7 +27,7 @@ require 'fileutils'
27
27
  # Bio::Log::CLI.logger('stderr')
28
28
  # Bio::Log::CLI.trace('info')
29
29
 
30
- options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
30
+ options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
31
31
  opts = OptionParser.new do |o|
32
32
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
33
33
 
@@ -58,6 +58,9 @@ opts = OptionParser.new do |o|
58
58
  o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
59
59
  options[:efilter_samples] = l
60
60
  end
61
+ o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
62
+ options[:add_filter] = name
63
+ end
61
64
 
62
65
  o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
63
66
  options[:bed] = bed
@@ -88,7 +91,7 @@ opts = OptionParser.new do |o|
88
91
  options[:rdf] = true
89
92
  options[:skip_header] = true
90
93
  end
91
- o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
94
+ o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
92
95
  options[:num_threads] = i
93
96
  end
94
97
  o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
@@ -148,7 +151,7 @@ opts = OptionParser.new do |o|
148
151
  options[:verbose] = true
149
152
  end
150
153
 
151
- o.on("--debug", "Show debug messages") do |v|
154
+ o.on("--debug", "Show debug messages and keep intermediate output") do |v|
152
155
  # Bio::Log::CLI.trace('debug')
153
156
  options[:debug] = true
154
157
  end
@@ -196,6 +199,8 @@ end
196
199
  raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
197
200
  raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
198
201
  raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
202
+ # raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
203
+ # raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
199
204
 
200
205
  if options[:samples]
201
206
  samples = options[:samples].map { |s| s.to_i }
@@ -218,6 +223,13 @@ def parse_header line, samples, options
218
223
  if headerline =~ /^#CHR/
219
224
  # The header before actual data contains the sample names, first inject the BioVcf meta information
220
225
  print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
226
+ # Then the additional filter(s)
227
+ # ##FILTER=<ID=LowQual,Description="Low quality">
228
+ add_filter = options[:add_filter]
229
+ if add_filter
230
+ print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
231
+ end
232
+
221
233
  selected = header.column_names
222
234
  if samples
223
235
  newfields = selected[0..8]
@@ -247,9 +259,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
247
259
  sfilter = options[:sfilter]
248
260
  efilter = options[:efilter]
249
261
  ifilter = options[:ifilter]
262
+ add_filter = options[:add_filter] # contains a filter name (soft filter)
250
263
  seval = options[:seval]
251
264
  ignore_missing = options[:ignore_missing]
252
265
  quiet = options[:quiet]
266
+ set_filter_field = nil
253
267
 
254
268
  if sfilter or efilter or ifilter or seval
255
269
  # check for samples
@@ -264,15 +278,28 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
264
278
  return if not bed
265
279
  end
266
280
 
267
- return if filter and not rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
281
+ skip = lambda { |&m|
282
+ matched = m.call
283
+ if add_filter
284
+ set_filter_field = true if matched
285
+ false # always continue processing with an add-filter
286
+ else
287
+ not matched
288
+ end
289
+ }
290
+
291
+ if filter
292
+ return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
293
+ end
268
294
 
269
- if sfilter
295
+ if sfilter # sample 'or' filter
270
296
  rec.each_sample(options[:sfilter_samples]) do | sample |
271
- return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
297
+ # return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
298
+ return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
272
299
  end
273
300
  end
274
301
 
275
- if ifilter
302
+ if ifilter # include sample filter
276
303
  found = false
277
304
  rec.each_sample(options[:ifilter_samples]) do | sample |
278
305
  if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
@@ -281,12 +308,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
281
308
  end
282
309
  end
283
310
  # Skip if there are no matches
284
- return if not found
311
+ return if skip.call {found}
285
312
  end
286
313
 
287
- if efilter
314
+ if efilter # exclude sample filter
288
315
  rec.each_sample(options[:efilter_samples]) do | sample |
289
- return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
316
+ return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
290
317
  end
291
318
  end
292
319
 
@@ -294,6 +321,9 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
294
321
 
295
322
  # -----------------------------
296
323
  # From here on decide on output
324
+
325
+ rec.add_to_filter_field(add_filter) if set_filter_field
326
+
297
327
  if samples
298
328
  # Select certain samples for output
299
329
  newfields = fields[0..8]
@@ -349,10 +379,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
349
379
  end
350
380
  end
351
381
 
352
- pcows = PCOWS.new(options[:num_threads],'bio-vcf',options[:timeout])
382
+ CHUNK_SIZE = options[:thread_lines]
383
+
384
+ pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
385
+ options[:quiet],options[:debug])
353
386
  header = nil
354
387
  header_output_completed = false
355
- CHUNK_SIZE = options[:thread_lines]
356
388
  chunk_lines = []
357
389
  line_number=0
358
390
 
@@ -411,7 +443,7 @@ begin
411
443
  # ---- In the following section the VCF lines are parsed by chunks
412
444
  # The chunks may go into different threads
413
445
 
414
- if chunk_lines.size > CHUNK_SIZE
446
+ if chunk_lines.size >= CHUNK_SIZE
415
447
  # ---- process one chunk
416
448
  $stderr.print '.' if not options[:quiet]
417
449
  pcows.wait_for_worker_slot()
@@ -421,7 +453,7 @@ begin
421
453
  chunk_lines = []
422
454
  end
423
455
  end
424
- pcows.submit_worker(process,chunk_lines)
456
+ pcows.submit_final_worker(process,chunk_lines)
425
457
  pcows.wait_for_workers()
426
458
  pcows.process_remaining_output()
427
459
 
@@ -429,8 +461,8 @@ begin
429
461
  stats.print if stats
430
462
 
431
463
  rescue Exception => e
432
- # $stderr.print line
433
464
  $stderr.print e.message,"\n" if e.message != 'exit'
465
+ pcows.cleanup()
434
466
  raise if options[:verbose]
435
467
  exit 1
436
468
  end