bio-vcf 0.7.0 → 0.7.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +3 -2
- data/Gemfile +2 -5
- data/Gemfile.lock +3 -3
- data/README.md +101 -23
- data/Rakefile +4 -2
- data/VERSION +1 -1
- data/bin/bio-vcf +133 -73
- data/bio-vcf.gemspec +13 -10
- data/features/cli.feature +9 -1
- data/features/multisample.feature +4 -4
- data/features/sfilter.feature +1 -1
- data/features/step_definitions/cli-feature.rb +4 -0
- data/features/step_definitions/multisample.rb +24 -12
- data/features/step_definitions/sfilter.rb +80 -31
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +45 -9
- data/lib/bio-vcf/vcfheader.rb +1 -1
- data/lib/bio-vcf/vcfrecord.rb +14 -8
- data/lib/bio-vcf/vcfsample.rb +101 -152
- data/lib/bio-vcf/vcfstatistics.rb +28 -0
- data/test/data/regression/ifilter_s.dp.ref +31 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -0
- metadata +16 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 72f63aae77e382c88e04cb07344ab3fce2a57232
|
4
|
+
data.tar.gz: 48f36f4d75d18edf3619124f0b679706a002b646
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1d1513454924e1d84bb9aecf1bd83cf0d8784e3ab8669bb7777129f2a8b33df53a2118d4fea15a825ce1c399de05cf729e0f7b5e7acd35367856a6a1821f328
|
7
|
+
data.tar.gz: 7b40ffdad49cbb690cfaa4d02e7a060095dc13aa4533f091f1f20b2f2b1903c71d2f6c6c5c5f672dc3be4da751305c2583265dbe7d86308501ab0219cca9e414
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -9,9 +9,6 @@ group :development do
|
|
9
9
|
# gem "minitest"
|
10
10
|
gem "rspec"
|
11
11
|
gem "cucumber"
|
12
|
-
gem "jeweler" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
-
|
14
|
-
# gem "bio", ">= 1.4.2"
|
15
|
-
# gem "rdoc", "~> 3.12"
|
16
|
-
gem "regressiontest"
|
12
|
+
gem "jeweler", "~> 2.0.1" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
+
gem "regressiontest", "~> 0.0.3"
|
17
14
|
end
|
data/Gemfile.lock
CHANGED
@@ -60,7 +60,7 @@ GEM
|
|
60
60
|
rake (10.1.1)
|
61
61
|
rdoc (4.1.1)
|
62
62
|
json (~> 1.4)
|
63
|
-
regressiontest (0.0.
|
63
|
+
regressiontest (0.0.3)
|
64
64
|
rspec (2.14.1)
|
65
65
|
rspec-core (~> 2.14.0)
|
66
66
|
rspec-expectations (~> 2.14.0)
|
@@ -76,6 +76,6 @@ PLATFORMS
|
|
76
76
|
|
77
77
|
DEPENDENCIES
|
78
78
|
cucumber
|
79
|
-
jeweler
|
80
|
-
regressiontest
|
79
|
+
jeweler (~> 2.0.1)
|
80
|
+
regressiontest (~> 0.0.3)
|
81
81
|
rspec
|
data/README.md
CHANGED
@@ -2,14 +2,27 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://secure.travis-ci.org/pjotrp/bioruby-vcf.png)](http://travis-ci.org/pjotrp/bioruby-vcf)
|
4
4
|
|
5
|
-
|
6
|
-
it also comes with a really nice filtering, evaluation and
|
7
|
-
language.
|
5
|
+
A new generation VCF parser. Bio-vcf is not only fast for genome-wide
|
6
|
+
(WGS) data, it also comes with a really nice filtering, evaluation and
|
7
|
+
rewrite language. Why would you use bio-vcf over other parsers?
|
8
|
+
|
9
|
+
1. Bio-vcf is fast and scales on multi-core computers
|
10
|
+
2. Bio-vcf has an expressive filtering and evaluation language
|
11
|
+
3. Bio-vcf has great multi-sample support
|
12
|
+
4. Bio-vcf has multiple global filters and sample filters
|
13
|
+
5. Bio-vcf can access any VCF format
|
14
|
+
6. Bio-vcf can do calculations on fields
|
15
|
+
7. Bio-vcf allows for genotype processing
|
16
|
+
8. Bio-vcf has support for set analysis
|
17
|
+
9. Bio-vcf has sane error handling
|
18
|
+
10. Bio-vcf can output tabular data, HTML, LaTeX, RDF and (soon) JSON
|
19
|
+
|
20
|
+
Bio-vcf has better performance than other tools
|
8
21
|
because of lazy parsing, multi-threading, and useful combinations of
|
9
|
-
(fancy) command line filtering. For example on an 2 core machine
|
10
|
-
bio-vcf is 50% faster than SnpSift. On an 8 core machine
|
11
|
-
3x faster than SnpSift. Parsing a 1 Gb ESP
|
12
|
-
bio-vcf takes
|
22
|
+
(fancy) command line filtering. For example on a 2 core machine
|
23
|
+
bio-vcf is typically 50% faster than JVM based SnpSift. On an 8 core machine
|
24
|
+
bio-vcf is at least 3x faster than SnpSift. Parsing a 1 Gb ESP
|
25
|
+
VCF with 8 cores with bio-vcf takes
|
13
26
|
|
14
27
|
```sh
|
15
28
|
time ./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp>0.3' < ESP6500SI_V2_SSA137.vcf > test1.vcf
|
@@ -18,7 +31,7 @@ bio-vcf takes
|
|
18
31
|
sys 0m7.852s
|
19
32
|
```
|
20
33
|
|
21
|
-
|
34
|
+
while parsing with SnpSift takes
|
22
35
|
|
23
36
|
```sh
|
24
37
|
time cat ESP6500SI_V2_SSA137.vcf |java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > test.vcf
|
@@ -32,22 +45,22 @@ Illumina Hiseq VCF file and evaluating the results into a BED format on
|
|
32
45
|
a 16 core machine takes
|
33
46
|
|
34
47
|
```sh
|
35
|
-
time bio-vcf --num-threads
|
48
|
+
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
36
49
|
real 0m47.612s
|
37
50
|
user 8m18.234s
|
38
51
|
sys 0m5.039s
|
39
52
|
```
|
40
53
|
|
41
|
-
which shows
|
54
|
+
which shows pretty decent core utilisation (10x). We are running
|
55
|
+
gzip compressed VCF files of 30+ Gb with similar performance gains.
|
42
56
|
|
43
57
|
Use zcat to
|
44
|
-
pipe gzipped (vcf.gz) files into bio-vcf, e.g.
|
58
|
+
pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
|
45
59
|
|
46
60
|
```sh
|
47
61
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
48
62
|
--sfilter '!s.empty? and s.dp>20'
|
49
63
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
50
|
-
|
51
64
|
```
|
52
65
|
|
53
66
|
bio-vcf comes with a sensible parser definition language (it is 100%
|
@@ -184,6 +197,13 @@ commands exit for filtering and eval. When a set is defined, based on
|
|
184
197
|
the sample name, you can apply filters on the samples inside the set,
|
185
198
|
outside the set and over all samples. E.g.
|
186
199
|
|
200
|
+
So, why would you use bio-vcf instead of rolling out your own
|
201
|
+
Perl/Python/other ad-hoc script? I think the reason should be that
|
202
|
+
there is less chance of mistakes because of Bio-vcf's clear filtering
|
203
|
+
language and sensible built-in validation. The second reason would be
|
204
|
+
speed. Bio-vcf's multi-threading capability gives it great and hard to
|
205
|
+
replicate performance.
|
206
|
+
|
187
207
|
Also note you can use
|
188
208
|
[bio-table](https://github.com/pjotrp/bioruby-table) to
|
189
209
|
filter/transform data further and convert to other formats, such as
|
@@ -202,7 +222,7 @@ example of a VCF statement you need to work on.
|
|
202
222
|
|
203
223
|
## Installation
|
204
224
|
|
205
|
-
Note that you need Ruby
|
225
|
+
Note that you need Ruby 2.x or later. The 2.x Ruby series also gives
|
206
226
|
a performance improvement. Bio-vcf will show the Ruby version when
|
207
227
|
typing the command 'bio-vcf -h'.
|
208
228
|
|
@@ -371,7 +391,7 @@ And even better because of Ruby magic
|
|
371
391
|
Note that only valid method names in lower case get picked up this
|
372
392
|
way. Also by convention normal is sample 1 and tumor is sample 2.
|
373
393
|
|
374
|
-
Even shorter r is an alias for rec
|
394
|
+
Even shorter r is an alias for rec
|
375
395
|
|
376
396
|
```sh
|
377
397
|
bio-vcf --eval "r.original.gt" < file.vcf
|
@@ -380,7 +400,8 @@ Even shorter r is an alias for rec (nyi)
|
|
380
400
|
|
381
401
|
## Special functions
|
382
402
|
|
383
|
-
Note: special functions are not yet implemented!
|
403
|
+
Note: special functions are not yet implemented! Look below
|
404
|
+
for genotype processing which has indexing in 'gti'.
|
384
405
|
|
385
406
|
Sometimes you want to use a special function in a filter. For
|
386
407
|
example percentage variant reads can be defined as [a,c,g,t]
|
@@ -440,7 +461,8 @@ example, samples are selected that evaluate to true, all others should
|
|
440
461
|
evaluate to false. For this we create three filters, one for all
|
441
462
|
samples that are included (the --ifilter or -if), for all samples that
|
442
463
|
are excluded (the --efilter or -ef) and for any sample (the --sfilter
|
443
|
-
or -sf). So i=include, e=exclude and s=any sample
|
464
|
+
or -sf). So i=include (OR filter), e=exclude and s=any sample (AND
|
465
|
+
filter).
|
444
466
|
|
445
467
|
The equivalent of the union filter is by using the --sfilter, so
|
446
468
|
|
@@ -448,15 +470,19 @@ The equivalent of the union filter is by using the --sfilter, so
|
|
448
470
|
bio-vcf --sfilter 's.dp>20'
|
449
471
|
```
|
450
472
|
|
451
|
-
Filters DP on all samples
|
473
|
+
Filters DP on all samples and is true if all samples match the
|
474
|
+
criterion (AND). To filter on a subset you can add a
|
452
475
|
selector
|
453
476
|
|
454
477
|
```sh
|
455
478
|
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
456
479
|
```
|
457
480
|
|
458
|
-
For set analysis there are the additional ifilter (include) and
|
459
|
-
|
481
|
+
For set analysis there are the additional ifilter (include) and
|
482
|
+
efilter (exclude). Where sfilter represents an ALL match, the ifilter
|
483
|
+
represents an ANY match, i.e., it is true if one of the samples
|
484
|
+
matches the criterion (OR). To filter on samples 0,1,4 and output the gq
|
485
|
+
values
|
460
486
|
|
461
487
|
```sh
|
462
488
|
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gq<10 or s.gq==99' --seval s.gq
|
@@ -494,8 +520,10 @@ To set an additional filter on the excluded samples:
|
|
494
520
|
```
|
495
521
|
|
496
522
|
Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
|
523
|
+
Currently the efilter is an ALL filter (AND), i.e. all excluded
|
524
|
+
samples need to match the criterion.
|
497
525
|
|
498
|
-
The following are not yet implemented:
|
526
|
+
The following regular expression matches are not yet implemented:
|
499
527
|
|
500
528
|
In the near future it will also be possible to select samples on a regex (here
|
501
529
|
select all samples where the name starts with s3)
|
@@ -560,6 +588,8 @@ and 'gts' as a nucleotide string array
|
|
560
588
|
1 15274 G G G G G G G
|
561
589
|
```
|
562
590
|
|
591
|
+
where gts represents the indexed genotype on [ref] + [alt].
|
592
|
+
|
563
593
|
These values can also be used in filters and output allele depth, for
|
564
594
|
example
|
565
595
|
|
@@ -570,12 +600,18 @@ example
|
|
570
600
|
1 13757 47 47 4 47 47 4 47
|
571
601
|
```
|
572
602
|
|
573
|
-
|
603
|
+
You can use the genotype index gti to fetch values from, for example,
|
604
|
+
allele depth:
|
574
605
|
|
575
606
|
```ruby
|
576
607
|
bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0' --seval 'rec.original.ad[s.gti[1]]'
|
608
|
+
|
609
|
+
1 10257 151 151 151 151 151 8 151
|
610
|
+
1 13302 26 10 10 10 10 10 10
|
611
|
+
1 13757 47 47 4 47 47 4 47
|
577
612
|
```
|
578
613
|
|
614
|
+
|
579
615
|
## Modify VCF files
|
580
616
|
|
581
617
|
Add or modify the sample file name in the INFO fields:
|
@@ -584,7 +620,7 @@ Add or modify the sample file name in the INFO fields:
|
|
584
620
|
bio-vcf --rewrite 'rec.info["sample"]="mytest"' < mytest.vcf
|
585
621
|
```
|
586
622
|
|
587
|
-
To remove/select 3 samples
|
623
|
+
To remove/select 3 samples:
|
588
624
|
|
589
625
|
```sh
|
590
626
|
bio-vcf --samples 0,1,3 < mytest.vcf
|
@@ -614,11 +650,50 @@ bio-vcf --id evs --filter 'r.info.maf[0]<5.0' --rdf --tags '{"db:evs" => true, "
|
|
614
650
|
Similarly for GoNL
|
615
651
|
|
616
652
|
```ruby
|
617
|
-
bio-vcf --id gonl --rdf --tags '{"db:
|
653
|
+
bio-vcf --id gonl --rdf --tags '{"db:gonl" => true, "seq:freq" => rec.info.af }' < GoNL.vcf
|
618
654
|
```
|
619
655
|
|
656
|
+
or without AF
|
657
|
+
|
658
|
+
|
659
|
+
```ruby
|
660
|
+
bio-vcf --id gonl --rdf --tags '{"db:gonl" => true, "seq:freq" => (rec.info.ac.to_f/rec.info.an).round(2) }' < gonl_germline_overlap_r4.vcf
|
661
|
+
```
|
662
|
+
|
663
|
+
|
664
|
+
|
620
665
|
Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert tabular data to RDF.
|
621
666
|
|
667
|
+
## Statistics
|
668
|
+
|
669
|
+
Simple statistics are available for REF>ALT changes:
|
670
|
+
|
671
|
+
```sh
|
672
|
+
./bin/bio-vcf -v --statistics < test/data/input/dbsnp.vcf
|
673
|
+
```
|
674
|
+
|
675
|
+
## ==== Statistics ==================================
|
676
|
+
G>A 59 45%
|
677
|
+
C>T 30 23%
|
678
|
+
A>G 5 4%
|
679
|
+
C>G 5 4%
|
680
|
+
C>A 5 4%
|
681
|
+
G>T 4 3%
|
682
|
+
T>C 4 3%
|
683
|
+
G>C 4 3%
|
684
|
+
T>A 3 2%
|
685
|
+
A>C 3 2%
|
686
|
+
A>T 2 2%
|
687
|
+
GTCCGACCGCTCC>G 1 1%
|
688
|
+
CGACCGCTCC>C 1 1%
|
689
|
+
T>TGGAGC 1 1%
|
690
|
+
C>CGTCTTCA 1 1%
|
691
|
+
TG>T 1 1%
|
692
|
+
AC>A 1 1%
|
693
|
+
|
694
|
+
Total 130
|
695
|
+
## ==================================================
|
696
|
+
|
622
697
|
## Other examples
|
623
698
|
|
624
699
|
For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
|
@@ -654,6 +729,9 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
654
729
|
|
655
730
|
## Trouble shooting
|
656
731
|
|
732
|
+
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
733
|
+
in single threaded mode (for now).
|
734
|
+
|
657
735
|
The multi-threading creates temporary files using the system TMPDIR.
|
658
736
|
This behaviour can be overridden by setting the environment variable.
|
659
737
|
Also, for genome-wide sequencing it may be useful to increase
|
data/Rakefile
CHANGED
@@ -17,8 +17,8 @@ Jeweler::Tasks.new do |gem|
|
|
17
17
|
gem.name = "bio-vcf"
|
18
18
|
gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
19
19
|
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{VCF parser}
|
21
|
-
gem.description = %Q{Smart parser for VCF format}
|
20
|
+
gem.summary = %Q{Fast multi-threaded VCF parser}
|
21
|
+
gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
|
22
22
|
gem.email = "pjotr.public01@thebird.nl"
|
23
23
|
gem.authors = ["Pjotr Prins"]
|
24
24
|
# dependencies defined in Gemfile
|
@@ -47,6 +47,8 @@ Cucumber::Rake::Task.new(:features)
|
|
47
47
|
|
48
48
|
task :default => :features
|
49
49
|
|
50
|
+
task :test => [ :features ]
|
51
|
+
|
50
52
|
require 'rdoc/task'
|
51
53
|
Rake::RDocTask.new do |rdoc|
|
52
54
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.7.0
|
1
|
+
0.7.3
|
data/bin/bio-vcf
CHANGED
@@ -26,7 +26,7 @@ require 'tempfile'
|
|
26
26
|
# Bio::Log::CLI.logger('stderr')
|
27
27
|
# Bio::Log::CLI.trace('info')
|
28
28
|
|
29
|
-
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 100_000 }
|
29
|
+
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 100_000, num_threads: 4 }
|
30
30
|
opts = OptionParser.new do |o|
|
31
31
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
32
32
|
|
@@ -40,7 +40,7 @@ opts = OptionParser.new do |o|
|
|
40
40
|
o.on('--sfilter cmd',String, 'Evaluate filter on each sample') do |cmd|
|
41
41
|
options[:sfilter] = cmd
|
42
42
|
end
|
43
|
-
o.on("--sfilter-samples list", Array, "Filter on selected samples") do |l|
|
43
|
+
o.on("--sfilter-samples list", Array, "Filter on selected samples (e.g., 0,1") do |l|
|
44
44
|
options[:sfilter_samples] = l
|
45
45
|
end
|
46
46
|
|
@@ -80,10 +80,10 @@ opts = OptionParser.new do |o|
|
|
80
80
|
options[:rdf] = true
|
81
81
|
options[:skip_header] = true
|
82
82
|
end
|
83
|
-
o.on("--num-threads [num]", Integer, "Multi-core version") do |i|
|
83
|
+
o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
|
84
84
|
options[:num_threads] = i
|
85
85
|
end
|
86
|
-
o.on("--thread-lines num", Integer, "Fork thread on num lines (default
|
86
|
+
o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
|
87
87
|
options[:thread_lines] = i
|
88
88
|
end
|
89
89
|
o.on_tail("--id name", String, "Identifier") do |s|
|
@@ -112,6 +112,10 @@ opts = OptionParser.new do |o|
|
|
112
112
|
# Bio::Log::CLI.trace(s)
|
113
113
|
# end
|
114
114
|
#
|
115
|
+
o.on("--statistics", "Output statistics") do |q|
|
116
|
+
options[:statistics] = true
|
117
|
+
options[:num_threads] = nil
|
118
|
+
end
|
115
119
|
o.on("-q", "--quiet", "Run quietly") do |q|
|
116
120
|
# Bio::Log::CLI.trace('error')
|
117
121
|
options[:quiet] = true
|
@@ -168,7 +172,7 @@ def parse_header line, samples, options
|
|
168
172
|
end
|
169
173
|
|
170
174
|
# Parse a VCF line
|
171
|
-
def parse_line line,header,options,samples
|
175
|
+
def parse_line line,header,options,samples,stats=nil
|
172
176
|
fields = VcfLine.parse(line)
|
173
177
|
rec = VcfRecord.new(fields,header)
|
174
178
|
r = rec # alias
|
@@ -189,26 +193,34 @@ def parse_line line,header,options,samples
|
|
189
193
|
|
190
194
|
# --------------------------
|
191
195
|
# Filtering and set analysis
|
192
|
-
return if filter and not rec.filter(filter,ignore_missing,quiet)
|
196
|
+
return if filter and not rec.filter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
|
193
197
|
|
194
198
|
if sfilter
|
195
199
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
196
|
-
return if not sample.sfilter(sfilter,ignore_missing,quiet)
|
200
|
+
return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
197
201
|
end
|
198
202
|
end
|
199
203
|
|
200
204
|
if ifilter
|
205
|
+
found = false
|
201
206
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
202
|
-
|
207
|
+
if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
208
|
+
found = true
|
209
|
+
break
|
210
|
+
end
|
203
211
|
end
|
212
|
+
# Skip if there are no matches
|
213
|
+
return if not found
|
204
214
|
end
|
205
215
|
|
206
216
|
if efilter
|
207
217
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
208
|
-
return if not sample.efilter(efilter,ignore_missing,quiet)
|
218
|
+
return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
209
219
|
end
|
210
220
|
end
|
211
221
|
|
222
|
+
stats.add(rec) if stats
|
223
|
+
|
212
224
|
# -----------------------------
|
213
225
|
# From here on decide on output
|
214
226
|
if samples
|
@@ -223,13 +235,13 @@ def parse_line line,header,options,samples
|
|
223
235
|
begin
|
224
236
|
results = nil # result string
|
225
237
|
if options[:eval]
|
226
|
-
res = rec.eval(options[:eval],ignore_missing,quiet)
|
238
|
+
res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
|
227
239
|
results = res if res
|
228
240
|
end
|
229
241
|
if seval
|
230
242
|
list = (results ? [] : [rec.chr,rec.pos])
|
231
243
|
rec.each_sample(options[:sfilter_samples]) { | sample |
|
232
|
-
list << sample.eval(seval,ignore_missing,quiet)
|
244
|
+
list << sample.eval(seval,ignore_missing_data: ignore_missing,quiet: quiet)
|
233
245
|
}
|
234
246
|
results = (results ? results.to_s + "\t" : "" ) + list.join("\t")
|
235
247
|
end
|
@@ -249,6 +261,8 @@ def parse_line line,header,options,samples
|
|
249
261
|
# Default behaviour prints VCF line, but rewrite info
|
250
262
|
eval(options[:rewrite])
|
251
263
|
print (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
264
|
+
elsif stats
|
265
|
+
# do nothing
|
252
266
|
else
|
253
267
|
# Default behaviour prints VCF line
|
254
268
|
$stdout.print fields.join("\t")+"\n"
|
@@ -261,18 +275,17 @@ end
|
|
261
275
|
# Collect a buffer of lines and feed them to a thread
|
262
276
|
# Returns the created pid, tempfilen and count_threads
|
263
277
|
# (Note: this function should be turned into a closure)
|
264
|
-
def parse_lines lines,header,options,samples,tempdir,count_threads
|
278
|
+
def parse_lines lines,header,options,samples,tempdir,count_threads,stats
|
265
279
|
pid = nil
|
266
280
|
threadfilen = nil
|
267
281
|
if options[:num_threads]
|
268
|
-
lines2 = lines.map { |l| l.clone }
|
269
282
|
count_threads += 1
|
270
283
|
threadfilen = tempdir+sprintf("/%0.6d-pid",count_threads)+'.bio-vcf'
|
271
284
|
pid = fork do
|
272
285
|
count_lines = 0
|
273
286
|
tempfn = threadfilen+'.running'
|
274
287
|
STDOUT.reopen(File.open(tempfn, 'w+'))
|
275
|
-
|
288
|
+
lines.each do | line |
|
276
289
|
count_lines +=1 if parse_line(line,header,options,samples)
|
277
290
|
end
|
278
291
|
STDOUT.flush
|
@@ -280,10 +293,9 @@ def parse_lines lines,header,options,samples,tempdir,count_threads
|
|
280
293
|
FileUtils::mv(tempfn,threadfilen)
|
281
294
|
exit 0
|
282
295
|
end
|
283
|
-
Process::detach(pid)
|
284
296
|
else
|
285
297
|
lines.each do | line |
|
286
|
-
parse_line line,header,options,samples
|
298
|
+
parse_line line,header,options,samples,stats
|
287
299
|
end
|
288
300
|
end
|
289
301
|
return pid,threadfilen,count_threads
|
@@ -293,12 +305,30 @@ end
|
|
293
305
|
def manage_thread_pool(workers, thread_list, num_threads)
|
294
306
|
while true
|
295
307
|
# ---- count running pids
|
296
|
-
running = thread_list.reduce(0)
|
308
|
+
running = thread_list.reduce(0) do | sum, thread_info |
|
309
|
+
if thread_info[0] && pid_running?(thread_info[0])
|
310
|
+
sum+1
|
311
|
+
elsif nil == thread_info[0] && File.exist?(thread_info[1]+'.running')
|
312
|
+
sum+1
|
313
|
+
else
|
314
|
+
sum
|
315
|
+
end
|
316
|
+
end
|
297
317
|
break if running < num_threads
|
298
318
|
sleep 0.1
|
299
319
|
end
|
300
320
|
end
|
301
321
|
|
322
|
+
def pid_running?(pid)
|
323
|
+
begin
|
324
|
+
fpid,status=Process.waitpid2(pid,Process::WNOHANG)
|
325
|
+
rescue Errno::ECHILD, Errno::ESRCH
|
326
|
+
return false
|
327
|
+
end
|
328
|
+
return true if nil == fpid && nil == status
|
329
|
+
return ! (status.exited? || status.signaled?)
|
330
|
+
end
|
331
|
+
|
302
332
|
opts.parse!(ARGV)
|
303
333
|
|
304
334
|
$stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
@@ -309,8 +339,23 @@ if options[:show_help]
|
|
309
339
|
exit 1
|
310
340
|
end
|
311
341
|
|
342
|
+
if RUBY_VERSION =~ /^1/
|
343
|
+
$stderr.print "WARNING: bio-vcf runs on Ruby 2.x only\n"
|
344
|
+
end
|
345
|
+
|
312
346
|
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
313
347
|
|
348
|
+
stats = nil
|
349
|
+
if options[:statistics]
|
350
|
+
options[:num_threads] = nil
|
351
|
+
stats = BioVcf::VcfStatistics.new
|
352
|
+
end
|
353
|
+
|
354
|
+
# Check for option combinations
|
355
|
+
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
356
|
+
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
357
|
+
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
358
|
+
|
314
359
|
if options[:samples]
|
315
360
|
samples = options[:samples].map { |s| s.to_i }
|
316
361
|
end
|
@@ -329,14 +374,15 @@ count_threads=0
|
|
329
374
|
|
330
375
|
orig_std_out = STDOUT.clone
|
331
376
|
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
377
|
+
begin
|
378
|
+
|
379
|
+
Dir::mktmpdir("bio-vcf_") do |tempdir|
|
380
|
+
$stderr.print "Using #{tempdir} for temporary files\n" if num_threads
|
381
|
+
|
382
|
+
# ---- Main loop
|
383
|
+
STDIN.each_line do | line |
|
384
|
+
line_number += 1
|
385
|
+
$stderr.print '.' if line_number % thread_lines == 0 and not options[:quiet]
|
340
386
|
# ---- In this section header information is handled
|
341
387
|
next if header_output_completed and line =~ /^#/
|
342
388
|
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
@@ -353,60 +399,74 @@ Dir::mktmpdir("bio-vcf_") do |tempdir|
|
|
353
399
|
lines << line
|
354
400
|
if lines.size > thread_lines
|
355
401
|
manage_thread_pool(workers,thread_list,num_threads) if options[:num_threads]
|
356
|
-
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
402
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads,stats)
|
357
403
|
count_threads = thread_list.last[2]
|
358
404
|
lines = []
|
359
405
|
end
|
360
|
-
rescue Exception => e
|
361
|
-
# $stderr.print line
|
362
|
-
$stderr.print e.message,"\n"
|
363
|
-
raise if options[:verbose]
|
364
|
-
exit 1
|
365
406
|
end
|
366
|
-
end
|
367
|
-
|
368
|
-
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
369
|
-
count_threads = thread_list.last[2]
|
370
407
|
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
408
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads,stats)
|
409
|
+
count_threads = thread_list.last[2]
|
410
|
+
|
411
|
+
# ---- In this section the output gets collected and printed on STDOUT
|
412
|
+
if options[:num_threads]
|
413
|
+
STDOUT.reopen(orig_std_out)
|
414
|
+
$stderr.print "Final pid=#{thread_list.last[0]}, size=#{lines.size}\n"
|
415
|
+
lines = []
|
416
|
+
|
417
|
+
fault = false
|
418
|
+
# Wait for the running threads to complete
|
419
|
+
thread_list.each do |info|
|
420
|
+
(pid,threadfn) = info
|
421
|
+
tempfn = threadfn + '.running'
|
422
|
+
timeout = 180
|
423
|
+
if (pid && !pid_running?(pid)) || fault
|
424
|
+
# no point to wait for a long time if we've failed one already or the proc is dead
|
425
|
+
timeout = 1
|
426
|
+
end
|
427
|
+
$stderr.print "Waiting up to #{timeout/60} minutes for pid=#{pid} to complete\n"
|
428
|
+
begin
|
429
|
+
Timeout.timeout(timeout) do
|
430
|
+
while not File.exist?(threadfn) # wait for the result to appear
|
431
|
+
sleep 0.2
|
432
|
+
end
|
387
433
|
end
|
434
|
+
# Thread file should have gone:
|
435
|
+
raise "FATAL: child process appears to have crashed #{tempfn}" if File.exist?(tempfn)
|
436
|
+
$stderr.print "OK pid=#{pid}\n"
|
437
|
+
rescue Timeout::Error
|
438
|
+
if pid_running?(pid)
|
439
|
+
Process.kill 9, pid
|
440
|
+
Process.wait pid
|
441
|
+
end
|
442
|
+
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}\n"
|
443
|
+
fault = true
|
388
444
|
end
|
389
|
-
# Thread file should have gone:
|
390
|
-
raise "FATAL: child process appears to have crashed #{tempfn}" if File.exist?(tempfn)
|
391
|
-
$stderr.print "OK pid=#{pid}\n"
|
392
|
-
rescue Timeout::Error
|
393
|
-
Process.kill 9, pid
|
394
|
-
Process.wait pid
|
395
|
-
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}\n"
|
396
|
-
fault = true
|
397
445
|
end
|
446
|
+
# Collate the output
|
447
|
+
thread_list.each do | info |
|
448
|
+
(pid,fn) = info
|
449
|
+
if !fault
|
450
|
+
# This should never happen
|
451
|
+
raise "FATAL: child process output #{fn} is missing" if not File.exist?(fn)
|
452
|
+
$stderr.print "Reading #{fn}\n"
|
453
|
+
File.new(fn).each_line { |buf|
|
454
|
+
print buf
|
455
|
+
}
|
456
|
+
File.unlink(fn)
|
457
|
+
end
|
458
|
+
Process.wait(pid) if pid && pid_running?(pid)
|
459
|
+
end
|
460
|
+
return 1 if fault
|
398
461
|
end
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
return 1 if fault
|
411
|
-
end
|
412
|
-
end # cleans up tempdir
|
462
|
+
end # cleans up tempdir
|
463
|
+
|
464
|
+
stats.print if stats
|
465
|
+
|
466
|
+
rescue Exception => e
|
467
|
+
# $stderr.print line
|
468
|
+
$stderr.print e.message,"\n"
|
469
|
+
raise if options[:verbose]
|
470
|
+
exit 1
|
471
|
+
end
|
472
|
+
|