bio-vcf 0.7.0 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -2
- data/Gemfile +2 -5
- data/Gemfile.lock +3 -3
- data/README.md +101 -23
- data/Rakefile +4 -2
- data/VERSION +1 -1
- data/bin/bio-vcf +133 -73
- data/bio-vcf.gemspec +13 -10
- data/features/cli.feature +9 -1
- data/features/multisample.feature +4 -4
- data/features/sfilter.feature +1 -1
- data/features/step_definitions/cli-feature.rb +4 -0
- data/features/step_definitions/multisample.rb +24 -12
- data/features/step_definitions/sfilter.rb +80 -31
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +45 -9
- data/lib/bio-vcf/vcfheader.rb +1 -1
- data/lib/bio-vcf/vcfrecord.rb +14 -8
- data/lib/bio-vcf/vcfsample.rb +101 -152
- data/lib/bio-vcf/vcfstatistics.rb +28 -0
- data/test/data/regression/ifilter_s.dp.ref +31 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -0
- metadata +16 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 72f63aae77e382c88e04cb07344ab3fce2a57232
|
4
|
+
data.tar.gz: 48f36f4d75d18edf3619124f0b679706a002b646
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1d1513454924e1d84bb9aecf1bd83cf0d8784e3ab8669bb7777129f2a8b33df53a2118d4fea15a825ce1c399de05cf729e0f7b5e7acd35367856a6a1821f328
|
7
|
+
data.tar.gz: 7b40ffdad49cbb690cfaa4d02e7a060095dc13aa4533f091f1f20b2f2b1903c71d2f6c6c5c5f672dc3be4da751305c2583265dbe7d86308501ab0219cca9e414
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -9,9 +9,6 @@ group :development do
|
|
9
9
|
# gem "minitest"
|
10
10
|
gem "rspec"
|
11
11
|
gem "cucumber"
|
12
|
-
gem "jeweler" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
-
|
14
|
-
# gem "bio", ">= 1.4.2"
|
15
|
-
# gem "rdoc", "~> 3.12"
|
16
|
-
gem "regressiontest"
|
12
|
+
gem "jeweler", "~> 2.0.1" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
+
gem "regressiontest", "~> 0.0.3"
|
17
14
|
end
|
data/Gemfile.lock
CHANGED
@@ -60,7 +60,7 @@ GEM
|
|
60
60
|
rake (10.1.1)
|
61
61
|
rdoc (4.1.1)
|
62
62
|
json (~> 1.4)
|
63
|
-
regressiontest (0.0.
|
63
|
+
regressiontest (0.0.3)
|
64
64
|
rspec (2.14.1)
|
65
65
|
rspec-core (~> 2.14.0)
|
66
66
|
rspec-expectations (~> 2.14.0)
|
@@ -76,6 +76,6 @@ PLATFORMS
|
|
76
76
|
|
77
77
|
DEPENDENCIES
|
78
78
|
cucumber
|
79
|
-
jeweler
|
80
|
-
regressiontest
|
79
|
+
jeweler (~> 2.0.1)
|
80
|
+
regressiontest (~> 0.0.3)
|
81
81
|
rspec
|
data/README.md
CHANGED
@@ -2,14 +2,27 @@
|
|
2
2
|
|
3
3
|
[](http://travis-ci.org/pjotrp/bioruby-vcf)
|
4
4
|
|
5
|
-
|
6
|
-
it also comes with a really nice filtering, evaluation and
|
7
|
-
language.
|
5
|
+
A new generation VCF parser. Bio-vcf is not only fast for genome-wide
|
6
|
+
(WGS) data, it also comes with a really nice filtering, evaluation and
|
7
|
+
rewrite language. Why would you use bio-vcf over other parsers?
|
8
|
+
|
9
|
+
1. Bio-vcf is fast and scales on multi-core computers
|
10
|
+
2. Bio-vcf has an expressive filtering and evaluation language
|
11
|
+
3. Bio-vcf has great multi-sample support
|
12
|
+
4. Bio-vcf has multiple global filters and sample filters
|
13
|
+
5. Bio-vcf can access any VCF format
|
14
|
+
6. Bio-vcf can do calculations on fields
|
15
|
+
7. Bio-vcf allows for genotype processing
|
16
|
+
8. Bio-vcf has support for set analysis
|
17
|
+
9. Bio-vcf has sane error handling
|
18
|
+
10. Bio-vcf can output tabular data, HTML, LaTeX, RDF and (soon) JSON
|
19
|
+
|
20
|
+
Bio-vcf has better performance than other tools
|
8
21
|
because of lazy parsing, multi-threading, and useful combinations of
|
9
|
-
(fancy) command line filtering. For example on an 2 core machine
|
10
|
-
bio-vcf is 50% faster than SnpSift. On an 8 core machine
|
11
|
-
3x faster than SnpSift. Parsing a 1 Gb ESP
|
12
|
-
bio-vcf takes
|
22
|
+
(fancy) command line filtering. For example on a 2 core machine
|
23
|
+
bio-vcf is typically 50% faster than JVM based SnpSift. On an 8 core machine
|
24
|
+
bio-vcf is at least 3x faster than SnpSift. Parsing a 1 Gb ESP
|
25
|
+
VCF with 8 cores with bio-vcf takes
|
13
26
|
|
14
27
|
```sh
|
15
28
|
time ./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp>0.3' < ESP6500SI_V2_SSA137.vcf > test1.vcf
|
@@ -18,7 +31,7 @@ bio-vcf takes
|
|
18
31
|
sys 0m7.852s
|
19
32
|
```
|
20
33
|
|
21
|
-
|
34
|
+
while parsing with SnpSift takes
|
22
35
|
|
23
36
|
```sh
|
24
37
|
time cat ESP6500SI_V2_SSA137.vcf |java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > test.vcf
|
@@ -32,22 +45,22 @@ Illumina Hiseq VCF file and evaluating the results into a BED format on
|
|
32
45
|
a 16 core machine takes
|
33
46
|
|
34
47
|
```sh
|
35
|
-
time bio-vcf --num-threads
|
48
|
+
time bio-vcf --num-threads 16 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
36
49
|
real 0m47.612s
|
37
50
|
user 8m18.234s
|
38
51
|
sys 0m5.039s
|
39
52
|
```
|
40
53
|
|
41
|
-
which shows
|
54
|
+
which shows pretty decent core utilisation (10x). We are running
|
55
|
+
gzip compressed VCF files of 30+ Gb with similar performance gains.
|
42
56
|
|
43
57
|
Use zcat to
|
44
|
-
pipe gzipped (vcf.gz) files into bio-vcf, e.g.
|
58
|
+
pipe such gzipped (vcf.gz) files into bio-vcf, e.g.
|
45
59
|
|
46
60
|
```sh
|
47
61
|
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
48
62
|
--sfilter '!s.empty? and s.dp>20'
|
49
63
|
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
50
|
-
|
51
64
|
```
|
52
65
|
|
53
66
|
bio-vcf comes with a sensible parser definition language (it is 100%
|
@@ -184,6 +197,13 @@ commands exit for filtering and eval. When a set is defined, based on
|
|
184
197
|
the sample name, you can apply filters on the samples inside the set,
|
185
198
|
outside the set and over all samples. E.g.
|
186
199
|
|
200
|
+
So, why would you use bio-vcf instead of rolling out your own
|
201
|
+
Perl/Python/other ad-hoc script? I think the reason should be that
|
202
|
+
there is less chance of mistakes because of Bio-vcf's clear filtering
|
203
|
+
language and sensible built-in validation. The second reason would be
|
204
|
+
speed. Bio-vcf's multi-threading capability gives it great and hard to
|
205
|
+
replicate performance.
|
206
|
+
|
187
207
|
Also note you can use
|
188
208
|
[bio-table](https://github.com/pjotrp/bioruby-table) to
|
189
209
|
filter/transform data further and convert to other formats, such as
|
@@ -202,7 +222,7 @@ example of a VCF statement you need to work on.
|
|
202
222
|
|
203
223
|
## Installation
|
204
224
|
|
205
|
-
Note that you need Ruby
|
225
|
+
Note that you need Ruby 2.x or later. The 2.x Ruby series also gives
|
206
226
|
a performance improvement. Bio-vcf will show the Ruby version when
|
207
227
|
typing the command 'bio-vcf -h'.
|
208
228
|
|
@@ -371,7 +391,7 @@ And even better because of Ruby magic
|
|
371
391
|
Note that only valid method names in lower case get picked up this
|
372
392
|
way. Also by convention normal is sample 1 and tumor is sample 2.
|
373
393
|
|
374
|
-
Even shorter r is an alias for rec
|
394
|
+
Even shorter r is an alias for rec
|
375
395
|
|
376
396
|
```sh
|
377
397
|
bio-vcf --eval "r.original.gt" < file.vcf
|
@@ -380,7 +400,8 @@ Even shorter r is an alias for rec (nyi)
|
|
380
400
|
|
381
401
|
## Special functions
|
382
402
|
|
383
|
-
Note: special functions are not yet implemented!
|
403
|
+
Note: special functions are not yet implemented! Look below
|
404
|
+
for genotype processing which has indexing in 'gti'.
|
384
405
|
|
385
406
|
Sometimes you want to use a special function in a filter. For
|
386
407
|
example percentage variant reads can be defined as [a,c,g,t]
|
@@ -440,7 +461,8 @@ example, samples are selected that evaluate to true, all others should
|
|
440
461
|
evaluate to false. For this we create three filters, one for all
|
441
462
|
samples that are included (the --ifilter or -if), for all samples that
|
442
463
|
are excluded (the --efilter or -ef) and for any sample (the --sfilter
|
443
|
-
or -sf). So i=include, e=exclude and s=any sample
|
464
|
+
or -sf). So i=include (OR filter), e=exclude and s=any sample (AND
|
465
|
+
filter).
|
444
466
|
|
445
467
|
The equivalent of the union filter is by using the --sfilter, so
|
446
468
|
|
@@ -448,15 +470,19 @@ The equivalent of the union filter is by using the --sfilter, so
|
|
448
470
|
bio-vcf --sfilter 's.dp>20'
|
449
471
|
```
|
450
472
|
|
451
|
-
Filters DP on all samples
|
473
|
+
Filters DP on all samples and is true if all samples match the
|
474
|
+
criterion (AND). To filter on a subset you can add a
|
452
475
|
selector
|
453
476
|
|
454
477
|
```sh
|
455
478
|
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
456
479
|
```
|
457
480
|
|
458
|
-
For set analysis there are the additional ifilter (include) and
|
459
|
-
|
481
|
+
For set analysis there are the additional ifilter (include) and
|
482
|
+
efilter (exclude). Where sfilter represents an ALL match, the ifilter
|
483
|
+
represents an ANY match, i.e., it is true if one of the samples
|
484
|
+
matches the criterion (OR). To filter on samples 0,1,4 and output the gq
|
485
|
+
values
|
460
486
|
|
461
487
|
```sh
|
462
488
|
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gq<10 or s.gq==99' --seval s.gq
|
@@ -494,8 +520,10 @@ To set an additional filter on the excluded samples:
|
|
494
520
|
```
|
495
521
|
|
496
522
|
Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
|
523
|
+
Currently the efilter is an ALL filter (AND), i.e. all excluded
|
524
|
+
samples need to match the criterion.
|
497
525
|
|
498
|
-
The following are not yet implemented:
|
526
|
+
The following regular expression matches are not yet implemented:
|
499
527
|
|
500
528
|
In the near future it is also possible to select samples on a regex (here
|
501
529
|
select all samples where the name starts with s3)
|
@@ -560,6 +588,8 @@ and 'gts' as a nucleotide string array
|
|
560
588
|
1 15274 G G G G G G G
|
561
589
|
```
|
562
590
|
|
591
|
+
where gts represents the indexed genotype on [ref] + [alt].
|
592
|
+
|
563
593
|
These values can also be used in filters and output allele depth, for
|
564
594
|
example
|
565
595
|
|
@@ -570,12 +600,18 @@ example
|
|
570
600
|
1 13757 47 47 4 47 47 4 47
|
571
601
|
```
|
572
602
|
|
573
|
-
|
603
|
+
You can use the genotype index gti to fetch values from, for example,
|
604
|
+
allele depth:
|
574
605
|
|
575
606
|
```ruby
|
576
607
|
bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0' --seval 'rec.original.ad[s.gti[1]]'
|
608
|
+
|
609
|
+
1 10257 151 151 151 151 151 8 151
|
610
|
+
1 13302 26 10 10 10 10 10 10
|
611
|
+
1 13757 47 47 4 47 47 4 47
|
577
612
|
```
|
578
613
|
|
614
|
+
|
579
615
|
## Modify VCF files
|
580
616
|
|
581
617
|
Add or modify the sample file name in the INFO fields:
|
@@ -584,7 +620,7 @@ Add or modify the sample file name in the INFO fields:
|
|
584
620
|
bio-vcf --rewrite 'rec.info["sample"]="mytest"' < mytest.vcf
|
585
621
|
```
|
586
622
|
|
587
|
-
To remove/select 3 samples
|
623
|
+
To remove/select 3 samples:
|
588
624
|
|
589
625
|
```sh
|
590
626
|
bio-vcf --samples 0,1,3 < mytest.vcf
|
@@ -614,11 +650,50 @@ bio-vcf --id evs --filter 'r.info.maf[0]<5.0' --rdf --tags '{"db:evs" => true, "
|
|
614
650
|
Similarly for GoNL
|
615
651
|
|
616
652
|
```ruby
|
617
|
-
bio-vcf --id gonl --rdf --tags '{"db:
|
653
|
+
bio-vcf --id gonl --rdf --tags '{"db:gonl" => true, "seq:freq" => rec.info.af }' < GoNL.vcf
|
618
654
|
```
|
619
655
|
|
656
|
+
or without AF
|
657
|
+
|
658
|
+
|
659
|
+
```ruby
|
660
|
+
bio-vcf --id gonl --rdf --tags '{"db:gonl" => true, "seq:freq" => (rec.info.ac.to_f/rec.info.an).round(2) }' < gonl_germline_overlap_r4.vcf
|
661
|
+
```
|
662
|
+
|
663
|
+
|
664
|
+
|
620
665
|
Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert tabular data to RDF.
|
621
666
|
|
667
|
+
## Statistics
|
668
|
+
|
669
|
+
Simple statistics are available for REF>ALT changes:
|
670
|
+
|
671
|
+
```sh
|
672
|
+
./bin/bio-vcf -v --statistics < test/data/input/dbsnp.vcf
|
673
|
+
```
|
674
|
+
|
675
|
+
## ==== Statistics ==================================
|
676
|
+
G>A 59 45%
|
677
|
+
C>T 30 23%
|
678
|
+
A>G 5 4%
|
679
|
+
C>G 5 4%
|
680
|
+
C>A 5 4%
|
681
|
+
G>T 4 3%
|
682
|
+
T>C 4 3%
|
683
|
+
G>C 4 3%
|
684
|
+
T>A 3 2%
|
685
|
+
A>C 3 2%
|
686
|
+
A>T 2 2%
|
687
|
+
GTCCGACCGCTCC>G 1 1%
|
688
|
+
CGACCGCTCC>C 1 1%
|
689
|
+
T>TGGAGC 1 1%
|
690
|
+
C>CGTCTTCA 1 1%
|
691
|
+
TG>T 1 1%
|
692
|
+
AC>A 1 1%
|
693
|
+
|
694
|
+
Total 130
|
695
|
+
## ==================================================
|
696
|
+
|
622
697
|
## Other examples
|
623
698
|
|
624
699
|
For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
|
@@ -654,6 +729,9 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
654
729
|
|
655
730
|
## Trouble shooting
|
656
731
|
|
732
|
+
Note that Ruby 2.x is required for Bio-vcf. JRuby works, but only
|
733
|
+
in single threaded mode (for now).
|
734
|
+
|
657
735
|
The multi-threading creates temporary files using the system TMPDIR.
|
658
736
|
This behaviour can be overridden by setting the environment variable.
|
659
737
|
Also, for genome-wide sequencing it may be useful to increase
|
data/Rakefile
CHANGED
@@ -17,8 +17,8 @@ Jeweler::Tasks.new do |gem|
|
|
17
17
|
gem.name = "bio-vcf"
|
18
18
|
gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
19
19
|
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{VCF parser}
|
21
|
-
gem.description = %Q{Smart parser for VCF format}
|
20
|
+
gem.summary = %Q{Fast multi-threaded VCF parser}
|
21
|
+
gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
|
22
22
|
gem.email = "pjotr.public01@thebird.nl"
|
23
23
|
gem.authors = ["Pjotr Prins"]
|
24
24
|
# dependencies defined in Gemfile
|
@@ -47,6 +47,8 @@ Cucumber::Rake::Task.new(:features)
|
|
47
47
|
|
48
48
|
task :default => :features
|
49
49
|
|
50
|
+
task :test => [ :features ]
|
51
|
+
|
50
52
|
require 'rdoc/task'
|
51
53
|
Rake::RDocTask.new do |rdoc|
|
52
54
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.7.
|
1
|
+
0.7.3
|
data/bin/bio-vcf
CHANGED
@@ -26,7 +26,7 @@ require 'tempfile'
|
|
26
26
|
# Bio::Log::CLI.logger('stderr')
|
27
27
|
# Bio::Log::CLI.trace('info')
|
28
28
|
|
29
|
-
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 100_000 }
|
29
|
+
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 100_000, num_threads: 4 }
|
30
30
|
opts = OptionParser.new do |o|
|
31
31
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
32
32
|
|
@@ -40,7 +40,7 @@ opts = OptionParser.new do |o|
|
|
40
40
|
o.on('--sfilter cmd',String, 'Evaluate filter on each sample') do |cmd|
|
41
41
|
options[:sfilter] = cmd
|
42
42
|
end
|
43
|
-
o.on("--sfilter-samples list", Array, "Filter on selected samples") do |l|
|
43
|
+
o.on("--sfilter-samples list", Array, "Filter on selected samples (e.g., 0,1") do |l|
|
44
44
|
options[:sfilter_samples] = l
|
45
45
|
end
|
46
46
|
|
@@ -80,10 +80,10 @@ opts = OptionParser.new do |o|
|
|
80
80
|
options[:rdf] = true
|
81
81
|
options[:skip_header] = true
|
82
82
|
end
|
83
|
-
o.on("--num-threads [num]", Integer, "Multi-core version") do |i|
|
83
|
+
o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
|
84
84
|
options[:num_threads] = i
|
85
85
|
end
|
86
|
-
o.on("--thread-lines num", Integer, "Fork thread on num lines (default
|
86
|
+
o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
|
87
87
|
options[:thread_lines] = i
|
88
88
|
end
|
89
89
|
o.on_tail("--id name", String, "Identifier") do |s|
|
@@ -112,6 +112,10 @@ opts = OptionParser.new do |o|
|
|
112
112
|
# Bio::Log::CLI.trace(s)
|
113
113
|
# end
|
114
114
|
#
|
115
|
+
o.on("--statistics", "Output statistics") do |q|
|
116
|
+
options[:statistics] = true
|
117
|
+
options[:num_threads] = nil
|
118
|
+
end
|
115
119
|
o.on("-q", "--quiet", "Run quietly") do |q|
|
116
120
|
# Bio::Log::CLI.trace('error')
|
117
121
|
options[:quiet] = true
|
@@ -168,7 +172,7 @@ def parse_header line, samples, options
|
|
168
172
|
end
|
169
173
|
|
170
174
|
# Parse a VCF line
|
171
|
-
def parse_line line,header,options,samples
|
175
|
+
def parse_line line,header,options,samples,stats=nil
|
172
176
|
fields = VcfLine.parse(line)
|
173
177
|
rec = VcfRecord.new(fields,header)
|
174
178
|
r = rec # alias
|
@@ -189,26 +193,34 @@ def parse_line line,header,options,samples
|
|
189
193
|
|
190
194
|
# --------------------------
|
191
195
|
# Filtering and set analysis
|
192
|
-
return if filter and not rec.filter(filter,ignore_missing,quiet)
|
196
|
+
return if filter and not rec.filter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
|
193
197
|
|
194
198
|
if sfilter
|
195
199
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
196
|
-
return if not sample.sfilter(sfilter,ignore_missing,quiet)
|
200
|
+
return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
197
201
|
end
|
198
202
|
end
|
199
203
|
|
200
204
|
if ifilter
|
205
|
+
found = false
|
201
206
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
202
|
-
|
207
|
+
if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
208
|
+
found = true
|
209
|
+
break
|
210
|
+
end
|
203
211
|
end
|
212
|
+
# Skip if there are no matches
|
213
|
+
return if not found
|
204
214
|
end
|
205
215
|
|
206
216
|
if efilter
|
207
217
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
208
|
-
return if not sample.efilter(efilter,ignore_missing,quiet)
|
218
|
+
return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
209
219
|
end
|
210
220
|
end
|
211
221
|
|
222
|
+
stats.add(rec) if stats
|
223
|
+
|
212
224
|
# -----------------------------
|
213
225
|
# From here on decide on output
|
214
226
|
if samples
|
@@ -223,13 +235,13 @@ def parse_line line,header,options,samples
|
|
223
235
|
begin
|
224
236
|
results = nil # result string
|
225
237
|
if options[:eval]
|
226
|
-
res = rec.eval(options[:eval],ignore_missing,quiet)
|
238
|
+
res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
|
227
239
|
results = res if res
|
228
240
|
end
|
229
241
|
if seval
|
230
242
|
list = (results ? [] : [rec.chr,rec.pos])
|
231
243
|
rec.each_sample(options[:sfilter_samples]) { | sample |
|
232
|
-
list << sample.eval(seval,ignore_missing,quiet)
|
244
|
+
list << sample.eval(seval,ignore_missing_data: ignore_missing,quiet: quiet)
|
233
245
|
}
|
234
246
|
results = (results ? results.to_s + "\t" : "" ) + list.join("\t")
|
235
247
|
end
|
@@ -249,6 +261,8 @@ def parse_line line,header,options,samples
|
|
249
261
|
# Default behaviour prints VCF line, but rewrite info
|
250
262
|
eval(options[:rewrite])
|
251
263
|
print (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
264
|
+
elsif stats
|
265
|
+
# do nothing
|
252
266
|
else
|
253
267
|
# Default behaviour prints VCF line
|
254
268
|
$stdout.print fields.join("\t")+"\n"
|
@@ -261,18 +275,17 @@ end
|
|
261
275
|
# Collect a buffer of lines and feed them to a thread
|
262
276
|
# Returns the created pid, tempfilen and count_threads
|
263
277
|
# (Note: this function should be turned into a closure)
|
264
|
-
def parse_lines lines,header,options,samples,tempdir,count_threads
|
278
|
+
def parse_lines lines,header,options,samples,tempdir,count_threads,stats
|
265
279
|
pid = nil
|
266
280
|
threadfilen = nil
|
267
281
|
if options[:num_threads]
|
268
|
-
lines2 = lines.map { |l| l.clone }
|
269
282
|
count_threads += 1
|
270
283
|
threadfilen = tempdir+sprintf("/%0.6d-pid",count_threads)+'.bio-vcf'
|
271
284
|
pid = fork do
|
272
285
|
count_lines = 0
|
273
286
|
tempfn = threadfilen+'.running'
|
274
287
|
STDOUT.reopen(File.open(tempfn, 'w+'))
|
275
|
-
|
288
|
+
lines.each do | line |
|
276
289
|
count_lines +=1 if parse_line(line,header,options,samples)
|
277
290
|
end
|
278
291
|
STDOUT.flush
|
@@ -280,10 +293,9 @@ def parse_lines lines,header,options,samples,tempdir,count_threads
|
|
280
293
|
FileUtils::mv(tempfn,threadfilen)
|
281
294
|
exit 0
|
282
295
|
end
|
283
|
-
Process::detach(pid)
|
284
296
|
else
|
285
297
|
lines.each do | line |
|
286
|
-
parse_line line,header,options,samples
|
298
|
+
parse_line line,header,options,samples,stats
|
287
299
|
end
|
288
300
|
end
|
289
301
|
return pid,threadfilen,count_threads
|
@@ -293,12 +305,30 @@ end
|
|
293
305
|
def manage_thread_pool(workers, thread_list, num_threads)
|
294
306
|
while true
|
295
307
|
# ---- count running pids
|
296
|
-
running = thread_list.reduce(0)
|
308
|
+
running = thread_list.reduce(0) do | sum, thread_info |
|
309
|
+
if thread_info[0] && pid_running?(thread_info[0])
|
310
|
+
sum+1
|
311
|
+
elsif nil == thread_info[0] && File.exist?(thread_info[1]+'.running')
|
312
|
+
sum+1
|
313
|
+
else
|
314
|
+
sum
|
315
|
+
end
|
316
|
+
end
|
297
317
|
break if running < num_threads
|
298
318
|
sleep 0.1
|
299
319
|
end
|
300
320
|
end
|
301
321
|
|
322
|
+
def pid_running?(pid)
|
323
|
+
begin
|
324
|
+
fpid,status=Process.waitpid2(pid,Process::WNOHANG)
|
325
|
+
rescue Errno::ECHILD, Errno::ESRCH
|
326
|
+
return false
|
327
|
+
end
|
328
|
+
return true if nil == fpid && nil == status
|
329
|
+
return ! (status.exited? || status.signaled?)
|
330
|
+
end
|
331
|
+
|
302
332
|
opts.parse!(ARGV)
|
303
333
|
|
304
334
|
$stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
@@ -309,8 +339,23 @@ if options[:show_help]
|
|
309
339
|
exit 1
|
310
340
|
end
|
311
341
|
|
342
|
+
if RUBY_VERSION =~ /^1/
|
343
|
+
$stderr.print "WARNING: bio-vcf runs on Ruby 2.x only\n"
|
344
|
+
end
|
345
|
+
|
312
346
|
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
313
347
|
|
348
|
+
stats = nil
|
349
|
+
if options[:statistics]
|
350
|
+
options[:num_threads] = nil
|
351
|
+
stats = BioVcf::VcfStatistics.new
|
352
|
+
end
|
353
|
+
|
354
|
+
# Check for option combinations
|
355
|
+
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
356
|
+
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
357
|
+
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
358
|
+
|
314
359
|
if options[:samples]
|
315
360
|
samples = options[:samples].map { |s| s.to_i }
|
316
361
|
end
|
@@ -329,14 +374,15 @@ count_threads=0
|
|
329
374
|
|
330
375
|
orig_std_out = STDOUT.clone
|
331
376
|
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
377
|
+
begin
|
378
|
+
|
379
|
+
Dir::mktmpdir("bio-vcf_") do |tempdir|
|
380
|
+
$stderr.print "Using #{tempdir} for temporary files\n" if num_threads
|
381
|
+
|
382
|
+
# ---- Main loop
|
383
|
+
STDIN.each_line do | line |
|
384
|
+
line_number += 1
|
385
|
+
$stderr.print '.' if line_number % thread_lines == 0 and not options[:quiet]
|
340
386
|
# ---- In this section header information is handled
|
341
387
|
next if header_output_completed and line =~ /^#/
|
342
388
|
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
@@ -353,60 +399,74 @@ Dir::mktmpdir("bio-vcf_") do |tempdir|
|
|
353
399
|
lines << line
|
354
400
|
if lines.size > thread_lines
|
355
401
|
manage_thread_pool(workers,thread_list,num_threads) if options[:num_threads]
|
356
|
-
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
402
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads,stats)
|
357
403
|
count_threads = thread_list.last[2]
|
358
404
|
lines = []
|
359
405
|
end
|
360
|
-
rescue Exception => e
|
361
|
-
# $stderr.print line
|
362
|
-
$stderr.print e.message,"\n"
|
363
|
-
raise if options[:verbose]
|
364
|
-
exit 1
|
365
406
|
end
|
366
|
-
end
|
367
|
-
|
368
|
-
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
369
|
-
count_threads = thread_list.last[2]
|
370
407
|
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
408
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads,stats)
|
409
|
+
count_threads = thread_list.last[2]
|
410
|
+
|
411
|
+
# ---- In this section the output gets collected and printed on STDOUT
|
412
|
+
if options[:num_threads]
|
413
|
+
STDOUT.reopen(orig_std_out)
|
414
|
+
$stderr.print "Final pid=#{thread_list.last[0]}, size=#{lines.size}\n"
|
415
|
+
lines = []
|
416
|
+
|
417
|
+
fault = false
|
418
|
+
# Wait for the running threads to complete
|
419
|
+
thread_list.each do |info|
|
420
|
+
(pid,threadfn) = info
|
421
|
+
tempfn = threadfn + '.running'
|
422
|
+
timeout = 180
|
423
|
+
if (pid && !pid_running?(pid)) || fault
|
424
|
+
# no point to wait for a long time if we've failed one already or the proc is dead
|
425
|
+
timeout = 1
|
426
|
+
end
|
427
|
+
$stderr.print "Waiting up to #{timeout/60} minutes for pid=#{pid} to complete\n"
|
428
|
+
begin
|
429
|
+
Timeout.timeout(timeout) do
|
430
|
+
while not File.exist?(threadfn) # wait for the result to appear
|
431
|
+
sleep 0.2
|
432
|
+
end
|
387
433
|
end
|
434
|
+
# Thread file should have gone:
|
435
|
+
raise "FATAL: child process appears to have crashed #{tempfn}" if File.exist?(tempfn)
|
436
|
+
$stderr.print "OK pid=#{pid}\n"
|
437
|
+
rescue Timeout::Error
|
438
|
+
if pid_running?(pid)
|
439
|
+
Process.kill 9, pid
|
440
|
+
Process.wait pid
|
441
|
+
end
|
442
|
+
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}\n"
|
443
|
+
fault = true
|
388
444
|
end
|
389
|
-
# Thread file should have gone:
|
390
|
-
raise "FATAL: child process appears to have crashed #{tempfn}" if File.exist?(tempfn)
|
391
|
-
$stderr.print "OK pid=#{pid}\n"
|
392
|
-
rescue Timeout::Error
|
393
|
-
Process.kill 9, pid
|
394
|
-
Process.wait pid
|
395
|
-
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}\n"
|
396
|
-
fault = true
|
397
445
|
end
|
446
|
+
# Collate the output
|
447
|
+
thread_list.each do | info |
|
448
|
+
(pid,fn) = info
|
449
|
+
if !fault
|
450
|
+
# This should never happen
|
451
|
+
raise "FATAL: child process output #{fn} is missing" if not File.exist?(fn)
|
452
|
+
$stderr.print "Reading #{fn}\n"
|
453
|
+
File.new(fn).each_line { |buf|
|
454
|
+
print buf
|
455
|
+
}
|
456
|
+
File.unlink(fn)
|
457
|
+
end
|
458
|
+
Process.wait(pid) if pid && pid_running?(pid)
|
459
|
+
end
|
460
|
+
return 1 if fault
|
398
461
|
end
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
return 1 if fault
|
411
|
-
end
|
412
|
-
end # cleans up tempdir
|
462
|
+
end # cleans up tempdir
|
463
|
+
|
464
|
+
stats.print if stats
|
465
|
+
|
466
|
+
rescue Exception => e
|
467
|
+
# $stderr.print line
|
468
|
+
$stderr.print e.message,"\n"
|
469
|
+
raise if options[:verbose]
|
470
|
+
exit 1
|
471
|
+
end
|
472
|
+
|