bio-vcf 0.0.3 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +145 -20
- data/VERSION +1 -1
- data/bin/bio-vcf +204 -62
- data/bio-vcf.gemspec +7 -3
- data/features/cli.feature +16 -0
- data/features/multisample.feature +10 -0
- data/features/sfilter.feature +60 -0
- data/features/step_definitions/cli-feature.rb +1 -1
- data/features/step_definitions/multisample.rb +32 -0
- data/features/step_definitions/sfilter.rb +90 -0
- data/lib/bio-vcf/utils.rb +12 -6
- data/lib/bio-vcf/vcfgenotypefield.rb +4 -1
- data/lib/bio-vcf/vcfheader.rb +24 -0
- data/lib/bio-vcf/vcfrdf.rb +15 -8
- data/lib/bio-vcf/vcfrecord.rb +45 -9
- data/lib/bio-vcf/vcfsample.rb +94 -5
- data/test/data/regression/sfilter_seval_s.dp.ref +31 -0
- data/test/data/regression/{sfilter001.ref → thread4.ref} +5 -0
- data/test/data/regression/thread4_4.ref +150 -0
- data/test/performance/metrics.md +53 -19
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4325d76baee5956ed3f58277ad622cf9a0a6ce7
|
4
|
+
data.tar.gz: e971b0fb0f760aafb32af51a647f1ba39f59f26f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a99e0be8ce0fd84d8afc557e5e30418da2fd98d2ad7458b242e9402e4605d5b7f816758da39b9248dad3ebf53e8a5a3c17862927c284832772cbf68c3b9d2fbc
|
7
|
+
data.tar.gz: 49f0e38cf66781a2d35bb45849d83bc137e299d42d3539928e70079b4ffbcc70bca20c4149d7cae355696e18670aab50dfbe0f8d87ffc03b6b7445b3d676eb95
|
data/README.md
CHANGED
@@ -2,14 +2,62 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://secure.travis-ci.org/pjotrp/bioruby-vcf.png)](http://travis-ci.org/pjotrp/bioruby-vcf)
|
4
4
|
|
5
|
-
Yet another VCF parser.
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
Yet another VCF parser. Bio-vcf is not only fast for genome-wide data,
|
6
|
+
it also comes with a really nice filtering, evaluation and rewrite
|
7
|
+
language. Bio-vcf has better performance than other tools
|
8
|
+
because of lazy parsing, multi-threading, and useful combinations of
|
9
|
+
(fancy) command line filtering. For example on a 2 core machine
|
10
|
+
bio-vcf is 50% faster than SnpSift. On an 8 core machine bio-vcf is
|
11
|
+
3x faster than SnpSift. Parsing a 1 Gb ESP VCF with 8 cores with
|
12
|
+
bio-vcf takes
|
11
13
|
|
12
|
-
|
14
|
+
```sh
|
15
|
+
time ./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp>0.3' < ESP6500SI_V2_SSA137.vcf > test1.vcf
|
16
|
+
real 0m21.095s
|
17
|
+
user 1m41.101s
|
18
|
+
sys 0m7.852s
|
19
|
+
```
|
20
|
+
|
21
|
+
and parsing with SnpSift takes
|
22
|
+
|
23
|
+
```sh
|
24
|
+
time cat ESP6500SI_V2_SSA137.vcf |java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > test.vcf
|
25
|
+
real 1m4.913s
|
26
|
+
user 0m58.071s
|
27
|
+
sys 0m7.982s
|
28
|
+
```
|
29
|
+
|
30
|
+
Bio-vcf is perfect for parsing large data files. Parsing a 650 Mb GATK
|
31
|
+
Illumina Hiseq VCF file and evaluating the results into a BED format on
|
32
|
+
a 16 core machine takes
|
33
|
+
|
34
|
+
```sh
|
35
|
+
time bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
36
|
+
real 0m47.612s
|
37
|
+
user 8m18.234s
|
38
|
+
sys 0m5.039s
|
39
|
+
```
|
40
|
+
|
41
|
+
which shows some pretty decent core utilisation (10x).
|
42
|
+
|
43
|
+
Use zcat to
|
44
|
+
pipe gzipped (vcf.gz) files into bio-vcf, e.g.
|
45
|
+
|
46
|
+
```sh
|
47
|
+
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
48
|
+
--sfilter '!s.empty? and s.dp>20'
|
49
|
+
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
50
|
+
|
51
|
+
```
|
52
|
+
|
53
|
+
bio-vcf comes with a sensible parser definition language (it is 100%
|
54
|
+
Ruby), as well as primitives for set analysis. Few
|
55
|
+
assumptions are made about the actual contents of the VCF file (field
|
56
|
+
names are resolved on the fly), so bio-vcf should practically work with
|
57
|
+
all VCF files.
|
58
|
+
|
59
|
+
To fetch all entries where all samples have depth larger than 20 use
|
60
|
+
a sample filter
|
13
61
|
|
14
62
|
```ruby
|
15
63
|
bio-vcf --sfilter 'sample.dp>20' < file.vcf
|
@@ -38,7 +86,7 @@ use the --eval switch, e.g.,
|
|
38
86
|
bio-vcf --eval 'rec.alt+"\t"+rec.info.dp+"\t"+rec.tumor.gq.to_s' < file.vcf
|
39
87
|
```
|
40
88
|
|
41
|
-
In fact, if the result is an Array the output gets tab dilimited so
|
89
|
+
In fact, if the result is an Array the output gets tab delimited, so
|
42
90
|
the nicer version is
|
43
91
|
|
44
92
|
```ruby
|
@@ -61,13 +109,42 @@ bio-vcf -i --sfilter 's.dp>100' --seval 's.dp' < file.vcf
|
|
61
109
|
Where -i ignores missing samples. Pick up sample allele depth
|
62
110
|
|
63
111
|
```ruby
|
64
|
-
bio-vcf -i --seval 's.ad'
|
65
|
-
|
66
|
-
|
67
|
-
|
112
|
+
bio-vcf -i --seval 's.ad.to_s'
|
113
|
+
1 10257 [151, 8] [219, 22] [227, 22] [226, 22] [166, 18] [185, 27] [201, 15]
|
114
|
+
1 10291 [145, 16] [218, 26] [214, 30] [213, 32] [122, 36] [131, 27] [156, 31]
|
115
|
+
1 10297 [155, 18] [218, 23] [219, 26] [207, 30] [137, 20] [124, 27] [151, 27]
|
116
|
+
1 10303 [169, 25] [211, 31] [214, 28] [214, 32] [146, 17] [123, 23] [156, 22]
|
117
|
+
```
|
118
|
+
|
119
|
+
To get the alt depth per sample
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
bio-vcf -i --seval 's.ad[1]'
|
123
|
+
1 10257 8 22 22 22 18 27 15
|
124
|
+
1 10291 16 26 30 32 36 27 31
|
125
|
+
1 10297 18 23 26 30 20 27 27
|
126
|
+
1 10303 25 31 28 32 17 23 22
|
127
|
+
```
|
128
|
+
|
129
|
+
To calculate alt frequencies from s.ad which is sample (alt dp)/(ref dp + alt dp)
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
133
|
+
1 10257 0.050314465408805034 0.0912863070539419 0.08835341365461848 0.08870967741935484 0.09782608695652174 0.12735849056603774 0.06944444444444445
|
134
|
+
1 10291 0.09937888198757763 0.10655737704918032 0.12295081967213115 0.1306122448979592 0.22784810126582278 0.17088607594936708 0.1657754010695187
|
135
|
+
```
|
136
|
+
|
137
|
+
note the floating point conversion .to_f is needed, otherwise you get
|
138
|
+
an integer division. To account for multiple alleles
|
139
|
+
|
140
|
+
```ruby
|
141
|
+
bio-vcf -i --eval 'r.ref+">"+r.alt[0]' --seval 'tot=s.ad.reduce(:+) ; (tot-s.ad[0].to_f)/tot' --set-header "mutation,#samples"
|
142
|
+
mutation Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
143
|
+
A>C 0.050314465408805034 0.0912863070539419 0.08835341365461848 0.08870967741935484 0.09782608695652174 0.12735849056603774 0.06944444444444445
|
144
|
+
C>T 0.09937888198757763 0.10655737704918032 0.12295081967213115 0.1306122448979592 0.22784810126582278 0.17088607594936708 0.1657754010695187
|
68
145
|
```
|
69
146
|
|
70
|
-
|
147
|
+
To output DP and GQ values for tumor normal:
|
71
148
|
|
72
149
|
```ruby
|
73
150
|
bio-vcf --filter 'r.normal.dp>=7 and r.tumor.dp>=5' --seval '[s.dp,s.gq]' < freebayes.vcf
|
@@ -83,13 +160,25 @@ bio-vcf --filter 'r.normal.dp>=7 and r.tumor.dp>=5' --seval '[s.dp,s.gq]' < free
|
|
83
160
|
To parse and output genotype
|
84
161
|
|
85
162
|
```ruby
|
86
|
-
bio-vcf -iq --sfilter 's.dp>=20 and s.gq>=20' --ifilter-
|
163
|
+
bio-vcf -iq --sfilter 's.dp>=20 and s.gq>=20' --ifilter-samples 's.gt!="0/0"' --seval s.gt < test/data/input/multisample.vcf
|
87
164
|
1 10257 0/0 0/0 0/0 0/0 0/0 0/1 0/0
|
88
165
|
1 10291 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
89
166
|
1 10297 0/1 0/1 0/1 0/0 0/0 0/1 0/1
|
90
167
|
1 12783 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
91
168
|
```
|
92
169
|
|
170
|
+
And use --set-header if you want to add a header
|
171
|
+
|
172
|
+
```ruby
|
173
|
+
bio-vcf -iq --set-header 'chr,pos,#samples' --sfilter 's.dp>=20 and s.gq>=20' --ifilter-samples 's.gt!="0/0"' --seval s.gt < test/data/input/multisample.vcf
|
174
|
+
chr pos orig s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
175
|
+
1 10257 0/0 0/0 0/0 0/0 0/0 0/1 0/0
|
176
|
+
1 10291 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
177
|
+
(etc)
|
178
|
+
```
|
179
|
+
|
180
|
+
where #samples gets expanded.
|
181
|
+
|
93
182
|
Most filter and eval commands can be used at the same time. Special set
|
94
183
|
commands exist for filtering and eval. When a set is defined, based on
|
95
184
|
the sample name, you can apply filters on the samples inside the set,
|
@@ -111,13 +200,17 @@ If something is not working, check out the feature descriptions and
|
|
111
200
|
the source code. It is not hard to add features. Otherwise, send a short
|
112
201
|
example of a VCF statement you need to work on.
|
113
202
|
|
114
|
-
bio-vcf is fast. Parsing a 55K line DbSNP file (22Mb) takes 1.5 seconds on a
|
115
|
-
Macbook PRO running 64-bits Linux (Ruby 2.1.0).
|
116
|
-
|
117
203
|
## Installation
|
118
204
|
|
205
|
+
Note that you need Ruby 1.9.3 or later. The 2.x Ruby series also gives
|
206
|
+
a performance improvement. Bio-vcf will show the Ruby version when
|
207
|
+
typing the command 'bio-vcf -h'.
|
208
|
+
|
209
|
+
To install bio-vcf with gem:
|
210
|
+
|
119
211
|
```sh
|
120
212
|
gem install bio-vcf
|
213
|
+
bio-vcf -h
|
121
214
|
```
|
122
215
|
|
123
216
|
## Command line interface (CLI)
|
@@ -192,7 +285,7 @@ Output
|
|
192
285
|
|
193
286
|
```ruby
|
194
287
|
bio-vcf --filter 'rec.tumor.gq>30'
|
195
|
-
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]
|
288
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
196
289
|
< file.vcf
|
197
290
|
```
|
198
291
|
|
@@ -322,7 +415,7 @@ ref should always be identical across samples.
|
|
322
415
|
One clinical variant DbSNP example
|
323
416
|
|
324
417
|
```sh
|
325
|
-
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]
|
418
|
+
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]' < clinvar_20140303.vcf
|
326
419
|
```
|
327
420
|
|
328
421
|
renders
|
@@ -499,7 +592,32 @@ To remove/select 3 samples and create a new file:
|
|
499
592
|
|
500
593
|
## RDF output
|
501
594
|
|
502
|
-
|
595
|
+
You can use --rdf for turtle RDF output, note the use of --id and
|
596
|
+
--tags which includes the MAF record:
|
597
|
+
|
598
|
+
```ruby
|
599
|
+
bio-vcf --id evs --rdf --tags '{"db:evs" => true, "seq:freq" => rec.info.maf[0]/100 }' < EVS.vcf
|
600
|
+
:evs_ch9_139266496_T seq:chr "9" .
|
601
|
+
:evs_ch9_139266496_T seq:pos 139266496 .
|
602
|
+
:evs_ch9_139266496_T seq:alt T .
|
603
|
+
:evs_ch9_139266496_T db:vcf true .
|
604
|
+
:evs_ch9_139266496_T db:evs true .
|
605
|
+
:evs_ch9_139266496_T seq:freq 0.419801 .
|
606
|
+
```
|
607
|
+
|
608
|
+
It is possible to filter too! Pick out the rare variants with
|
609
|
+
|
610
|
+
```ruby
|
611
|
+
bio-vcf --id evs --filter 'r.info.maf[0]<5.0' --rdf --tags '{"db:evs" => true, "seq:freq" => rec.info.maf[0]/100 }' < EVS.vcf
|
612
|
+
```
|
613
|
+
|
614
|
+
Similarly for GoNL
|
615
|
+
|
616
|
+
```ruby
|
617
|
+
bio-vcf --id gonl --rdf --tags '{"db:evs" => true, "seq:freq" => rec.info.af }' < GoNL.vcf
|
618
|
+
```
|
619
|
+
|
620
|
+
Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert tabular data to RDF.
|
503
621
|
|
504
622
|
## Other examples
|
505
623
|
|
@@ -534,6 +652,13 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
534
652
|
end
|
535
653
|
```
|
536
654
|
|
655
|
+
## Trouble shooting
|
656
|
+
|
657
|
+
The multi-threading creates temporary files using the system TMPDIR.
|
658
|
+
This behaviour can be overridden by setting the TMPDIR environment variable.
|
659
|
+
Also, for genome-wide sequencing it may be useful to increase
|
660
|
+
--thread-lines to a value larger than 1_000_000.
|
661
|
+
|
537
662
|
## Project home page
|
538
663
|
|
539
664
|
Information on the source tree, documentation, examples, issues and
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0
|
1
|
+
0.7.0
|
data/bin/bio-vcf
CHANGED
@@ -15,6 +15,9 @@ version = File.new(VERSION_FILENAME).read.chomp
|
|
15
15
|
|
16
16
|
require 'bio-vcf'
|
17
17
|
require 'optparse'
|
18
|
+
require 'timeout'
|
19
|
+
require 'fileutils'
|
20
|
+
require 'tempfile'
|
18
21
|
|
19
22
|
# Uncomment when using the bio-logger
|
20
23
|
# require 'bio-logger'
|
@@ -23,7 +26,7 @@ require 'optparse'
|
|
23
26
|
# Bio::Log::CLI.logger('stderr')
|
24
27
|
# Bio::Log::CLI.trace('info')
|
25
28
|
|
26
|
-
options = { show_help: false}
|
29
|
+
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 100_000 }
|
27
30
|
opts = OptionParser.new do |o|
|
28
31
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
29
32
|
|
@@ -77,13 +80,28 @@ opts = OptionParser.new do |o|
|
|
77
80
|
options[:rdf] = true
|
78
81
|
options[:skip_header] = true
|
79
82
|
end
|
83
|
+
o.on("--num-threads [num]", Integer, "Multi-core version") do |i|
|
84
|
+
options[:num_threads] = i
|
85
|
+
end
|
86
|
+
o.on("--thread-lines num", Integer, "Fork thread on num lines (default 100_000)") do |i|
|
87
|
+
options[:thread_lines] = i
|
88
|
+
end
|
80
89
|
o.on_tail("--id name", String, "Identifier") do |s|
|
81
90
|
options[:id] = s
|
82
91
|
end
|
83
92
|
o.on_tail("--tags list", String, "Add tags") do |s|
|
84
|
-
options[:tags] =
|
93
|
+
options[:tags] = s
|
85
94
|
end
|
86
95
|
|
96
|
+
o.on("--skip-header", "Do not output VCF header info") do
|
97
|
+
options[:skip_header] = true
|
98
|
+
end
|
99
|
+
|
100
|
+
o.on("--set-header list", Array, "Set a special tab delimited output header (#samples expands to sample names)") do |list|
|
101
|
+
options[:set_header] = list
|
102
|
+
options[:skip_header] = true
|
103
|
+
end
|
104
|
+
|
87
105
|
# Uncomment the following when using the bio-logger
|
88
106
|
# o.separator ""
|
89
107
|
# o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
@@ -113,9 +131,44 @@ opts = OptionParser.new do |o|
|
|
113
131
|
end
|
114
132
|
end
|
115
133
|
|
134
|
+
include BioVcf
|
116
135
|
|
136
|
+
# Parse the header section of a VCF file
|
137
|
+
def parse_header line, samples, options
|
138
|
+
header = VcfHeader.new
|
139
|
+
header.add(line)
|
140
|
+
print line if not options[:skip_header]
|
141
|
+
STDIN.each_line do | headerline |
|
142
|
+
if headerline !~ /^#/
|
143
|
+
line = headerline
|
144
|
+
break # end of header
|
145
|
+
end
|
146
|
+
header.add(headerline)
|
147
|
+
if not options[:skip_header]
|
148
|
+
if headerline =~ /^#CHR/
|
149
|
+
# The header before actual data contains the sample names, first inject the BioVcf meta information
|
150
|
+
print header.tag(options),"\n" if not options[:skip_header]
|
151
|
+
selected = header.column_names
|
152
|
+
if samples
|
153
|
+
newfields = selected[0..8]
|
154
|
+
samples.each do |s|
|
155
|
+
newfields << selected[s+9]
|
156
|
+
end
|
157
|
+
selected = newfields
|
158
|
+
end
|
159
|
+
print "#",selected.join("\t"),"\n"
|
160
|
+
else
|
161
|
+
print headerline
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
|
166
|
+
VcfRdf::header if options[:rdf]
|
167
|
+
return header,line
|
168
|
+
end
|
169
|
+
|
170
|
+
# Parse a VCF line
|
117
171
|
def parse_line line,header,options,samples
|
118
|
-
# fields = VcfLine.parse(line,header.columns)
|
119
172
|
fields = VcfLine.parse(line)
|
120
173
|
rec = VcfRecord.new(fields,header)
|
121
174
|
r = rec # alias
|
@@ -124,27 +177,35 @@ def parse_line line,header,options,samples
|
|
124
177
|
sfilter = options[:sfilter]
|
125
178
|
efilter = options[:efilter]
|
126
179
|
ifilter = options[:ifilter]
|
180
|
+
seval = options[:seval]
|
127
181
|
ignore_missing = options[:ignore_missing]
|
128
182
|
quiet = options[:quiet]
|
183
|
+
|
184
|
+
if sfilter or efilter or ifilter or seval
|
185
|
+
# check for samples
|
186
|
+
header_samples = header.column_names[9..-1]
|
187
|
+
raise "Empty sample list, can not execute query!" if not header_samples
|
188
|
+
end
|
189
|
+
|
129
190
|
# --------------------------
|
130
191
|
# Filtering and set analysis
|
131
|
-
return if filter and not rec.
|
192
|
+
return if filter and not rec.filter(filter,ignore_missing,quiet)
|
132
193
|
|
133
194
|
if sfilter
|
134
195
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
135
|
-
return if not sample.
|
196
|
+
return if not sample.sfilter(sfilter,ignore_missing,quiet)
|
136
197
|
end
|
137
198
|
end
|
138
199
|
|
139
200
|
if ifilter
|
140
201
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
141
|
-
return if not sample.
|
202
|
+
return if not sample.ifilter(ifilter,ignore_missing,quiet)
|
142
203
|
end
|
143
204
|
end
|
144
205
|
|
145
206
|
if efilter
|
146
207
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
147
|
-
return if not sample.
|
208
|
+
return if not sample.efilter(efilter,ignore_missing,quiet)
|
148
209
|
end
|
149
210
|
end
|
150
211
|
|
@@ -158,19 +219,19 @@ def parse_line line,header,options,samples
|
|
158
219
|
end
|
159
220
|
fields = newfields
|
160
221
|
end
|
161
|
-
if options[:eval] or
|
222
|
+
if options[:eval] or seval
|
162
223
|
begin
|
163
224
|
results = nil # result string
|
164
225
|
if options[:eval]
|
165
226
|
res = rec.eval(options[:eval],ignore_missing,quiet)
|
166
227
|
results = res if res
|
167
228
|
end
|
168
|
-
if
|
229
|
+
if seval
|
169
230
|
list = (results ? [] : [rec.chr,rec.pos])
|
170
231
|
rec.each_sample(options[:sfilter_samples]) { | sample |
|
171
|
-
list << sample.eval(
|
232
|
+
list << sample.eval(seval,ignore_missing,quiet)
|
172
233
|
}
|
173
|
-
results = (results ? results + "\t" : "" ) + list.join("\t")
|
234
|
+
results = (results ? results.to_s + "\t" : "" ) + list.join("\t")
|
174
235
|
end
|
175
236
|
rescue => e
|
176
237
|
$stderr.print "\nLine: ",line
|
@@ -183,23 +244,60 @@ def parse_line line,header,options,samples
|
|
183
244
|
else
|
184
245
|
if options[:rdf]
|
185
246
|
# Output Turtle RDF
|
186
|
-
if not header_out
|
187
|
-
VcfRdf::header
|
188
|
-
header_out = true
|
189
|
-
end
|
190
247
|
VcfRdf::record(options[:id],rec,options[:tags])
|
191
248
|
elsif options[:rewrite]
|
192
249
|
# Default behaviour prints VCF line, but rewrite info
|
193
250
|
eval(options[:rewrite])
|
194
|
-
print (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")
|
251
|
+
print (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
195
252
|
else
|
196
253
|
# Default behaviour prints VCF line
|
197
|
-
print fields.join("\t")
|
254
|
+
$stdout.print fields.join("\t")+"\n"
|
255
|
+
$stdout.flush
|
256
|
+
return true
|
198
257
|
end
|
199
258
|
end
|
200
259
|
end
|
201
260
|
|
202
|
-
|
261
|
+
# Collect a buffer of lines and feed them to a thread
|
262
|
+
# Returns the created pid, tempfilen and count_threads
|
263
|
+
# (Note: this function should be turned into a closure)
|
264
|
+
def parse_lines lines,header,options,samples,tempdir,count_threads
|
265
|
+
pid = nil
|
266
|
+
threadfilen = nil
|
267
|
+
if options[:num_threads]
|
268
|
+
lines2 = lines.map { |l| l.clone }
|
269
|
+
count_threads += 1
|
270
|
+
threadfilen = tempdir+sprintf("/%0.6d-pid",count_threads)+'.bio-vcf'
|
271
|
+
pid = fork do
|
272
|
+
count_lines = 0
|
273
|
+
tempfn = threadfilen+'.running'
|
274
|
+
STDOUT.reopen(File.open(tempfn, 'w+'))
|
275
|
+
lines2.each do | line |
|
276
|
+
count_lines +=1 if parse_line(line,header,options,samples)
|
277
|
+
end
|
278
|
+
STDOUT.flush
|
279
|
+
STDOUT.close
|
280
|
+
FileUtils::mv(tempfn,threadfilen)
|
281
|
+
exit 0
|
282
|
+
end
|
283
|
+
Process::detach(pid)
|
284
|
+
else
|
285
|
+
lines.each do | line |
|
286
|
+
parse_line line,header,options,samples
|
287
|
+
end
|
288
|
+
end
|
289
|
+
return pid,threadfilen,count_threads
|
290
|
+
end
|
291
|
+
|
292
|
+
# Make sure no more than num_threads are running at the same time
|
293
|
+
def manage_thread_pool(workers, thread_list, num_threads)
|
294
|
+
while true
|
295
|
+
# ---- count running pids
|
296
|
+
running = thread_list.reduce(0) { | sum, thread_info | ( File.exist?(thread_info[1]+'.running') ? sum+1 : sum ) }
|
297
|
+
break if running < num_threads
|
298
|
+
sleep 0.1
|
299
|
+
end
|
300
|
+
end
|
203
301
|
|
204
302
|
opts.parse!(ARGV)
|
205
303
|
|
@@ -216,55 +314,99 @@ $stderr.print "Options: ",options,"\n" if !options[:quiet]
|
|
216
314
|
if options[:samples]
|
217
315
|
samples = options[:samples].map { |s| s.to_i }
|
218
316
|
end
|
219
|
-
|
220
|
-
|
317
|
+
|
318
|
+
num_threads = options[:num_threads]
|
319
|
+
num_threads = 8 if num_threads != nil and num_threads < 2
|
320
|
+
|
321
|
+
header = nil
|
322
|
+
header_output_completed = false
|
221
323
|
line_number=0
|
324
|
+
lines = []
|
325
|
+
thread_list = []
|
326
|
+
workers = []
|
327
|
+
thread_lines = options[:thread_lines]
|
328
|
+
count_threads=0
|
222
329
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
330
|
+
orig_std_out = STDOUT.clone
|
331
|
+
|
332
|
+
Dir::mktmpdir("bio-vcf_") do |tempdir|
|
333
|
+
$stderr.print "Using #{tempdir} for temporary files\n" if num_threads
|
334
|
+
|
335
|
+
# ---- Main loop
|
336
|
+
STDIN.each_line do | line |
|
337
|
+
line_number += 1
|
338
|
+
$stderr.print '.' if line_number % thread_lines == 0 and not options[:quiet]
|
339
|
+
begin
|
340
|
+
# ---- In this section header information is handled
|
341
|
+
next if header_output_completed and line =~ /^#/
|
342
|
+
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
343
|
+
header,line = parse_header(line,samples,options)
|
344
|
+
end
|
345
|
+
next if line =~ /^##/ # empty file
|
346
|
+
header_output_completed = true
|
347
|
+
if not options[:efilter_samples] and options[:ifilter_samples]
|
348
|
+
# Create exclude set as a complement of include set
|
349
|
+
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
350
|
+
end
|
351
|
+
|
352
|
+
# ---- In this section the VCF variant lines are parsed
|
353
|
+
lines << line
|
354
|
+
if lines.size > thread_lines
|
355
|
+
manage_thread_pool(workers,thread_list,num_threads) if options[:num_threads]
|
356
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
357
|
+
count_threads = thread_list.last[2]
|
358
|
+
lines = []
|
359
|
+
end
|
360
|
+
rescue Exception => e
|
361
|
+
# $stderr.print line
|
362
|
+
$stderr.print e.message,"\n"
|
363
|
+
raise if options[:verbose]
|
364
|
+
exit 1
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
369
|
+
count_threads = thread_list.last[2]
|
370
|
+
|
371
|
+
# ---- In this section the output gets collected and printed on STDOUT
|
372
|
+
if options[:num_threads]
|
373
|
+
STDOUT.reopen(orig_std_out)
|
374
|
+
$stderr.print "Final pid=#{thread_list.last[0]}, size=#{lines.size}\n"
|
375
|
+
lines = []
|
376
|
+
|
377
|
+
fault = false
|
378
|
+
# Wait for the running threads to complete
|
379
|
+
thread_list.each do |info|
|
380
|
+
(pid,threadfn) = info
|
381
|
+
tempfn = threadfn + '.running'
|
382
|
+
$stderr.print "Waiting up to 3 minutes for pid=#{pid} to complete\n"
|
383
|
+
begin
|
384
|
+
Timeout.timeout(180) do
|
385
|
+
while not File.exist?(threadfn) # wait for the result to appear
|
386
|
+
sleep 0.2
|
252
387
|
end
|
253
388
|
end
|
389
|
+
# Thread file should have gone:
|
390
|
+
raise "FATAL: child process appears to have crashed #{tempfn}" if File.exist?(tempfn)
|
391
|
+
$stderr.print "OK pid=#{pid}\n"
|
392
|
+
rescue Timeout::Error
|
393
|
+
Process.kill 9, pid
|
394
|
+
Process.wait pid
|
395
|
+
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}\n"
|
396
|
+
fault = true
|
254
397
|
end
|
255
398
|
end
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
399
|
+
# Collate the output
|
400
|
+
thread_list.each do | info |
|
401
|
+
(pid,fn) = info
|
402
|
+
# This should never happen
|
403
|
+
raise "FATAL: child process output #{fn} is missing" if not File.exist?(fn)
|
404
|
+
$stderr.print "Reading #{fn}\n"
|
405
|
+
File.new(fn).each_line { |buf|
|
406
|
+
print buf
|
407
|
+
}
|
408
|
+
File.unlink(fn)
|
260
409
|
end
|
261
|
-
|
262
|
-
parse_line line,header,options,samples
|
263
|
-
rescue Exception => e
|
264
|
-
# $stderr.print line
|
265
|
-
$stderr.print e.message,"\n"
|
266
|
-
raise if options[:verbose]
|
267
|
-
exit 1
|
410
|
+
return 1 if fault
|
268
411
|
end
|
269
|
-
end
|
270
|
-
|
412
|
+
end # cleans up tempdir
|