bio-vcf 0.0.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +145 -20
- data/VERSION +1 -1
- data/bin/bio-vcf +204 -62
- data/bio-vcf.gemspec +7 -3
- data/features/cli.feature +16 -0
- data/features/multisample.feature +10 -0
- data/features/sfilter.feature +60 -0
- data/features/step_definitions/cli-feature.rb +1 -1
- data/features/step_definitions/multisample.rb +32 -0
- data/features/step_definitions/sfilter.rb +90 -0
- data/lib/bio-vcf/utils.rb +12 -6
- data/lib/bio-vcf/vcfgenotypefield.rb +4 -1
- data/lib/bio-vcf/vcfheader.rb +24 -0
- data/lib/bio-vcf/vcfrdf.rb +15 -8
- data/lib/bio-vcf/vcfrecord.rb +45 -9
- data/lib/bio-vcf/vcfsample.rb +94 -5
- data/test/data/regression/sfilter_seval_s.dp.ref +31 -0
- data/test/data/regression/{sfilter001.ref → thread4.ref} +5 -0
- data/test/data/regression/thread4_4.ref +150 -0
- data/test/performance/metrics.md +53 -19
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4325d76baee5956ed3f58277ad622cf9a0a6ce7
|
4
|
+
data.tar.gz: e971b0fb0f760aafb32af51a647f1ba39f59f26f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a99e0be8ce0fd84d8afc557e5e30418da2fd98d2ad7458b242e9402e4605d5b7f816758da39b9248dad3ebf53e8a5a3c17862927c284832772cbf68c3b9d2fbc
|
7
|
+
data.tar.gz: 49f0e38cf66781a2d35bb45849d83bc137e299d42d3539928e70079b4ffbcc70bca20c4149d7cae355696e18670aab50dfbe0f8d87ffc03b6b7445b3d676eb95
|
data/README.md
CHANGED
@@ -2,14 +2,62 @@
|
|
2
2
|
|
3
3
|
[](http://travis-ci.org/pjotrp/bioruby-vcf)
|
4
4
|
|
5
|
-
Yet another VCF parser.
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
Yet another VCF parser. Bio-vcf is not only fast for genome-wide data,
|
6
|
+
it also comes with a really nice filtering, evaluation and rewrite
|
7
|
+
language. Bio-vcf has better performance than other tools
|
8
|
+
because of lazy parsing, multi-threading, and useful combinations of
|
9
|
+
(fancy) command line filtering. For example on a 2 core machine
|
10
|
+
bio-vcf is 50% faster than SnpSift. On an 8 core machine bio-vcf is
|
11
|
+
3x faster than SnpSift. Parsing a 1 Gb ESP VCF with 8 cores with
|
12
|
+
bio-vcf takes
|
11
13
|
|
12
|
-
|
14
|
+
```sh
|
15
|
+
time ./bin/bio-vcf -iv --num-threads 8 --filter 'r.info.cp>0.3' < ESP6500SI_V2_SSA137.vcf > test1.vcf
|
16
|
+
real 0m21.095s
|
17
|
+
user 1m41.101s
|
18
|
+
sys 0m7.852s
|
19
|
+
```
|
20
|
+
|
21
|
+
and parsing with SnpSift takes
|
22
|
+
|
23
|
+
```sh
|
24
|
+
time cat ESP6500SI_V2_SSA137.vcf |java -jar snpEff/SnpSift.jar filter "( CP>0.3 )" > test.vcf
|
25
|
+
real 1m4.913s
|
26
|
+
user 0m58.071s
|
27
|
+
sys 0m7.982s
|
28
|
+
```
|
29
|
+
|
30
|
+
Bio-vcf is perfect for parsing large data files. Parsing a 650 Mb GATK
|
31
|
+
Illumina Hiseq VCF file and evaluating the results into a BED format on
|
32
|
+
a 16 core machine takes
|
33
|
+
|
34
|
+
```sh
|
35
|
+
time bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50' --sfilter '!s.empty? and s.dp>20' --eval '[r.chrom,r.pos,r.pos+1]' < test.large2.vcf > test.out.3
|
36
|
+
real 0m47.612s
|
37
|
+
user 8m18.234s
|
38
|
+
sys 0m5.039s
|
39
|
+
```
|
40
|
+
|
41
|
+
which shows some pretty decent core utilisation (10x).
|
42
|
+
|
43
|
+
Use zcat to
|
44
|
+
pipe gzipped (vcf.gz) files into bio-vcf, e.g.
|
45
|
+
|
46
|
+
```sh
|
47
|
+
zcat huge_file.vcf.gz| bio-vcf --num-threads 36 --filter 'r.chrom.to_i>0 and r.chrom.to_i<21 and r.qual>50'
|
48
|
+
--sfilter '!s.empty? and s.dp>20'
|
49
|
+
--eval '[r.chrom,r.pos,r.pos+1]' > test.bed
|
50
|
+
|
51
|
+
```
|
52
|
+
|
53
|
+
bio-vcf comes with a sensible parser definition language (it is 100%
|
54
|
+
Ruby), as well as primitives for set analysis. Few
|
55
|
+
assumptions are made about the actual contents of the VCF file (field
|
56
|
+
names are resolved on the fly), so bio-vcf should practically work with
|
57
|
+
all VCF files.
|
58
|
+
|
59
|
+
To fetch all entries where all samples have depth larger than 20 use
|
60
|
+
a sample filter
|
13
61
|
|
14
62
|
```ruby
|
15
63
|
bio-vcf --sfilter 'sample.dp>20' < file.vcf
|
@@ -38,7 +86,7 @@ use the --eval switch, e.g.,
|
|
38
86
|
bio-vcf --eval 'rec.alt+"\t"+rec.info.dp+"\t"+rec.tumor.gq.to_s' < file.vcf
|
39
87
|
```
|
40
88
|
|
41
|
-
In fact, if the result is an Array the output gets tab dilimited so
|
89
|
+
In fact, if the result is an Array the output gets tab delimited, so
|
42
90
|
the nicer version is
|
43
91
|
|
44
92
|
```ruby
|
@@ -61,13 +109,42 @@ bio-vcf -i --sfilter 's.dp>100' --seval 's.dp' < file.vcf
|
|
61
109
|
Where -i ignores missing samples. Pick up sample allele depth
|
62
110
|
|
63
111
|
```ruby
|
64
|
-
bio-vcf -i --seval 's.ad'
|
65
|
-
|
66
|
-
|
67
|
-
|
112
|
+
bio-vcf -i --seval 's.ad.to_s'
|
113
|
+
1 10257 [151, 8] [219, 22] [227, 22] [226, 22] [166, 18] [185, 27] [201, 15]
|
114
|
+
1 10291 [145, 16] [218, 26] [214, 30] [213, 32] [122, 36] [131, 27] [156, 31]
|
115
|
+
1 10297 [155, 18] [218, 23] [219, 26] [207, 30] [137, 20] [124, 27] [151, 27]
|
116
|
+
1 10303 [169, 25] [211, 31] [214, 28] [214, 32] [146, 17] [123, 23] [156, 22]
|
117
|
+
```
|
118
|
+
|
119
|
+
To get the alt depth per sample
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
bio-vcf -i --seval 's.ad[1]'
|
123
|
+
1 10257 8 22 22 22 18 27 15
|
124
|
+
1 10291 16 26 30 32 36 27 31
|
125
|
+
1 10297 18 23 26 30 20 27 27
|
126
|
+
1 10303 25 31 28 32 17 23 22
|
127
|
+
```
|
128
|
+
|
129
|
+
To calculate alt frequencies from s.ad which is sample (alt dp)/(ref dp + alt dp)
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
bio-vcf -i --seval 's.ad[1].to_f/(s.ad[0]+s.ad[1])'
|
133
|
+
1 10257 0.050314465408805034 0.0912863070539419 0.08835341365461848 0.08870967741935484 0.09782608695652174 0.12735849056603774 0.06944444444444445
|
134
|
+
1 10291 0.09937888198757763 0.10655737704918032 0.12295081967213115 0.1306122448979592 0.22784810126582278 0.17088607594936708 0.1657754010695187
|
135
|
+
```
|
136
|
+
|
137
|
+
note the floating point conversion .to_f is needed, otherwise you get
|
138
|
+
an integer division. To account for multiple alleles
|
139
|
+
|
140
|
+
```ruby
|
141
|
+
bio-vcf -i --eval 'r.ref+">"+r.alt[0]' --seval 'tot=s.ad.reduce(:+) ; (tot-s.ad[0].to_f)/tot' --set-header "mutation,#samples"
|
142
|
+
mutation Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
143
|
+
A>C 0.050314465408805034 0.0912863070539419 0.08835341365461848 0.08870967741935484 0.09782608695652174 0.12735849056603774 0.06944444444444445
|
144
|
+
C>T 0.09937888198757763 0.10655737704918032 0.12295081967213115 0.1306122448979592 0.22784810126582278 0.17088607594936708 0.1657754010695187
|
68
145
|
```
|
69
146
|
|
70
|
-
|
147
|
+
To output DP and GQ values for tumor normal:
|
71
148
|
|
72
149
|
```ruby
|
73
150
|
bio-vcf --filter 'r.normal.dp>=7 and r.tumor.dp>=5' --seval '[s.dp,s.gq]' < freebayes.vcf
|
@@ -83,13 +160,25 @@ bio-vcf --filter 'r.normal.dp>=7 and r.tumor.dp>=5' --seval '[s.dp,s.gq]' < free
|
|
83
160
|
To parse and output genotype
|
84
161
|
|
85
162
|
```ruby
|
86
|
-
bio-vcf -iq --sfilter 's.dp>=20 and s.gq>=20' --ifilter-
|
163
|
+
bio-vcf -iq --sfilter 's.dp>=20 and s.gq>=20' --ifilter-samples 's.gt!="0/0"' --seval s.gt < test/data/input/multisample.vcf
|
87
164
|
1 10257 0/0 0/0 0/0 0/0 0/0 0/1 0/0
|
88
165
|
1 10291 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
89
166
|
1 10297 0/1 0/1 0/1 0/0 0/0 0/1 0/1
|
90
167
|
1 12783 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
91
168
|
```
|
92
169
|
|
170
|
+
And use --set-header if you want to add a header
|
171
|
+
|
172
|
+
```ruby
|
173
|
+
bio-vcf -iq --set-header 'chr,pos,#samples' --sfilter 's.dp>=20 and s.gq>=20' --ifilter-samples 's.gt!="0/0"' --seval s.gt < test/data/input/multisample.vcf
|
174
|
+
chr pos orig s1t1 s2t1 s3t1 s1t2 s2t2 s3t2
|
175
|
+
1 10257 0/0 0/0 0/0 0/0 0/0 0/1 0/0
|
176
|
+
1 10291 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
177
|
+
(etc)
|
178
|
+
```
|
179
|
+
|
180
|
+
where #samples gets expanded.
|
181
|
+
|
93
182
|
Most filter and eval commands can be used at the same time. Special set
|
94
183
|
commands exist for filtering and eval. When a set is defined, based on
|
95
184
|
the sample name, you can apply filters on the samples inside the set,
|
@@ -111,13 +200,17 @@ If something is not working, check out the feature descriptions and
|
|
111
200
|
the source code. It is not hard to add features. Otherwise, send a short
|
112
201
|
example of a VCF statement you need to work on.
|
113
202
|
|
114
|
-
bio-vcf is fast. Parsing a 55K line DbSNP file (22Mb) takes 1.5 seconds on a
|
115
|
-
Macbook PRO running 64-bits Linux (Ruby 2.1.0).
|
116
|
-
|
117
203
|
## Installation
|
118
204
|
|
205
|
+
Note that you need Ruby 1.9.3 or later. The 2.x Ruby series also gives
|
206
|
+
a performance improvement. Bio-vcf will show the Ruby version when
|
207
|
+
typing the command 'bio-vcf -h'.
|
208
|
+
|
209
|
+
To install bio-vcf with gem:
|
210
|
+
|
119
211
|
```sh
|
120
212
|
gem install bio-vcf
|
213
|
+
bio-vcf -h
|
121
214
|
```
|
122
215
|
|
123
216
|
## Command line interface (CLI)
|
@@ -192,7 +285,7 @@ Output
|
|
192
285
|
|
193
286
|
```ruby
|
194
287
|
bio-vcf --filter 'rec.tumor.gq>30'
|
195
|
-
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]
|
288
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq]'
|
196
289
|
< file.vcf
|
197
290
|
```
|
198
291
|
|
@@ -322,7 +415,7 @@ ref should always be identical across samples.
|
|
322
415
|
One clinical variant DbSNP example
|
323
416
|
|
324
417
|
```sh
|
325
|
-
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]
|
418
|
+
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN]' < clinvar_20140303.vcf
|
326
419
|
```
|
327
420
|
|
328
421
|
renders
|
@@ -499,7 +592,32 @@ To remove/select 3 samples and create a new file:
|
|
499
592
|
|
500
593
|
## RDF output
|
501
594
|
|
502
|
-
|
595
|
+
You can use --rdf for turtle RDF output, note the use of --id and
|
596
|
+
--tags which includes the MAF record:
|
597
|
+
|
598
|
+
```ruby
|
599
|
+
bio-vcf --id evs --rdf --tags '{"db:evs" => true, "seq:freq" => rec.info.maf[0]/100 }' < EVS.vcf
|
600
|
+
:evs_ch9_139266496_T seq:chr "9" .
|
601
|
+
:evs_ch9_139266496_T seq:pos 139266496 .
|
602
|
+
:evs_ch9_139266496_T seq:alt T .
|
603
|
+
:evs_ch9_139266496_T db:vcf true .
|
604
|
+
:evs_ch9_139266496_T db:evs true .
|
605
|
+
:evs_ch9_139266496_T seq:freq 0.419801 .
|
606
|
+
```
|
607
|
+
|
608
|
+
It is possible to filter too! Pick out the rare variants with
|
609
|
+
|
610
|
+
```ruby
|
611
|
+
bio-vcf --id evs --filter 'r.info.maf[0]<5.0' --rdf --tags '{"db:evs" => true, "seq:freq" => rec.info.maf[0]/100 }' < EVS.vcf
|
612
|
+
```
|
613
|
+
|
614
|
+
Similarly for GoNL
|
615
|
+
|
616
|
+
```ruby
|
617
|
+
bio-vcf --id gonl --rdf --tags '{"db:evs" => true, "seq:freq" => rec.info.af }' < GoNL.vcf
|
618
|
+
```
|
619
|
+
|
620
|
+
Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert tabular data to RDF.
|
503
621
|
|
504
622
|
## Other examples
|
505
623
|
|
@@ -534,6 +652,13 @@ what the command line interface uses (see ./bin/bio-vcf)
|
|
534
652
|
end
|
535
653
|
```
|
536
654
|
|
655
|
+
## Trouble shooting
|
656
|
+
|
657
|
+
The multi-threading creates temporary files using the system TMPDIR.
|
658
|
+
This behaviour can be overridden by setting the TMPDIR environment variable.
|
659
|
+
Also, for genome-wide sequencing it may be useful to increase
|
660
|
+
--thread-lines to a value larger than 1_000_000.
|
661
|
+
|
537
662
|
## Project home page
|
538
663
|
|
539
664
|
Information on the source tree, documentation, examples, issues and
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0
|
1
|
+
0.7.0
|
data/bin/bio-vcf
CHANGED
@@ -15,6 +15,9 @@ version = File.new(VERSION_FILENAME).read.chomp
|
|
15
15
|
|
16
16
|
require 'bio-vcf'
|
17
17
|
require 'optparse'
|
18
|
+
require 'timeout'
|
19
|
+
require 'fileutils'
|
20
|
+
require 'tempfile'
|
18
21
|
|
19
22
|
# Uncomment when using the bio-logger
|
20
23
|
# require 'bio-logger'
|
@@ -23,7 +26,7 @@ require 'optparse'
|
|
23
26
|
# Bio::Log::CLI.logger('stderr')
|
24
27
|
# Bio::Log::CLI.trace('info')
|
25
28
|
|
26
|
-
options = { show_help: false}
|
29
|
+
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 100_000 }
|
27
30
|
opts = OptionParser.new do |o|
|
28
31
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
29
32
|
|
@@ -77,13 +80,28 @@ opts = OptionParser.new do |o|
|
|
77
80
|
options[:rdf] = true
|
78
81
|
options[:skip_header] = true
|
79
82
|
end
|
83
|
+
o.on("--num-threads [num]", Integer, "Multi-core version") do |i|
|
84
|
+
options[:num_threads] = i
|
85
|
+
end
|
86
|
+
o.on("--thread-lines num", Integer, "Fork thread on num lines (default 100_000)") do |i|
|
87
|
+
options[:thread_lines] = i
|
88
|
+
end
|
80
89
|
o.on_tail("--id name", String, "Identifier") do |s|
|
81
90
|
options[:id] = s
|
82
91
|
end
|
83
92
|
o.on_tail("--tags list", String, "Add tags") do |s|
|
84
|
-
options[:tags] =
|
93
|
+
options[:tags] = s
|
85
94
|
end
|
86
95
|
|
96
|
+
o.on("--skip-header", "Do not output VCF header info") do
|
97
|
+
options[:skip_header] = true
|
98
|
+
end
|
99
|
+
|
100
|
+
o.on("--set-header list", Array, "Set a special tab delimited output header (#samples expands to sample names)") do |list|
|
101
|
+
options[:set_header] = list
|
102
|
+
options[:skip_header] = true
|
103
|
+
end
|
104
|
+
|
87
105
|
# Uncomment the following when using the bio-logger
|
88
106
|
# o.separator ""
|
89
107
|
# o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
@@ -113,9 +131,44 @@ opts = OptionParser.new do |o|
|
|
113
131
|
end
|
114
132
|
end
|
115
133
|
|
134
|
+
include BioVcf
|
116
135
|
|
136
|
+
# Parse the header section of a VCF file
|
137
|
+
def parse_header line, samples, options
|
138
|
+
header = VcfHeader.new
|
139
|
+
header.add(line)
|
140
|
+
print line if not options[:skip_header]
|
141
|
+
STDIN.each_line do | headerline |
|
142
|
+
if headerline !~ /^#/
|
143
|
+
line = headerline
|
144
|
+
break # end of header
|
145
|
+
end
|
146
|
+
header.add(headerline)
|
147
|
+
if not options[:skip_header]
|
148
|
+
if headerline =~ /^#CHR/
|
149
|
+
# The header before actual data contains the sample names, first inject the BioVcf meta information
|
150
|
+
print header.tag(options),"\n" if not options[:skip_header]
|
151
|
+
selected = header.column_names
|
152
|
+
if samples
|
153
|
+
newfields = selected[0..8]
|
154
|
+
samples.each do |s|
|
155
|
+
newfields << selected[s+9]
|
156
|
+
end
|
157
|
+
selected = newfields
|
158
|
+
end
|
159
|
+
print "#",selected.join("\t"),"\n"
|
160
|
+
else
|
161
|
+
print headerline
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
|
166
|
+
VcfRdf::header if options[:rdf]
|
167
|
+
return header,line
|
168
|
+
end
|
169
|
+
|
170
|
+
# Parse a VCF line
|
117
171
|
def parse_line line,header,options,samples
|
118
|
-
# fields = VcfLine.parse(line,header.columns)
|
119
172
|
fields = VcfLine.parse(line)
|
120
173
|
rec = VcfRecord.new(fields,header)
|
121
174
|
r = rec # alias
|
@@ -124,27 +177,35 @@ def parse_line line,header,options,samples
|
|
124
177
|
sfilter = options[:sfilter]
|
125
178
|
efilter = options[:efilter]
|
126
179
|
ifilter = options[:ifilter]
|
180
|
+
seval = options[:seval]
|
127
181
|
ignore_missing = options[:ignore_missing]
|
128
182
|
quiet = options[:quiet]
|
183
|
+
|
184
|
+
if sfilter or efilter or ifilter or seval
|
185
|
+
# check for samples
|
186
|
+
header_samples = header.column_names[9..-1]
|
187
|
+
raise "Empty sample list, can not execute query!" if not header_samples
|
188
|
+
end
|
189
|
+
|
129
190
|
# --------------------------
|
130
191
|
# Filtering and set analysis
|
131
|
-
return if filter and not rec.
|
192
|
+
return if filter and not rec.filter(filter,ignore_missing,quiet)
|
132
193
|
|
133
194
|
if sfilter
|
134
195
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
135
|
-
return if not sample.
|
196
|
+
return if not sample.sfilter(sfilter,ignore_missing,quiet)
|
136
197
|
end
|
137
198
|
end
|
138
199
|
|
139
200
|
if ifilter
|
140
201
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
141
|
-
return if not sample.
|
202
|
+
return if not sample.ifilter(ifilter,ignore_missing,quiet)
|
142
203
|
end
|
143
204
|
end
|
144
205
|
|
145
206
|
if efilter
|
146
207
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
147
|
-
return if not sample.
|
208
|
+
return if not sample.efilter(efilter,ignore_missing,quiet)
|
148
209
|
end
|
149
210
|
end
|
150
211
|
|
@@ -158,19 +219,19 @@ def parse_line line,header,options,samples
|
|
158
219
|
end
|
159
220
|
fields = newfields
|
160
221
|
end
|
161
|
-
if options[:eval] or
|
222
|
+
if options[:eval] or seval
|
162
223
|
begin
|
163
224
|
results = nil # result string
|
164
225
|
if options[:eval]
|
165
226
|
res = rec.eval(options[:eval],ignore_missing,quiet)
|
166
227
|
results = res if res
|
167
228
|
end
|
168
|
-
if
|
229
|
+
if seval
|
169
230
|
list = (results ? [] : [rec.chr,rec.pos])
|
170
231
|
rec.each_sample(options[:sfilter_samples]) { | sample |
|
171
|
-
list << sample.eval(
|
232
|
+
list << sample.eval(seval,ignore_missing,quiet)
|
172
233
|
}
|
173
|
-
results = (results ? results + "\t" : "" ) + list.join("\t")
|
234
|
+
results = (results ? results.to_s + "\t" : "" ) + list.join("\t")
|
174
235
|
end
|
175
236
|
rescue => e
|
176
237
|
$stderr.print "\nLine: ",line
|
@@ -183,23 +244,60 @@ def parse_line line,header,options,samples
|
|
183
244
|
else
|
184
245
|
if options[:rdf]
|
185
246
|
# Output Turtle RDF
|
186
|
-
if not header_out
|
187
|
-
VcfRdf::header
|
188
|
-
header_out = true
|
189
|
-
end
|
190
247
|
VcfRdf::record(options[:id],rec,options[:tags])
|
191
248
|
elsif options[:rewrite]
|
192
249
|
# Default behaviour prints VCF line, but rewrite info
|
193
250
|
eval(options[:rewrite])
|
194
|
-
print (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")
|
251
|
+
print (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
195
252
|
else
|
196
253
|
# Default behaviour prints VCF line
|
197
|
-
print fields.join("\t")
|
254
|
+
$stdout.print fields.join("\t")+"\n"
|
255
|
+
$stdout.flush
|
256
|
+
return true
|
198
257
|
end
|
199
258
|
end
|
200
259
|
end
|
201
260
|
|
202
|
-
|
261
|
+
# Collect a buffer of lines and feed them to a thread
|
262
|
+
# Returns the created pid, tempfilen and count_threads
|
263
|
+
# (Note: this function should be turned into a closure)
|
264
|
+
def parse_lines lines,header,options,samples,tempdir,count_threads
|
265
|
+
pid = nil
|
266
|
+
threadfilen = nil
|
267
|
+
if options[:num_threads]
|
268
|
+
lines2 = lines.map { |l| l.clone }
|
269
|
+
count_threads += 1
|
270
|
+
threadfilen = tempdir+sprintf("/%0.6d-pid",count_threads)+'.bio-vcf'
|
271
|
+
pid = fork do
|
272
|
+
count_lines = 0
|
273
|
+
tempfn = threadfilen+'.running'
|
274
|
+
STDOUT.reopen(File.open(tempfn, 'w+'))
|
275
|
+
lines2.each do | line |
|
276
|
+
count_lines +=1 if parse_line(line,header,options,samples)
|
277
|
+
end
|
278
|
+
STDOUT.flush
|
279
|
+
STDOUT.close
|
280
|
+
FileUtils::mv(tempfn,threadfilen)
|
281
|
+
exit 0
|
282
|
+
end
|
283
|
+
Process::detach(pid)
|
284
|
+
else
|
285
|
+
lines.each do | line |
|
286
|
+
parse_line line,header,options,samples
|
287
|
+
end
|
288
|
+
end
|
289
|
+
return pid,threadfilen,count_threads
|
290
|
+
end
|
291
|
+
|
292
|
+
# Make sure no more than num_threads are running at the same time
|
293
|
+
def manage_thread_pool(workers, thread_list, num_threads)
|
294
|
+
while true
|
295
|
+
# ---- count running pids
|
296
|
+
running = thread_list.reduce(0) { | sum, thread_info | ( File.exist?(thread_info[1]+'.running') ? sum+1 : sum ) }
|
297
|
+
break if running < num_threads
|
298
|
+
sleep 0.1
|
299
|
+
end
|
300
|
+
end
|
203
301
|
|
204
302
|
opts.parse!(ARGV)
|
205
303
|
|
@@ -216,55 +314,99 @@ $stderr.print "Options: ",options,"\n" if !options[:quiet]
|
|
216
314
|
if options[:samples]
|
217
315
|
samples = options[:samples].map { |s| s.to_i }
|
218
316
|
end
|
219
|
-
|
220
|
-
|
317
|
+
|
318
|
+
num_threads = options[:num_threads]
|
319
|
+
num_threads = 8 if num_threads != nil and num_threads < 2
|
320
|
+
|
321
|
+
header = nil
|
322
|
+
header_output_completed = false
|
221
323
|
line_number=0
|
324
|
+
lines = []
|
325
|
+
thread_list = []
|
326
|
+
workers = []
|
327
|
+
thread_lines = options[:thread_lines]
|
328
|
+
count_threads=0
|
222
329
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
330
|
+
orig_std_out = STDOUT.clone
|
331
|
+
|
332
|
+
Dir::mktmpdir("bio-vcf_") do |tempdir|
|
333
|
+
$stderr.print "Using #{tempdir} for temporary files\n" if num_threads
|
334
|
+
|
335
|
+
# ---- Main loop
|
336
|
+
STDIN.each_line do | line |
|
337
|
+
line_number += 1
|
338
|
+
$stderr.print '.' if line_number % thread_lines == 0 and not options[:quiet]
|
339
|
+
begin
|
340
|
+
# ---- In this section header information is handled
|
341
|
+
next if header_output_completed and line =~ /^#/
|
342
|
+
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
343
|
+
header,line = parse_header(line,samples,options)
|
344
|
+
end
|
345
|
+
next if line =~ /^##/ # empty file
|
346
|
+
header_output_completed = true
|
347
|
+
if not options[:efilter_samples] and options[:ifilter_samples]
|
348
|
+
# Create exclude set as a complement of include set
|
349
|
+
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
350
|
+
end
|
351
|
+
|
352
|
+
# ---- In this section the VCF variant lines are parsed
|
353
|
+
lines << line
|
354
|
+
if lines.size > thread_lines
|
355
|
+
manage_thread_pool(workers,thread_list,num_threads) if options[:num_threads]
|
356
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
357
|
+
count_threads = thread_list.last[2]
|
358
|
+
lines = []
|
359
|
+
end
|
360
|
+
rescue Exception => e
|
361
|
+
# $stderr.print line
|
362
|
+
$stderr.print e.message,"\n"
|
363
|
+
raise if options[:verbose]
|
364
|
+
exit 1
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
thread_list << parse_lines(lines,header,options,samples,tempdir,count_threads)
|
369
|
+
count_threads = thread_list.last[2]
|
370
|
+
|
371
|
+
# ---- In this section the output gets collected and printed on STDOUT
|
372
|
+
if options[:num_threads]
|
373
|
+
STDOUT.reopen(orig_std_out)
|
374
|
+
$stderr.print "Final pid=#{thread_list.last[0]}, size=#{lines.size}\n"
|
375
|
+
lines = []
|
376
|
+
|
377
|
+
fault = false
|
378
|
+
# Wait for the running threads to complete
|
379
|
+
thread_list.each do |info|
|
380
|
+
(pid,threadfn) = info
|
381
|
+
tempfn = threadfn + '.running'
|
382
|
+
$stderr.print "Waiting up to 3 minutes for pid=#{pid} to complete\n"
|
383
|
+
begin
|
384
|
+
Timeout.timeout(180) do
|
385
|
+
while not File.exist?(threadfn) # wait for the result to appear
|
386
|
+
sleep 0.2
|
252
387
|
end
|
253
388
|
end
|
389
|
+
# Thread file should have gone:
|
390
|
+
raise "FATAL: child process appears to have crashed #{tempfn}" if File.exist?(tempfn)
|
391
|
+
$stderr.print "OK pid=#{pid}\n"
|
392
|
+
rescue Timeout::Error
|
393
|
+
Process.kill 9, pid
|
394
|
+
Process.wait pid
|
395
|
+
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}\n"
|
396
|
+
fault = true
|
254
397
|
end
|
255
398
|
end
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
399
|
+
# Collate the output
|
400
|
+
thread_list.each do | info |
|
401
|
+
(pid,fn) = info
|
402
|
+
# This should never happen
|
403
|
+
raise "FATAL: child process output #{fn} is missing" if not File.exist?(fn)
|
404
|
+
$stderr.print "Reading #{fn}\n"
|
405
|
+
File.new(fn).each_line { |buf|
|
406
|
+
print buf
|
407
|
+
}
|
408
|
+
File.unlink(fn)
|
260
409
|
end
|
261
|
-
|
262
|
-
parse_line line,header,options,samples
|
263
|
-
rescue Exception => e
|
264
|
-
# $stderr.print line
|
265
|
-
$stderr.print e.message,"\n"
|
266
|
-
raise if options[:verbose]
|
267
|
-
exit 1
|
410
|
+
return 1 if fault
|
268
411
|
end
|
269
|
-
end
|
270
|
-
|
412
|
+
end # cleans up tempdir
|