bio-vcf 0.7.3 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +89 -4
- data/VERSION +1 -1
- data/bin/bio-vcf +138 -189
- data/bio-vcf.gemspec +5 -2
- data/features/cli.feature +1 -1
- data/features/step_definitions/sfilter.rb +1 -1
- data/lib/bio-vcf/vcfrdf.rb +82 -0
- data/lib/bio-vcf/vcfsample.rb +2 -2
- data/template/gatk_vcf2rdf.erb +35 -0
- data/template/vcf2json.erb +8 -0
- data/template/vcf2rdf.erb +12 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: da9c14380c66a497089fef836a22d44c7651f264
|
4
|
+
data.tar.gz: 3291080afde13a1b7cd392d4f1f06ff275ec1f1b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1584a14d8de45dc04115ae6458f3d1b6e9c69c9d19092dfefc18e411fc347d660ed1ca586e3e3b4327d4774e323a78eabe3434a4cd8eab7d87b1f4e37915c0e
|
7
|
+
data.tar.gz: d0a56668f17a272807167bde9caa066ba960ac597b9ce57a205fb2564b9944421e5ef4035f43ab855f72ab2c9ece6b963ef01043372db5601520f5593a1fe31b
|
data/README.md
CHANGED
@@ -4,7 +4,8 @@
|
|
4
4
|
|
5
5
|
A new generation VCF parser. Bio-vcf is not only fast for genome-wide
|
6
6
|
(WGS) data, it also comes with a really nice filtering, evaluation and
|
7
|
-
rewrite language
|
7
|
+
rewrite language and it can output any type of textual data, including
|
8
|
+
RDF and JSON. Why would you use bio-vcf over other parsers?
|
8
9
|
|
9
10
|
1. Bio-vcf is fast and scales on multi-core computers
|
10
11
|
2. Bio-vcf has an expressive filtering and evaluation language
|
@@ -15,7 +16,7 @@ rewrite language. Why would you use bio-vcf over other parsers?
|
|
15
16
|
7. Bio-vcf allows for genotype processing
|
16
17
|
8. Bio-vcf has support for set analysis
|
17
18
|
9. Bio-vcf has sane error handling
|
18
|
-
10. Bio-vcf can output tabular data, HTML, LaTeX, RDF and
|
19
|
+
10. Bio-vcf can output tabular data, HTML, LaTeX, RDF, JSON and JSON-LD using templates
|
19
20
|
|
20
21
|
Bio-vcf has better performance than other tools
|
21
22
|
because of lazy parsing, multi-threading, and useful combinations of
|
@@ -233,6 +234,12 @@ gem install bio-vcf
|
|
233
234
|
bio-vcf -h
|
234
235
|
```
|
235
236
|
|
237
|
+
For multi-core also install the parallel gem
|
238
|
+
|
239
|
+
```sh
|
240
|
+
gem install parallel
|
241
|
+
```
|
242
|
+
|
236
243
|
## Command line interface (CLI)
|
237
244
|
|
238
245
|
Get the version of the VCF file
|
@@ -628,7 +635,7 @@ To remove/select 3 samples:
|
|
628
635
|
|
629
636
|
## RDF output
|
630
637
|
|
631
|
-
You can use --rdf for turtle RDF output, note the use of --id and
|
638
|
+
You can use --rdf for turtle RDF output from simple one-liners, note the use of --id and
|
632
639
|
--tags which includes the MAF record:
|
633
640
|
|
634
641
|
```ruby
|
@@ -641,6 +648,8 @@ bio-vcf --id evs --rdf --tags '{"db:evs" => true, "seq:freq" => rec.info.maf[0]/
|
|
641
648
|
:evs_ch9_139266496_T seq:freq 0.419801 .
|
642
649
|
```
|
643
650
|
|
651
|
+
Also check out the more powerful templating system below.
|
652
|
+
|
644
653
|
It is possible to filter too! Pick out the rare variants with
|
645
654
|
|
646
655
|
```ruby
|
@@ -660,9 +669,85 @@ or without AF
|
|
660
669
|
bio-vcf --id gonl --rdf --tags '{"db:gonl" => true, "seq:freq" => (rec.info.ac.to_f/rec.info.an).round(2) }' < gonl_germline_overlap_r4.vcf
|
661
670
|
```
|
662
671
|
|
672
|
+
Also check out [bio-table](https://github.com/pjotrp/bioruby-table) to convert tabular data to RDF.
|
673
|
+
|
674
|
+
## Templates
|
663
675
|
|
676
|
+
To have more output options bio-vcf can use an [ERB
|
677
|
+
template](http://www.stuartellis.eu/articles/erb/) for every match. This is a
|
678
|
+
very flexible option that can output textual formats such as JSON, YAML, HTML
|
679
|
+
and RDF. Examples are provided in
|
680
|
+
[./templates](https://github.com/pjotrp/bioruby-vcf/templates/). A JSON
|
681
|
+
template could be
|
664
682
|
|
665
|
-
|
683
|
+
```Javascript
|
684
|
+
{
|
685
|
+
"seq:chr": "<%= rec.chrom %>" ,
|
686
|
+
"seq:pos": <%= rec.pos %> ,
|
687
|
+
"seq:ref": "<%= rec.ref %>" ,
|
688
|
+
"seq:alt": "<%= rec.alt[0] %>" ,
|
689
|
+
"seq:maf": <%= rec.info.maf[0] %> ,
|
690
|
+
"dp": <%= rec.info.dp %> ,
|
691
|
+
};
|
692
|
+
```
|
693
|
+
|
694
|
+
To get JSON, run with something like
|
695
|
+
|
696
|
+
```sh
|
697
|
+
bio-vcf --template template/vcf2json.erb --filter 'r.info.maf[0]<0.01' < dbsnp.vcf
|
698
|
+
```
|
699
|
+
|
700
|
+
which renders
|
701
|
+
|
702
|
+
```Javascript
|
703
|
+
{
|
704
|
+
"seq:chr": "13" ,
|
705
|
+
"seq:pos": 35745475 ,
|
706
|
+
"seq:ref": "C" ,
|
707
|
+
"seq:alt": "T" ,
|
708
|
+
"seq:maf": 0.0151 ,
|
709
|
+
"dp": 86 ,
|
710
|
+
};
|
711
|
+
```
|
712
|
+
|
713
|
+
Likewise for RDF output:
|
714
|
+
|
715
|
+
```sh
|
716
|
+
bio-vcf --template template/vcf2rdf.erb --filter 'r.info.maf[0]<0.01' < dbsnp.vcf
|
717
|
+
```
|
718
|
+
|
719
|
+
renders the ERB template
|
720
|
+
|
721
|
+
```ruby
|
722
|
+
<%
|
723
|
+
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
724
|
+
%>
|
725
|
+
:<%= id %>
|
726
|
+
:query_id "<%= id %>",
|
727
|
+
seq:chr "<%= rec.chrom %>" ,
|
728
|
+
seq:pos <%= rec.pos %> ,
|
729
|
+
seq:ref "<%= rec.ref %>" ,
|
730
|
+
seq:alt "<%= rec.alt[0] %>" ,
|
731
|
+
seq:maf <%= rec.info.maf[0] %> ,
|
732
|
+
seq:dp <%= rec.info.dp %> ,
|
733
|
+
db:vcf true .
|
734
|
+
```
|
735
|
+
|
736
|
+
into
|
737
|
+
|
738
|
+
```
|
739
|
+
:ch13_33703698_A
|
740
|
+
:query_id "ch13_33703698_A",
|
741
|
+
seq:chr "13" ,
|
742
|
+
seq:pos 33703698 ,
|
743
|
+
seq:ref "C" ,
|
744
|
+
seq:alt "A" ,
|
745
|
+
seq:maf 0.1567 ,
|
746
|
+
seq:dp 92 ,
|
747
|
+
db:vcf true .
|
748
|
+
```
|
749
|
+
|
750
|
+
Be creative! You can write templates for csv, HTML, XML, LaTeX, RDF, JSON, YAML, JSON-LD, etc. etc.!
|
666
751
|
|
667
752
|
## Statistics
|
668
753
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.8.0
|
data/bin/bio-vcf
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
|
-
#
|
3
|
+
# bio-vcf parser and transformer
|
4
4
|
# Author:: Pjotr Prins
|
5
|
+
# License:: MIT
|
5
6
|
#
|
6
7
|
# Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
|
7
8
|
|
@@ -17,7 +18,6 @@ require 'bio-vcf'
|
|
17
18
|
require 'optparse'
|
18
19
|
require 'timeout'
|
19
20
|
require 'fileutils'
|
20
|
-
require 'tempfile'
|
21
21
|
|
22
22
|
# Uncomment when using the bio-logger
|
23
23
|
# require 'bio-logger'
|
@@ -26,7 +26,7 @@ require 'tempfile'
|
|
26
26
|
# Bio::Log::CLI.logger('stderr')
|
27
27
|
# Bio::Log::CLI.trace('info')
|
28
28
|
|
29
|
-
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines:
|
29
|
+
options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000 }
|
30
30
|
opts = OptionParser.new do |o|
|
31
31
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
32
32
|
|
@@ -75,7 +75,7 @@ opts = OptionParser.new do |o|
|
|
75
75
|
o.on("--samples list", Array, "Output selected samples") do |l|
|
76
76
|
options[:samples] = l
|
77
77
|
end
|
78
|
-
o.on("--rdf", "Generate Turtle RDF") do |b|
|
78
|
+
o.on("--rdf", "Generate Turtle RDF (also check out --template!)") do |b|
|
79
79
|
require 'bio-vcf/vcfrdf'
|
80
80
|
options[:rdf] = true
|
81
81
|
options[:skip_header] = true
|
@@ -101,6 +101,14 @@ opts = OptionParser.new do |o|
|
|
101
101
|
options[:set_header] = list
|
102
102
|
options[:skip_header] = true
|
103
103
|
end
|
104
|
+
|
105
|
+
o.on("-t erb","--template erb",String, "Use ERB template for output") do |s|
|
106
|
+
require 'bio-vcf/vcfrdf'
|
107
|
+
require 'erb'
|
108
|
+
options[:template] = s
|
109
|
+
options[:skip_header] = true
|
110
|
+
end
|
111
|
+
|
104
112
|
|
105
113
|
# Uncomment the following when using the bio-logger
|
106
114
|
# o.separator ""
|
@@ -135,6 +143,53 @@ opts = OptionParser.new do |o|
|
|
135
143
|
end
|
136
144
|
end
|
137
145
|
|
146
|
+
opts.parse!(ARGV)
|
147
|
+
|
148
|
+
$stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
149
|
+
|
150
|
+
if options[:show_help]
|
151
|
+
print opts
|
152
|
+
print USAGE
|
153
|
+
exit 1
|
154
|
+
end
|
155
|
+
|
156
|
+
if RUBY_VERSION =~ /^1/
|
157
|
+
$stderr.print "WARNING: bio-vcf runs on Ruby 2.x only\n"
|
158
|
+
end
|
159
|
+
|
160
|
+
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
161
|
+
|
162
|
+
if options[:template]
|
163
|
+
include BioVcf::RDF
|
164
|
+
fn = options[:template]
|
165
|
+
raise "No template #{fn}!" if not File.exist?(fn)
|
166
|
+
template = ERB.new(File.read(fn))
|
167
|
+
end
|
168
|
+
|
169
|
+
if options[:num_threads] != 1
|
170
|
+
begin
|
171
|
+
require 'parallel'
|
172
|
+
rescue LoadError
|
173
|
+
$stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
|
174
|
+
options[:num_threads] = 1
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
178
|
+
stats = nil
|
179
|
+
if options[:statistics]
|
180
|
+
options[:num_threads] = nil
|
181
|
+
stats = BioVcf::VcfStatistics.new
|
182
|
+
end
|
183
|
+
|
184
|
+
# Check for option combinations
|
185
|
+
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
186
|
+
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
187
|
+
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
188
|
+
|
189
|
+
if options[:samples]
|
190
|
+
samples = options[:samples].map { |s| s.to_i }
|
191
|
+
end
|
192
|
+
|
138
193
|
include BioVcf
|
139
194
|
|
140
195
|
# Parse the header section of a VCF file
|
@@ -171,8 +226,8 @@ def parse_header line, samples, options
|
|
171
226
|
return header,line
|
172
227
|
end
|
173
228
|
|
174
|
-
# Parse a VCF line
|
175
|
-
def parse_line line,header,options,samples,stats=nil
|
229
|
+
# Parse a VCF line and return the result as a string
|
230
|
+
def parse_line line,header,options,samples,template,stats=nil
|
176
231
|
fields = VcfLine.parse(line)
|
177
232
|
rec = VcfRecord.new(fields,header)
|
178
233
|
r = rec # alias
|
@@ -251,216 +306,110 @@ def parse_line line,header,options,samples,stats=nil
|
|
251
306
|
raise if options[:verbose]
|
252
307
|
exit 1
|
253
308
|
end
|
254
|
-
|
255
|
-
exit(1) if options[:eval_once]
|
309
|
+
return results.to_s+"\n" if results
|
310
|
+
exit(1) if options[:eval_once] # <--- can this be reached?
|
256
311
|
else
|
257
312
|
if options[:rdf]
|
258
313
|
# Output Turtle RDF
|
259
314
|
VcfRdf::record(options[:id],rec,options[:tags])
|
315
|
+
elsif options[:template]
|
316
|
+
# Ruby ERB template
|
317
|
+
begin
|
318
|
+
template.result(binding)
|
319
|
+
rescue Exception => e
|
320
|
+
$stderr.print e,": ",fields,"\n"
|
321
|
+
$stderr.print e.backtrace.inspect if options[:verbose]
|
322
|
+
raise
|
323
|
+
end
|
260
324
|
elsif options[:rewrite]
|
261
325
|
# Default behaviour prints VCF line, but rewrite info
|
262
326
|
eval(options[:rewrite])
|
263
|
-
|
327
|
+
(fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
264
328
|
elsif stats
|
265
329
|
# do nothing
|
266
330
|
else
|
267
331
|
# Default behaviour prints VCF line
|
268
|
-
|
269
|
-
$stdout.flush
|
270
|
-
return true
|
332
|
+
fields.join("\t")+"\n"
|
271
333
|
end
|
272
334
|
end
|
273
335
|
end
|
274
336
|
|
275
|
-
# Collect a buffer of lines and feed them to a thread
|
276
|
-
# Returns the created pid, tempfilen and count_threads
|
277
|
-
# (Note: this function should be turned into a closure)
|
278
|
-
def parse_lines lines,header,options,samples,tempdir,count_threads,stats
|
279
|
-
pid = nil
|
280
|
-
threadfilen = nil
|
281
|
-
if options[:num_threads]
|
282
|
-
count_threads += 1
|
283
|
-
threadfilen = tempdir+sprintf("/%0.6d-pid",count_threads)+'.bio-vcf'
|
284
|
-
pid = fork do
|
285
|
-
count_lines = 0
|
286
|
-
tempfn = threadfilen+'.running'
|
287
|
-
STDOUT.reopen(File.open(tempfn, 'w+'))
|
288
|
-
lines.each do | line |
|
289
|
-
count_lines +=1 if parse_line(line,header,options,samples)
|
290
|
-
end
|
291
|
-
STDOUT.flush
|
292
|
-
STDOUT.close
|
293
|
-
FileUtils::mv(tempfn,threadfilen)
|
294
|
-
exit 0
|
295
|
-
end
|
296
|
-
else
|
297
|
-
lines.each do | line |
|
298
|
-
parse_line line,header,options,samples,stats
|
299
|
-
end
|
300
|
-
end
|
301
|
-
return pid,threadfilen,count_threads
|
302
|
-
end
|
303
|
-
|
304
|
-
# Make sure no more than num_threads are running at the same time
|
305
|
-
def manage_thread_pool(workers, thread_list, num_threads)
|
306
|
-
while true
|
307
|
-
# ---- count running pids
|
308
|
-
running = thread_list.reduce(0) do | sum, thread_info |
|
309
|
-
if thread_info[0] && pid_running?(thread_info[0])
|
310
|
-
sum+1
|
311
|
-
elsif nil == thread_info[0] && File.exist?(thread_info[1]+'.running')
|
312
|
-
sum+1
|
313
|
-
else
|
314
|
-
sum
|
315
|
-
end
|
316
|
-
end
|
317
|
-
break if running < num_threads
|
318
|
-
sleep 0.1
|
319
|
-
end
|
320
|
-
end
|
321
|
-
|
322
|
-
def pid_running?(pid)
|
323
|
-
begin
|
324
|
-
fpid,status=Process.waitpid2(pid,Process::WNOHANG)
|
325
|
-
rescue Errno::ECHILD, Errno::ESRCH
|
326
|
-
return false
|
327
|
-
end
|
328
|
-
return true if nil == fpid && nil == status
|
329
|
-
return ! (status.exited? || status.signaled?)
|
330
|
-
end
|
331
|
-
|
332
|
-
opts.parse!(ARGV)
|
333
|
-
|
334
|
-
$stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
335
|
-
|
336
|
-
if options[:show_help]
|
337
|
-
print opts
|
338
|
-
print USAGE
|
339
|
-
exit 1
|
340
|
-
end
|
341
|
-
|
342
|
-
if RUBY_VERSION =~ /^1/
|
343
|
-
$stderr.print "WARNING: bio-vcf runs on Ruby 2.x only\n"
|
344
|
-
end
|
345
|
-
|
346
|
-
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
347
|
-
|
348
|
-
stats = nil
|
349
|
-
if options[:statistics]
|
350
|
-
options[:num_threads] = nil
|
351
|
-
stats = BioVcf::VcfStatistics.new
|
352
|
-
end
|
353
|
-
|
354
|
-
# Check for option combinations
|
355
|
-
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
356
|
-
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
357
|
-
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
358
|
-
|
359
|
-
if options[:samples]
|
360
|
-
samples = options[:samples].map { |s| s.to_i }
|
361
|
-
end
|
362
|
-
|
363
|
-
num_threads = options[:num_threads]
|
364
|
-
num_threads = 8 if num_threads != nil and num_threads < 2
|
365
|
-
|
366
337
|
header = nil
|
367
338
|
header_output_completed = false
|
368
|
-
|
339
|
+
NUM_THREADS = options[:num_threads]
|
340
|
+
CHUNK_SIZE = options[:thread_lines]
|
341
|
+
CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
|
342
|
+
chunks = []
|
369
343
|
lines = []
|
370
|
-
|
371
|
-
workers = []
|
372
|
-
thread_lines = options[:thread_lines]
|
373
|
-
count_threads=0
|
374
|
-
|
375
|
-
orig_std_out = STDOUT.clone
|
344
|
+
line_number=0
|
376
345
|
|
377
346
|
begin
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
lines = []
|
405
|
-
end
|
347
|
+
process = lambda { | lines |
|
348
|
+
res = []
|
349
|
+
lines.each do | line |
|
350
|
+
res << parse_line(line,header,options,samples,template,stats)
|
351
|
+
end
|
352
|
+
res
|
353
|
+
}
|
354
|
+
output = lambda { |collection|
|
355
|
+
collection.each do | result |
|
356
|
+
result.each { |line| print line }
|
357
|
+
end
|
358
|
+
} # end output
|
359
|
+
|
360
|
+
# ---- Main loop
|
361
|
+
STDIN.each_line do | line |
|
362
|
+
line_number += 1
|
363
|
+
# ---- In this section header information is handled
|
364
|
+
next if header_output_completed and line =~ /^#/
|
365
|
+
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
366
|
+
header,line = parse_header(line,samples,options)
|
367
|
+
end
|
368
|
+
next if line =~ /^##/ # empty file
|
369
|
+
header_output_completed = true
|
370
|
+
if not options[:efilter_samples] and options[:ifilter_samples]
|
371
|
+
# Create exclude set as a complement of include set
|
372
|
+
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
406
373
|
end
|
407
374
|
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
lines = []
|
416
|
-
|
417
|
-
fault = false
|
418
|
-
# Wait for the running threads to complete
|
419
|
-
thread_list.each do |info|
|
420
|
-
(pid,threadfn) = info
|
421
|
-
tempfn = threadfn + '.running'
|
422
|
-
timeout = 180
|
423
|
-
if (pid && !pid_running?(pid)) || fault
|
424
|
-
# no point to wait for a long time if we've failed one already or the proc is dead
|
425
|
-
timeout = 1
|
426
|
-
end
|
427
|
-
$stderr.print "Waiting up to #{timeout/60} minutes for pid=#{pid} to complete\n"
|
428
|
-
begin
|
429
|
-
Timeout.timeout(timeout) do
|
430
|
-
while not File.exist?(threadfn) # wait for the result to appear
|
431
|
-
sleep 0.2
|
432
|
-
end
|
433
|
-
end
|
434
|
-
# Thread file should have gone:
|
435
|
-
raise "FATAL: child process appears to have crashed #{tempfn}" if File.exist?(tempfn)
|
436
|
-
$stderr.print "OK pid=#{pid}\n"
|
437
|
-
rescue Timeout::Error
|
438
|
-
if pid_running?(pid)
|
439
|
-
Process.kill 9, pid
|
440
|
-
Process.wait pid
|
441
|
-
end
|
442
|
-
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}\n"
|
443
|
-
fault = true
|
444
|
-
end
|
375
|
+
# ---- In this section the VCF variant lines are parsed
|
376
|
+
lines << line
|
377
|
+
if NUM_THREADS == 1
|
378
|
+
$stderr.print '.' if line_number % CHUNK_SIZE == 0 and not options[:quiet]
|
379
|
+
if lines.size > CHUNK_SIZE
|
380
|
+
process.call(lines).each { | l | print l }
|
381
|
+
lines = []
|
445
382
|
end
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
if
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
File.new(fn).each_line { |buf|
|
454
|
-
print buf
|
383
|
+
else
|
384
|
+
if lines.size > CHUNK_SIZE
|
385
|
+
chunks << lines
|
386
|
+
if chunks.size > CHUNK_NUM
|
387
|
+
$stderr.print '.' if not options[:quiet]
|
388
|
+
out = Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
|
389
|
+
process.call(chunk)
|
455
390
|
}
|
456
|
-
|
391
|
+
chunks = []
|
392
|
+
# Output is forked to a separate process too
|
393
|
+
fork do
|
394
|
+
output.call out
|
395
|
+
STDOUT.flush
|
396
|
+
STDOUT.close
|
397
|
+
exit 0
|
398
|
+
end
|
457
399
|
end
|
458
|
-
|
400
|
+
lines = []
|
459
401
|
end
|
460
|
-
return 1 if fault
|
461
402
|
end
|
462
|
-
end
|
463
|
-
|
403
|
+
end
|
404
|
+
$stderr.print '.' if not options[:quiet]
|
405
|
+
if NUM_THREADS == 1
|
406
|
+
process.call(lines).each { |l| print l}
|
407
|
+
else
|
408
|
+
chunks << lines
|
409
|
+
output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
|
410
|
+
process.call(chunk)
|
411
|
+
}
|
412
|
+
end
|
464
413
|
stats.print if stats
|
465
414
|
|
466
415
|
rescue Exception => e
|
data/bio-vcf.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-vcf"
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.8.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = "2014-09-
|
12
|
+
s.date = "2014-09-19"
|
13
13
|
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
|
14
14
|
s.email = "pjotr.public01@thebird.nl"
|
15
15
|
s.executables = ["bio-vcf"]
|
@@ -50,6 +50,9 @@ Gem::Specification.new do |s|
|
|
50
50
|
"lib/bio-vcf/vcfrecord.rb",
|
51
51
|
"lib/bio-vcf/vcfsample.rb",
|
52
52
|
"lib/bio-vcf/vcfstatistics.rb",
|
53
|
+
"template/gatk_vcf2rdf.erb",
|
54
|
+
"template/vcf2json.erb",
|
55
|
+
"template/vcf2rdf.erb",
|
53
56
|
"test/data/input/dbsnp.vcf",
|
54
57
|
"test/data/input/multisample.vcf",
|
55
58
|
"test/data/input/somaticsniper.vcf",
|
data/features/cli.feature
CHANGED
@@ -51,6 +51,6 @@ Feature: Command-line interface (CLI)
|
|
51
51
|
|
52
52
|
Scenario: Test deadlock on failed filter with threads
|
53
53
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
54
|
-
When I execute "./bin/bio-vcf
|
54
|
+
When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
55
55
|
Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
|
56
56
|
|
@@ -117,7 +117,7 @@ When(/^I evaluate empty '\.\/\.' with ignore missing$/) do
|
|
117
117
|
end
|
118
118
|
|
119
119
|
Then(/^I expect s\.what\? to throw an error$/) do
|
120
|
-
expect { @s.eval("s.what?",do_cache: false) }.to raise_error
|
120
|
+
expect { @s.eval("s.what?",do_cache: false) }.to raise_error NoMethodError
|
121
121
|
end
|
122
122
|
|
123
123
|
Then(/^I expect s\.what to throw an error$/) do
|
data/lib/bio-vcf/vcfrdf.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module BioVcf
|
2
2
|
|
3
3
|
# This is some primarily RDF support - which may be moved to another gem
|
4
|
+
#
|
5
|
+
# Note that this functionality is superseded by the --template option! Though
|
6
|
+
# this can be useful for one-liners.
|
4
7
|
|
5
8
|
module VcfRdf
|
6
9
|
|
@@ -34,4 +37,83 @@ OUT
|
|
34
37
|
print "\n"
|
35
38
|
end
|
36
39
|
end
|
40
|
+
|
41
|
+
|
42
|
+
# RDF support module. Original is part of bioruby-rdf by Pjotr Prins
|
43
|
+
#
|
44
|
+
|
45
|
+
module RDF
|
46
|
+
|
47
|
+
def RDF::valid_uri? uri
|
48
|
+
uri =~ /^([!#$&-;=?_a-z~]|%[0-9a-f]{2})+$/i
|
49
|
+
end
|
50
|
+
|
51
|
+
def RDF::escape_string_literal(literal)
|
52
|
+
s = literal.to_s
|
53
|
+
# Put a backslash before every double quote if there is no such backslash already
|
54
|
+
s = s.gsub(/(?<!\\)"/,'\"')
|
55
|
+
# Put a backslash before a lone backslash unless it is followed by one of [uUtnr\"]
|
56
|
+
if s =~ /[^\\]\\[^\\]/
|
57
|
+
s2 = []
|
58
|
+
s.each_char.with_index { |c,i|
|
59
|
+
res = c
|
60
|
+
if i>0 and c == '\\' and s[i-1] != '\\' and s[i+1] !~ /^[uUtnr\\"]/
|
61
|
+
res = '\\' + c
|
62
|
+
end
|
63
|
+
# p [i,c,s[i+1],res]
|
64
|
+
s2 << res
|
65
|
+
}
|
66
|
+
s = s2.join('')
|
67
|
+
end
|
68
|
+
s
|
69
|
+
end
|
70
|
+
|
71
|
+
def RDF::stringify_literal(literal)
|
72
|
+
RDF::escape_string_literal(literal.to_s)
|
73
|
+
end
|
74
|
+
|
75
|
+
def RDF::quoted_stringify_literal(literal)
|
76
|
+
'"' + stringify_literal(literal) + '"'
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
module Turtle
|
81
|
+
|
82
|
+
def Turtle::stringify_literal(literal)
|
83
|
+
RDF::stringify_literal(literal)
|
84
|
+
end
|
85
|
+
|
86
|
+
def Turtle::identifier(id)
|
87
|
+
raise "Illegal identifier #{id}" if id != Turtle::mangle_identifier(id)
|
88
|
+
end
|
89
|
+
|
90
|
+
# Replace letters/symbols that are not allowed in a Turtle identifier
|
91
|
+
# (shorthand URI). This should be the definitive mangler and replace the
|
92
|
+
# ones in bioruby-table and bio-exominer. Manglers are useful when using
|
93
|
+
# data from other sources and trying to transform them into simple RDF
|
94
|
+
# identifiers.
|
95
|
+
|
96
|
+
def Turtle::mangle_identifier(s)
|
97
|
+
id = s.strip.gsub(/[^[:print:]]/, '').gsub(/[#)(,]/,"").gsub(/[%]/,"perc").gsub(/(\s|\.|\$|\/|\\|\>)+/,"_")
|
98
|
+
id = id.gsub(/\[|\]/,'')
|
99
|
+
# id = URI::escape(id)
|
100
|
+
id = id.gsub(/\|/,'_')
|
101
|
+
id = id.gsub(/\-|:/,'_')
|
102
|
+
if id != s
|
103
|
+
# Don't want Bio dependency in templates!
|
104
|
+
# logger = Bio::Log::LoggerPlus.new 'bio-rdf'
|
105
|
+
# logger.warn "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
106
|
+
# $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
107
|
+
end
|
108
|
+
if not RDF::valid_uri?(id)
|
109
|
+
raise "Invalid URI after mangling <#{s}> to <#{id}>!"
|
110
|
+
end
|
111
|
+
valid_id = if id =~ /^\d/
|
112
|
+
'r' + id
|
113
|
+
else
|
114
|
+
id
|
115
|
+
end
|
116
|
+
valid_id # we certainly hope so!
|
117
|
+
end
|
118
|
+
end
|
37
119
|
end
|
data/lib/bio-vcf/vcfsample.rb
CHANGED
@@ -3,7 +3,7 @@ module BioVcf
|
|
3
3
|
|
4
4
|
# Check whether a sample is empty (on the raw string value)
|
5
5
|
def VcfSample::empty? s
|
6
|
-
s == './.' or s == '' or s
|
6
|
+
s==nil or s == './.' or s == '' or s[0..2]=='./.'
|
7
7
|
end
|
8
8
|
|
9
9
|
class Sample
|
@@ -77,7 +77,7 @@ module BioVcf
|
|
77
77
|
|
78
78
|
def fetch_values name
|
79
79
|
n = @format[name]
|
80
|
-
raise "Unknown sample field <#{name}>" if not n
|
80
|
+
raise NoMethodError.new("Unknown sample field <#{name}>") if not n
|
81
81
|
@values[n] # <-- save names with upcase!
|
82
82
|
end
|
83
83
|
|
@@ -0,0 +1,35 @@
|
|
1
|
+
<%
|
2
|
+
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
3
|
+
sample_num = 0
|
4
|
+
%>
|
5
|
+
:<%= id %>
|
6
|
+
:query_id "<%= id %>";
|
7
|
+
seq:chr "<%= rec.chrom %>" ;
|
8
|
+
seq:pos <%= rec.pos %> ;
|
9
|
+
seq:ref "<%= rec.ref %>" ;
|
10
|
+
seq:alt "<%= rec.alt[0] %>" ;
|
11
|
+
db:gatk true .
|
12
|
+
|
13
|
+
<% rec.each_sample do | s | %>
|
14
|
+
<% if not s.empty?
|
15
|
+
sample_name = header.samples[sample_num]
|
16
|
+
sample_id = id + '_' + Turtle::mangle_identifier(sample_name)
|
17
|
+
sample_num += 1
|
18
|
+
if s.ad[0]+s.ad[1] != 0
|
19
|
+
alt_bias = (s.ad[1].to_f/(s.ad[0]+s.ad[1])).round(2)
|
20
|
+
end
|
21
|
+
%>
|
22
|
+
:<%= sample_id %>
|
23
|
+
:call_id :<%= id %> ;
|
24
|
+
sample:name "<%= sample_name %>" ;
|
25
|
+
sample:gt "<%= s.gt %>" ;
|
26
|
+
<% s.gti.each do | index | %>
|
27
|
+
sample:ad<%= index %> <%= s.ad[index] %> ;
|
28
|
+
sample:gts<%= index %> "<%= s.gts[index] %>" ;
|
29
|
+
<% end %>
|
30
|
+
sample:dp <%= s.dp %> ;
|
31
|
+
sample:alt_bias <%= alt_bias %> .
|
32
|
+
<% end %>
|
33
|
+
<% end %>
|
34
|
+
|
35
|
+
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<%
|
2
|
+
id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_'))
|
3
|
+
%>
|
4
|
+
:<%= id %>
|
5
|
+
:query_id "<%= id %>";
|
6
|
+
seq:chr "<%= rec.chrom %>" ;
|
7
|
+
seq:pos <%= rec.pos %> ;
|
8
|
+
seq:ref "<%= rec.ref %>" ;
|
9
|
+
seq:alt "<%= rec.alt[0] %>" ;
|
10
|
+
seq:dp <%= rec.info.dp %> ;
|
11
|
+
db:vcf true .
|
12
|
+
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-vcf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -108,6 +108,9 @@ files:
|
|
108
108
|
- lib/bio-vcf/vcfrecord.rb
|
109
109
|
- lib/bio-vcf/vcfsample.rb
|
110
110
|
- lib/bio-vcf/vcfstatistics.rb
|
111
|
+
- template/gatk_vcf2rdf.erb
|
112
|
+
- template/vcf2json.erb
|
113
|
+
- template/vcf2rdf.erb
|
111
114
|
- test/data/input/dbsnp.vcf
|
112
115
|
- test/data/input/multisample.vcf
|
113
116
|
- test/data/input/somaticsniper.vcf
|