bio-vcf 0.8.1 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +2 -8
- data/LICENSE.txt +1 -1
- data/README.md +467 -129
- data/RELEASE_NOTES.md +27 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +9 -42
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +156 -108
- data/bio-vcf.gemspec +13 -75
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +12 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +1 -11
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
- data/lib/bio-vcf/vcfheader.rb +137 -5
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +26 -2
- data/lib/regressiontest.rb +11 -0
- data/lib/regressiontest/cli_exec.rb +101 -0
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +16 -16
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +63 -64
- data/Gemfile.lock +0 -81
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -13,38 +13,49 @@ module BioVcf
|
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
-
|
17
|
-
def []= k, v
|
18
|
-
split_fields if not @h
|
19
|
-
kupper = k.upcase
|
20
|
-
@h[kupper] = v
|
21
|
-
@original_key[kupper] = k
|
22
|
-
end
|
23
|
-
|
24
|
-
def method_missing(m, *args, &block)
|
16
|
+
# Fetch an INFO field by key (case-insensitive) and convert the raw
# string with ConvertStringToValue::convert.
#
# As long as the INFO string has not been split into the @h hash, a regex
# lookup on the raw @info string is tried first (a regex using named
# captures via String#match was measured to be slower). Only when that
# finds no 'key=value' pair is split_fields called and the hash consulted.
def [] k
  kupper = k.upcase
  v = if @h
        @h[kupper]
      else
        # The key has to start the INFO string or directly follow a ';'.
        # An anchor has no meaning inside a character class, so the
        # alternation is written out. The key is escaped because it is
        # interpolated into the pattern.
        @info =~ /(?:\A|;)#{Regexp.escape(k)}=([^;]+)/i
        value = $1
        if value == nil
          split_fields # no option but to split
          @h[kupper]
        else
          value
        end
      end
  ConvertStringToValue::convert(v)
end
|
37
|
+
|
38
|
+
# Assign an INFO field (used by --rewrite). The value is stored under the
# upper-cased key; the key as given by the caller is remembered in
# @original_key.
def []= k, v
  split_fields unless @h
  key = k.upcase
  @h.store(key, v)
  @original_key.store(key, k)
end
|
45
|
+
|
46
|
+
def fields
|
47
|
+
split_fields
|
48
|
+
@h.keys
|
49
|
+
end
|
50
|
+
|
51
|
+
def method_missing(m, *args, &block)
|
52
|
+
self[m.to_s]
|
43
53
|
end
|
44
54
|
|
45
55
|
private
|
46
56
|
|
47
57
|
def split_fields
|
58
|
+
return @h if @h
|
48
59
|
@h = {}
|
49
60
|
@original_key = {}
|
50
61
|
@info.split(/;/).each do |f|
|
@@ -151,6 +162,10 @@ module BioVcf
|
|
151
162
|
@qual ||= @fields[5].to_f
|
152
163
|
end
|
153
164
|
|
165
|
+
def filter
|
166
|
+
@filter ||= @fields[6]
|
167
|
+
end
|
168
|
+
|
154
169
|
def info
|
155
170
|
@info ||= VcfRecordParser.get_info(@fields[7])
|
156
171
|
end
|
@@ -184,15 +199,21 @@ module BioVcf
|
|
184
199
|
end
|
185
200
|
|
186
201
|
def sample_by_index i
|
187
|
-
# p @fields
|
188
202
|
raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
|
189
203
|
@sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,ref,alt)
|
190
204
|
end
|
191
205
|
|
192
206
|
# Walk the samples, yielding a VcfSample::Sample for every index that
# @header.sample_subset_index returns for list (an Array of int, the
# sample indices; nil by default).
def each_sample(list = nil)
  indices = @header.sample_subset_index(list)
  indices.each do |idx|
    yield VcfSample::Sample.new(self, sample_by_index(idx))
  end
end
|
212
|
+
|
213
|
+
# Return an Array holding every sample that each_sample yields.
def samples
  [].tap do |collected|
    each_sample { |sample| collected << sample }
  end
end
|
197
218
|
|
198
219
|
def missing_samples?
|
@@ -229,6 +250,7 @@ module BioVcf
|
|
229
250
|
$stderr.print "RECORD ERROR!\n"
|
230
251
|
$stderr.print [@fields],"\n"
|
231
252
|
$stderr.print expr,"\n"
|
253
|
+
$stderr.print "To ignore this error use the -i switch!\n"
|
232
254
|
end
|
233
255
|
if ignore_missing_data
|
234
256
|
$stderr.print e.message if not quiet
|
@@ -239,19 +261,19 @@ module BioVcf
|
|
239
261
|
end
|
240
262
|
end
|
241
263
|
|
242
|
-
def
|
264
|
+
def gfilter expr, ignore_missing_data: true, quiet: false
|
243
265
|
begin
|
244
266
|
if not respond_to?(:call_cached_filter)
|
245
267
|
code =
|
246
268
|
"""
|
247
|
-
def
|
269
|
+
def call_cached_gfilter(rec,fields)
|
248
270
|
r = rec
|
249
271
|
#{expr}
|
250
272
|
end
|
251
273
|
"""
|
252
274
|
self.class.class_eval(code)
|
253
275
|
end
|
254
|
-
res =
|
276
|
+
res = call_cached_gfilter(self,@fields)
|
255
277
|
if res.kind_of?(Array)
|
256
278
|
res.join("\t")
|
257
279
|
else
|
@@ -262,6 +284,7 @@ module BioVcf
|
|
262
284
|
$stderr.print "RECORD ERROR!\n"
|
263
285
|
$stderr.print [@fields],"\n"
|
264
286
|
$stderr.print expr,"\n"
|
287
|
+
$stderr.print "To ignore this error use the -i switch!\n"
|
265
288
|
end
|
266
289
|
if ignore_missing_data
|
267
290
|
$stderr.print e.message if not quiet
|
@@ -272,6 +295,21 @@ module BioVcf
|
|
272
295
|
end
|
273
296
|
end
|
274
297
|
|
298
|
+
# Add str to the FILTER column (@fields[6]) and return the new value.
#
# A missing, empty, '.' or 'PASS' filter is replaced by str. Otherwise
# str is appended with a ';' separator unless it is already listed.
# An empty result is written as '.'.
def add_to_filter_field str
  current = @fields[6]
  updated =
    if !current || current == '' || current == '.' || current == 'PASS'
      str
    elsif current.split(/;/).include?(str)
      current
    else
      current + ';' + str
    end
  updated = '.' if updated.nil? || updated == ''
  @fields[6] = updated
  # The filter reader memoizes @fields[6] in @filter; refresh it so a
  # later call does not return the value from before this update.
  @filter = updated
end
|
312
|
+
|
275
313
|
# Return the sample
|
276
314
|
def method_missing(m, *args, &block)
|
277
315
|
name = m.to_s
|
data/lib/bio-vcf/vcfsample.rb
CHANGED
@@ -3,7 +3,7 @@ module BioVcf
|
|
3
3
|
|
4
4
|
# Check whether a sample is empty (on the raw string value): nil, the
# empty string, or a value starting with './.' or '.:'.
def VcfSample::empty? s
  return true if s.nil?
  return true if s == ''
  s[0, 3] == './.' || s[0, 2] == '.:'
end
|
8
8
|
|
9
9
|
class Sample
|
@@ -40,9 +40,24 @@ module BioVcf
|
|
40
40
|
# Split GT on '/' or '|' into index values; a '.' allele becomes nil.
def gti
  genotype = fetch_values("GT")
  # A single missing value counts as both alleles missing.
  genotype = './.' if genotype == '.'
  genotype.split(/[\/\|]/).map do |allele|
    allele == '.' ? nil : allele.to_i
  end
end
|
45
46
|
|
47
|
+
# Map the GT value to an index: '0/0' => 0, '0/1' => 1, '1/1' => 2.
# A nil, '.' or './.' genotype gives nil; anything else raises.
def gtindex
  genotype = fetch_values("GT")
  return nil if genotype.nil?
  return nil if ['.', './.'].include?(genotype)
  index = ['0/0', '0/1', '1/1'].index(genotype)
  raise "Unknown genotype #{genotype}" unless index
  index
end
|
60
|
+
|
46
61
|
# Split GT into a nucleotide sequence
|
47
62
|
def gts
|
48
63
|
gti.map { |i| (i ? @rec.get_gt(i) : nil) }
|
@@ -51,7 +66,16 @@ module BioVcf
|
|
51
66
|
def cache_method(name, &block)
|
52
67
|
self.class.send(:define_method, name, &block)
|
53
68
|
end
|
54
|
-
|
69
|
+
|
70
|
+
# Look up a field of this sample by name. Returns nil when the name is
# not present in @format or when VcfValue::empty? says the fetched value
# is empty; otherwise the value converted by ConvertStringToValue::convert.
def [] name
  return nil unless @format[name]
  raw = fetch_values(name)
  VcfValue::empty?(raw) ? nil : ConvertStringToValue::convert(raw)
end
|
78
|
+
|
55
79
|
def method_missing(m, *args, &block)
|
56
80
|
name = m.to_s.upcase
|
57
81
|
# p [:here,name,m ,@values]
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'regressiontest/cli_exec'
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'fileutils'
require 'timeout'
|
2
|
+
|
3
|
+
module RegressionTest

  DEFAULT_TESTDIR = "test/data/regression"

  # Regression test runner compares output in ./test/data/regression
  # (by default). The convention is to keep a reference file named .ref
  # and to create a .new file on every run; stderr is captured the same
  # way in -stderr.ref and -stderr.new.
  #
  # You can add an :ignore regex option which ignores lines in the
  # comparison files matching a regex
  #
  # :timeout sets the time out (in seconds) for calling a system command
  #
  # :should_fail expects the system command to return a non-zero
  # exit status
  module CliExec
    FilePair = Struct.new(:outfn,:reffn)

    # Run command through the shell, capture stdout/stderr in the .new
    # files that belong to testname and compare them with the .ref files.
    #
    # Returns true when the exit status is as expected and every existing
    # .ref file matches its .new file, false otherwise (including a
    # timeout). Raises when no reference file can be found for testname.
    def CliExec::exec command, testname, options = {}
      # ---- Find .ref file
      fullname = DEFAULT_TESTDIR + "/" + testname
      basefn = if File.exist?(testname+".ref") || File.exist?(testname+"-stderr.ref")
                 testname
               elsif File.exist?(fullname + ".ref") || File.exist?(fullname+"-stderr.ref")
                 FileUtils.mkdir_p DEFAULT_TESTDIR
                 fullname
               else
                 raise "Can not find reference file for #{testname} - expected #{fullname}.ref"
               end
      std_out = FilePair.new(basefn + ".new", basefn + ".ref")
      std_err = FilePair.new(basefn + "-stderr.new", basefn + "-stderr.ref")
      files = [std_out,std_err]
      # ---- Create .new file
      cmd = command + " > #{std_out.outfn} 2>#{std_err.outfn}"
      $stderr.print cmd,"\n"
      exec_ret = nil
      if options[:timeout] && options[:timeout] > 0
        # The rescue has to wrap Timeout.timeout: the exception that
        # interrupts the block can not be rescued inside the block.
        begin
          Timeout.timeout(options[:timeout]) do
            exec_ret = Kernel.system(cmd)
          end
        rescue Timeout::Error
          $stderr.print cmd, " failed to finish in under #{options[:timeout]}\n"
          return false
        end
      else
        exec_ret = Kernel.system(cmd)
      end
      expect_fail = (options[:should_fail] != nil)
      # Kernel.system returns true on a zero exit status, false on a
      # non-zero one and nil when the command could not be executed; it
      # never returns 0.
      if !expect_fail and !exec_ret
        $stderr.print cmd," returned an error\n"
        return false
      end
      if expect_fail and exec_ret
        $stderr.print cmd," did not return an error\n"
        return false
      end
      if options[:ignore]
        pattern = /#{options[:ignore]}/
        files.each { |f| strip_ignored_lines(f.outfn, pattern) }
      end
      # ---- Compare the two files
      files.each do |f|
        next unless File.exist?(f.reffn)
        return false unless compare_files(f.outfn,f.reffn,options[:ignore])
      end
      return true
    end

    # Rewrite outfn in place without the lines that match pattern. The
    # unfiltered content is parked in outfn + ".1" while rewriting.
    def CliExec::strip_ignored_lines outfn, pattern
      outfn1 = outfn + ".1"
      FileUtils.mv(outfn,outfn1)
      File.open(outfn,"w") do |out|
        File.foreach(outfn1) do |line|
          out.print(line) if line !~ pattern
        end
      end
      FileUtils::rm(outfn1)
    end
    private_class_method :strip_ignored_lines

    # Compare fn1 (new output) with fn2 (reference) using diff. When the
    # reference does not exist yet, fn1 is copied to fn2 and true is
    # returned. The ignore parameter is accepted but not used here.
    def CliExec::compare_files fn1, fn2, ignore = nil
      if not File.exist?(fn2)
        FileUtils::cp(fn1,fn2)
        true
      else
        cmd = "diff #{fn2} #{fn1}"
        $stderr.print cmd+"\n"
        return true if Kernel.system(cmd) == true
        # Hmmm. We have a different result. We are going to try again
        # because sometimes threads have not completed
        sleep 0.25
        return true if Kernel.system(cmd) == true
        $stderr.print "If it is correct, execute \"cp #{fn1} #{fn2}\", and run again"
        false
      end
    end
  end

end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# Ragel lexer for VCF-header
|
2
|
+
#
|
3
|
+
# This is a compact parser/lexer for the VCF header format. Bio-vcf
|
4
|
+
# uses the parser to generate meta information that can be output to
|
5
|
+
# (for example) JSON format. The advantage of using ragel as a state
|
6
|
+
# engine is that it allows for easy parsing of key-value pairs with
|
7
|
+
# syntax checking and, for example, escaped quotes in quoted string
|
8
|
+
# values. This ragel parser/lexer generates valid Ruby; it should be
|
9
|
+
# fairly trivial to generate python/C/JAVA instead. Note that this
|
10
|
+
# edition validates ID and Number fields only. Other fields are
|
11
|
+
# dumped 'AS IS'.
|
12
|
+
#
|
13
|
+
# Note the .rb version is generated from ./ragel/gen_vcfheaderline_parser.rl
|
14
|
+
#
|
15
|
+
# by Pjotr Prins (c) 2014/2015
|
16
|
+
|
17
|
+
module BioVcf
|
18
|
+
|
19
|
+
module VcfHeaderParser
|
20
|
+
|
21
|
+
module RagelKeyValues
|
22
|
+
|
23
|
+
def self.debug msg
|
24
|
+
# nothing
|
25
|
+
# $stderr.print "DEBUG: ",msg,"\n"
|
26
|
+
end
|
27
|
+
|
28
|
+
=begin
|
29
|
+
%%{
|
30
|
+
|
31
|
+
machine simple_lexer;
|
32
|
+
|
33
|
+
action mark { ts=p }
|
34
|
+
action endquoted {
|
35
|
+
emit.call(:value,data,ts,p)
|
36
|
+
}
|
37
|
+
|
38
|
+
action kw {
|
39
|
+
emit.call(:kw,data,ts,p)
|
40
|
+
}
|
41
|
+
|
42
|
+
squote = "'";
|
43
|
+
dquote = '"';
|
44
|
+
not_squote_or_escape = [^'\\];
|
45
|
+
not_dquote_or_escape = [^"\\];
|
46
|
+
escaped_something = /\\./;
|
47
|
+
ss = squote ( not_squote_or_escape | escaped_something )* >mark %endquoted squote;
|
48
|
+
dd = dquote ( not_dquote_or_escape | escaped_something )* >mark %endquoted dquote;
|
49
|
+
|
50
|
+
integer = ('+'|'-')?digit+;
|
51
|
+
float = ('+'|'-')?digit+'.'digit+;
|
52
|
+
assignment = '=';
|
53
|
+
identifier = ( alnum (alnum|'.'|'_')* );
|
54
|
+
version = ( digit (alnum|'.'|'_'|'-')* );
|
55
|
+
str = (ss|dd)* ;
|
56
|
+
boolean = '.';
|
57
|
+
date = str;
|
58
|
+
key_word = ( ('Type'|'Description'|'Source'|identifier - ('ID'|'Number'|'length'|'Version'|'assembly'|'Date'|'CommandLineOptions')) >mark %{ emit.call(:key_word,data,ts,p) } );
|
59
|
+
any_value = ( str|( integer|float|boolean|identifier >mark %{ emit.call(:value,data,ts,p) } ));
|
60
|
+
id_value = ( identifier >mark %{ emit.call(:value,data,ts,p) } );
|
61
|
+
|
62
|
+
version_value = ( str| ( version >mark %{ emit.call(:value,data,ts,p) } ));
|
63
|
+
date_value = ( date );
|
64
|
+
gatk_value = ( str );
|
65
|
+
number_value = ( ( integer|boolean|'A'|'R'|'G' ) >mark %{ emit.call(:value,data,ts,p) } );
|
66
|
+
|
67
|
+
id_kv = ( ( ('ID'|'assembly') %kw '=' id_value ) %{ debug("ID FOUND") } @!{ error_code="Malformed ID"} );
|
68
|
+
version_kv = ( ( ('Version') %kw '=' version_value ) @!{ error_code="Version"} );
|
69
|
+
number_kv = ( ( ('Number'|'length') %kw '=' number_value ) @!{ error_code="Number"} );
|
70
|
+
date_kv = ( ( ('Date') %kw '=' date_value ) %{ debug("DATE FOUND") } @!{ error_code="Date"} );
|
71
|
+
gatk_kv = ( ( ('CommandLineOptions') %kw '=' gatk_value ) @!{ error_code="GATK"} );
|
72
|
+
key_value = ( id_kv | version_kv | date_kv | number_kv | gatk_kv | (key_word '=' any_value) ) %{ debug("KEY_VALUE found") } >mark @!{ error_code="unknown key-value " };
|
73
|
+
|
74
|
+
main := ( '##' ('FILTER'|'FORMAT'|'contig'|'INFO'|'ALT'|'GATKCommandLine') '=') (('<'|',') key_value )* '>';
|
75
|
+
}%%
|
76
|
+
=end
|
77
|
+
|
78
|
+
%% write data;
|
79
|
+
# %% this just fixes syntax highlighting...
|
80
|
+
|
81
|
+
def self.run_lexer(buf, options = {})
|
82
|
+
do_debug = (options[:debug] == true)
|
83
|
+
$stderr.print "---> ",buf,"\n" if do_debug
|
84
|
+
data = buf.unpack("c*") if(buf.is_a?(String))
|
85
|
+
eof = data.length
|
86
|
+
values = []
|
87
|
+
stack = []
|
88
|
+
|
89
|
+
emit = lambda { |type, data, ts, p|
|
90
|
+
# Print the type and text of the last read token
|
91
|
+
# p ts,p
|
92
|
+
$stderr.print "EMITTED: #{type}: #{data[ts...p].pack('c*')}\n" if do_debug
|
93
|
+
values << [type,data[ts...p].pack('c*')]
|
94
|
+
}
|
95
|
+
|
96
|
+
error_code = nil
|
97
|
+
|
98
|
+
%% write init;
|
99
|
+
%% write exec;
|
100
|
+
|
101
|
+
raise "ERROR: "+error_code+" in "+buf if error_code
|
102
|
+
|
103
|
+
begin
|
104
|
+
res = {}
|
105
|
+
# p values
|
106
|
+
values.each_slice(2) do | a,b |
|
107
|
+
$stderr.print '*',a,b if do_debug
|
108
|
+
keyword = a[1]
|
109
|
+
value = b[1]
|
110
|
+
value = value.to_i if ['length','Epoch'].index(keyword)
|
111
|
+
res[keyword] = value
|
112
|
+
# p h[:value] if h[:name]==:identifier or h[:name]==:value or h[:name]==:string
|
113
|
+
end
|
114
|
+
rescue
|
115
|
+
print "ERROR: "
|
116
|
+
p values
|
117
|
+
raise
|
118
|
+
end
|
119
|
+
$stderr.print(res,"\n") if do_debug
|
120
|
+
res
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
if __FILE__ == $0
|
127
|
+
|
128
|
+
gatkcommandline = <<LINE1
|
129
|
+
##GATKCommandLine=<ID=CombineVariants,Version=3.2-2-gec30cee,Date="Thu Oct 30 13:41:59 CET 2014",Epoch=1414672919266,CommandLineOptions="analysis_type=CombineVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/hpc/cog_bioinf/GENOMES/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 refactor_NDN_cigar_string=false fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false variant=[(RodBindingCollection [(RodBinding name=variant source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_snps.vcf)]), (RodBindingCollection [(RodBinding name=variant2 source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_indels.vcf)])] out=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub 
no_cmdline_in_header=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false excludeNonVariants=false setKey=set assumeIdenticalSamples=false minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
|
130
|
+
LINE1
|
131
|
+
|
132
|
+
h = {}
|
133
|
+
s = gatkcommandline.strip
|
134
|
+
# print s,"\n"
|
135
|
+
result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
|
136
|
+
# h[result['ID']] = result
|
137
|
+
# p result
|
138
|
+
|
139
|
+
lines = <<LINES
|
140
|
+
##FILTER=<ID=HaplotypeScoreHigh,Description="HaplotypeScore > 13.0">
|
141
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
142
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth",Extra="Yes?">
|
143
|
+
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
|
144
|
+
##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
|
145
|
+
##INFO=<ID=VP,Number=1,Type=String,Description="Variation Property. Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf",Source="dbsnp",Version="138">
|
146
|
+
##INFO=<ID=GENEINFO,Number=1,Type=String,Description="Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)">
|
147
|
+
##INFO=<ID=CLNHGVS,Number=.,Type=String,Description="Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical INFO tags.">
|
148
|
+
##INFO=<ID=CLNHGVS1,Number=.,Type=String,Description="Variant names from \\"HGVS\\". The order of these 'variants' corresponds to the order of the info in the other clinical INFO tags.">
|
149
|
+
##contig=<ID=XXXY12>
|
150
|
+
##contig=<ID=Y,length=59373566>
|
151
|
+
LINES
|
152
|
+
|
153
|
+
h = {}
|
154
|
+
lines.strip.split("\n").each { |s|
|
155
|
+
# print s,"\n"
|
156
|
+
result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
|
157
|
+
h[result['ID']] = result
|
158
|
+
p result
|
159
|
+
}
|
160
|
+
p h
|
161
|
+
|
162
|
+
raise "ERROR" if h != {"HaplotypeScoreHigh"=>{"ID"=>"HaplotypeScoreHigh", "Description"=>"HaplotypeScore > 13.0"}, "GT"=>{"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}, "DP"=>{"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth", "Extra"=>"Yes?"}, "DP4"=>{"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}, "PM"=>{"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}, "VP"=>{"ID"=>"VP", "Number"=>"1", "Type"=>"String", "Description"=>"Variation Property. Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf", "Source"=>"dbsnp", "Version"=>"138"}, "GENEINFO"=>{"ID"=>"GENEINFO", "Number"=>"1", "Type"=>"String", "Description"=>"Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)"}, "CLNHGVS"=>{"ID"=>"CLNHGVS", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical INFO tags."}, "CLNHGVS1"=>{"ID"=>"CLNHGVS1", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from \\\"HGVS\\\". The order of these 'variants' corresponds to the order of the info in the other clinical INFO tags."}, "XXXY12"=>{"ID"=>"XXXY12"}, "Y"=>{"ID"=>"Y", "length"=>59373566}}
|
163
|
+
|
164
|
+
|
165
|
+
end # test
|