bio-vcf 0.8.0 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +4 -5
- data/Gemfile.lock +28 -65
- data/LICENSE.txt +1 -1
- data/README.md +387 -107
- data/RELEASE_NOTES.md +20 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +3 -40
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +176 -109
- data/bio-vcf.gemspec +14 -70
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +25 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +0 -9
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/bedfilter.rb +43 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/template.rb +75 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
- data/lib/bio-vcf/vcfheader.rb +146 -6
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +27 -3
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +19 -7
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/template/vcf2rdf_header.erb +24 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +65 -64
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -13,38 +13,49 @@ module BioVcf
|
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
-
|
17
|
-
def []= k, v
|
18
|
-
split_fields if not @h
|
19
|
-
kupper = k.upcase
|
20
|
-
@h[kupper] = v
|
21
|
-
@original_key[kupper] = k
|
22
|
-
end
|
23
|
-
|
24
|
-
def method_missing(m, *args, &block)
|
16
|
+
def [] k
|
25
17
|
# split_fields if not @h
|
26
18
|
# /#{m}=(?<value>[^;])/.@info
|
19
|
+
kupper = k.upcase
|
27
20
|
v = if @h
|
28
|
-
@h[
|
21
|
+
@h[kupper]
|
29
22
|
else
|
30
|
-
@info =~
|
23
|
+
@info =~ /[\A;]#{k}=([^;]+)/i
|
31
24
|
value = $1
|
32
25
|
# p [m,value]
|
33
26
|
# m = @info.match(/#{m.to_s.upcase}=(?<value>[^;]+)/) slower!
|
34
27
|
# value = m[:value]
|
35
28
|
if value == nil
|
36
29
|
split_fields # no option but to split
|
37
|
-
@h[
|
30
|
+
@h[kupper]
|
38
31
|
else
|
39
32
|
value
|
40
33
|
end
|
41
34
|
end
|
42
35
|
ConvertStringToValue::convert(v)
|
36
|
+
end
|
37
|
+
|
38
|
+
# Set INFO fields (used by --rewrite)
|
39
|
+
def []= k, v
|
40
|
+
split_fields if not @h
|
41
|
+
kupper = k.upcase
|
42
|
+
@h[kupper] = v
|
43
|
+
@original_key[kupper] = k
|
44
|
+
end
|
45
|
+
|
46
|
+
def fields
|
47
|
+
split_fields
|
48
|
+
@h.keys
|
49
|
+
end
|
50
|
+
|
51
|
+
def method_missing(m, *args, &block)
|
52
|
+
self[m.to_s]
|
43
53
|
end
|
44
54
|
|
45
55
|
private
|
46
56
|
|
47
57
|
def split_fields
|
58
|
+
return @h if @h
|
48
59
|
@h = {}
|
49
60
|
@original_key = {}
|
50
61
|
@info.split(/;/).each do |f|
|
@@ -151,6 +162,10 @@ module BioVcf
|
|
151
162
|
@qual ||= @fields[5].to_f
|
152
163
|
end
|
153
164
|
|
165
|
+
def filter
|
166
|
+
@filter ||= @fields[6]
|
167
|
+
end
|
168
|
+
|
154
169
|
def info
|
155
170
|
@info ||= VcfRecordParser.get_info(@fields[7])
|
156
171
|
end
|
@@ -184,15 +199,21 @@ module BioVcf
|
|
184
199
|
end
|
185
200
|
|
186
201
|
def sample_by_index i
|
187
|
-
# p @fields
|
188
202
|
raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
|
189
203
|
@sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,ref,alt)
|
190
204
|
end
|
191
205
|
|
192
206
|
# Walk the samples. list contains an Array of int (the index)
|
193
207
|
def each_sample(list = nil)
|
194
|
-
|
195
|
-
|
208
|
+
@header.sample_subset_index(list).each { |i|
|
209
|
+
yield VcfSample::Sample.new(self,sample_by_index(i))
|
210
|
+
}
|
211
|
+
end
|
212
|
+
|
213
|
+
def samples
|
214
|
+
list = []
|
215
|
+
each_sample { |s| list << s }
|
216
|
+
list
|
196
217
|
end
|
197
218
|
|
198
219
|
def missing_samples?
|
@@ -229,6 +250,7 @@ module BioVcf
|
|
229
250
|
$stderr.print "RECORD ERROR!\n"
|
230
251
|
$stderr.print [@fields],"\n"
|
231
252
|
$stderr.print expr,"\n"
|
253
|
+
$stderr.print "To ignore this error use the -i switch!\n"
|
232
254
|
end
|
233
255
|
if ignore_missing_data
|
234
256
|
$stderr.print e.message if not quiet
|
@@ -239,19 +261,19 @@ module BioVcf
|
|
239
261
|
end
|
240
262
|
end
|
241
263
|
|
242
|
-
def
|
264
|
+
def gfilter expr, ignore_missing_data: true, quiet: false
|
243
265
|
begin
|
244
266
|
if not respond_to?(:call_cached_filter)
|
245
267
|
code =
|
246
268
|
"""
|
247
|
-
def
|
269
|
+
def call_cached_gfilter(rec,fields)
|
248
270
|
r = rec
|
249
271
|
#{expr}
|
250
272
|
end
|
251
273
|
"""
|
252
274
|
self.class.class_eval(code)
|
253
275
|
end
|
254
|
-
res =
|
276
|
+
res = call_cached_gfilter(self,@fields)
|
255
277
|
if res.kind_of?(Array)
|
256
278
|
res.join("\t")
|
257
279
|
else
|
@@ -262,6 +284,7 @@ module BioVcf
|
|
262
284
|
$stderr.print "RECORD ERROR!\n"
|
263
285
|
$stderr.print [@fields],"\n"
|
264
286
|
$stderr.print expr,"\n"
|
287
|
+
$stderr.print "To ignore this error use the -i switch!\n"
|
265
288
|
end
|
266
289
|
if ignore_missing_data
|
267
290
|
$stderr.print e.message if not quiet
|
@@ -272,6 +295,21 @@ module BioVcf
|
|
272
295
|
end
|
273
296
|
end
|
274
297
|
|
298
|
+
def add_to_filter_field str
|
299
|
+
filter = @fields[6]
|
300
|
+
if not filter or filter == '.' or filter == 'PASS'
|
301
|
+
filter = str
|
302
|
+
else
|
303
|
+
values = filter.split(/;/)
|
304
|
+
if not values.include?(str)
|
305
|
+
filter = filter +';'+str
|
306
|
+
end
|
307
|
+
end
|
308
|
+
filter = '.' if filter == nil or filter == ''
|
309
|
+
@fields[6] = filter
|
310
|
+
filter
|
311
|
+
end
|
312
|
+
|
275
313
|
# Return the sample
|
276
314
|
def method_missing(m, *args, &block)
|
277
315
|
name = m.to_s
|
data/lib/bio-vcf/vcfsample.rb
CHANGED
@@ -3,7 +3,7 @@ module BioVcf
|
|
3
3
|
|
4
4
|
# Check whether a sample is empty (on the raw string value)
|
5
5
|
def VcfSample::empty? s
|
6
|
-
s==nil or s == './.' or s == '' or s[0..2]=='./.'
|
6
|
+
s==nil or s == './.' or s == '' or s[0..2]=='./.' or s[0..1] == '.:'
|
7
7
|
end
|
8
8
|
|
9
9
|
class Sample
|
@@ -40,9 +40,24 @@ module BioVcf
|
|
40
40
|
# Split GT into index values
|
41
41
|
def gti
|
42
42
|
v = fetch_values("GT")
|
43
|
-
v
|
43
|
+
v = './.' if v == '.' #In case that you have a single missing value, make both as missing.
|
44
|
+
v.split(/[\/\|]/).map{ |v| (v=='.' ? nil : v.to_i) }
|
44
45
|
end
|
45
46
|
|
47
|
+
def gtindex
|
48
|
+
v = fetch_values("GT")
|
49
|
+
return case v
|
50
|
+
when nil then nil
|
51
|
+
when '.' then nil
|
52
|
+
when './.' then nil
|
53
|
+
when '0/0' then 0
|
54
|
+
when '0/1' then 1
|
55
|
+
when '1/1' then 2
|
56
|
+
else
|
57
|
+
raise "Unknown genotype #{v}"
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
46
61
|
# Split GT into a nucleotide sequence
|
47
62
|
def gts
|
48
63
|
gti.map { |i| (i ? @rec.get_gt(i) : nil) }
|
@@ -51,7 +66,16 @@ module BioVcf
|
|
51
66
|
def cache_method(name, &block)
|
52
67
|
self.class.send(:define_method, name, &block)
|
53
68
|
end
|
54
|
-
|
69
|
+
|
70
|
+
def [] name
|
71
|
+
if @format[name]
|
72
|
+
v = fetch_values(name)
|
73
|
+
return nil if VcfValue::empty?(v)
|
74
|
+
return ConvertStringToValue::convert(v)
|
75
|
+
end
|
76
|
+
nil
|
77
|
+
end
|
78
|
+
|
55
79
|
def method_missing(m, *args, &block)
|
56
80
|
name = m.to_s.upcase
|
57
81
|
# p [:here,name,m ,@values]
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# Ragel lexer for VCF-header
|
2
|
+
#
|
3
|
+
# This is a compact parser/lexer for the VCF header format. Bio-vcf
|
4
|
+
# uses the parser to generate meta information that can be output to
|
5
|
+
# (for example) JSON format. The advantage of using ragel as a state
|
6
|
+
# engine is that it allows for easy parsing of key-value pairs with
|
7
|
+
# syntax checking and, for example, escaped quotes in quoted string
|
8
|
+
# values. This ragel parser/lexer generates valid Ruby; it should be
|
9
|
+
# fairly trivial to generate python/C/JAVA instead. Note that this
|
10
|
+
# edition validates ID and Number fields only. Other fields are
|
11
|
+
# dumped 'AS IS'.
|
12
|
+
#
|
13
|
+
# Note the .rb version is generated from ./ragel/gen_vcfheaderline_parser.rl
|
14
|
+
#
|
15
|
+
# by Pjotr Prins (c) 2014/2015
|
16
|
+
|
17
|
+
module BioVcf
|
18
|
+
|
19
|
+
module VcfHeaderParser
|
20
|
+
|
21
|
+
module RagelKeyValues
|
22
|
+
|
23
|
+
def self.debug msg
|
24
|
+
# nothing
|
25
|
+
# $stderr.print "DEBUG: ",msg,"\n"
|
26
|
+
end
|
27
|
+
|
28
|
+
=begin
|
29
|
+
%%{
|
30
|
+
|
31
|
+
machine simple_lexer;
|
32
|
+
|
33
|
+
action mark { ts=p }
|
34
|
+
action endquoted {
|
35
|
+
emit.call(:value,data,ts,p)
|
36
|
+
}
|
37
|
+
|
38
|
+
action kw {
|
39
|
+
emit.call(:kw,data,ts,p)
|
40
|
+
}
|
41
|
+
|
42
|
+
squote = "'";
|
43
|
+
dquote = '"';
|
44
|
+
not_squote_or_escape = [^'\\];
|
45
|
+
not_dquote_or_escape = [^"\\];
|
46
|
+
escaped_something = /\\./;
|
47
|
+
ss = squote ( not_squote_or_escape | escaped_something )* >mark %endquoted squote;
|
48
|
+
dd = dquote ( not_dquote_or_escape | escaped_something )* >mark %endquoted dquote;
|
49
|
+
|
50
|
+
integer = ('+'|'-')?digit+;
|
51
|
+
float = ('+'|'-')?digit+'.'digit+;
|
52
|
+
assignment = '=';
|
53
|
+
identifier = ( alnum (alnum|'.'|'_')* );
|
54
|
+
version = ( digit (alnum|'.'|'_'|'-')* );
|
55
|
+
str = (ss|dd)* ;
|
56
|
+
boolean = '.';
|
57
|
+
date = str;
|
58
|
+
key_word = ( ('Type'|'Description'|'Source'|identifier - ('ID'|'Number'|'length'|'Version'|'assembly'|'Date'|'CommandLineOptions')) >mark %{ emit.call(:key_word,data,ts,p) } );
|
59
|
+
any_value = ( str|( integer|float|boolean|identifier >mark %{ emit.call(:value,data,ts,p) } ));
|
60
|
+
id_value = ( identifier >mark %{ emit.call(:value,data,ts,p) } );
|
61
|
+
|
62
|
+
version_value = ( str| ( version >mark %{ emit.call(:value,data,ts,p) } ));
|
63
|
+
date_value = ( date );
|
64
|
+
gatk_value = ( str );
|
65
|
+
number_value = ( ( integer|boolean|'A'|'R'|'G' ) >mark %{ emit.call(:value,data,ts,p) } );
|
66
|
+
|
67
|
+
id_kv = ( ( ('ID'|'assembly') %kw '=' id_value ) %{ debug("ID FOUND") } @!{ error_code="Malformed ID"} );
|
68
|
+
version_kv = ( ( ('Version') %kw '=' version_value ) @!{ error_code="Version"} );
|
69
|
+
number_kv = ( ( ('Number'|'length') %kw '=' number_value ) @!{ error_code="Number"} );
|
70
|
+
date_kv = ( ( ('Date') %kw '=' date_value ) %{ debug("DATE FOUND") } @!{ error_code="Date"} );
|
71
|
+
gatk_kv = ( ( ('CommandLineOptions') %kw '=' gatk_value ) @!{ error_code="GATK"} );
|
72
|
+
key_value = ( id_kv | version_kv | date_kv | number_kv | gatk_kv | (key_word '=' any_value) ) %{ debug("KEY_VALUE found") } >mark @!{ error_code="unknown key-value " };
|
73
|
+
|
74
|
+
main := ( '##' ('FILTER'|'FORMAT'|'contig'|'INFO'|'ALT'|'GATKCommandLine') '=') (('<'|',') key_value )* '>';
|
75
|
+
}%%
|
76
|
+
=end
|
77
|
+
|
78
|
+
%% write data;
|
79
|
+
# %% this just fixes syntax highlighting...
|
80
|
+
|
81
|
+
def self.run_lexer(buf, options = {})
|
82
|
+
do_debug = (options[:debug] == true)
|
83
|
+
$stderr.print "---> ",buf,"\n" if do_debug
|
84
|
+
data = buf.unpack("c*") if(buf.is_a?(String))
|
85
|
+
eof = data.length
|
86
|
+
values = []
|
87
|
+
stack = []
|
88
|
+
|
89
|
+
emit = lambda { |type, data, ts, p|
|
90
|
+
# Print the type and text of the last read token
|
91
|
+
# p ts,p
|
92
|
+
$stderr.print "EMITTED: #{type}: #{data[ts...p].pack('c*')}\n" if do_debug
|
93
|
+
values << [type,data[ts...p].pack('c*')]
|
94
|
+
}
|
95
|
+
|
96
|
+
error_code = nil
|
97
|
+
|
98
|
+
%% write init;
|
99
|
+
%% write exec;
|
100
|
+
|
101
|
+
raise "ERROR: "+error_code+" in "+buf if error_code
|
102
|
+
|
103
|
+
begin
|
104
|
+
res = {}
|
105
|
+
# p values
|
106
|
+
values.each_slice(2) do | a,b |
|
107
|
+
$stderr.print '*',a,b if do_debug
|
108
|
+
keyword = a[1]
|
109
|
+
value = b[1]
|
110
|
+
value = value.to_i if ['length','Epoch'].index(keyword)
|
111
|
+
res[keyword] = value
|
112
|
+
# p h[:value] if h[:name]==:identifier or h[:name]==:value or h[:name]==:string
|
113
|
+
end
|
114
|
+
rescue
|
115
|
+
print "ERROR: "
|
116
|
+
p values
|
117
|
+
raise
|
118
|
+
end
|
119
|
+
$stderr.print(res,"\n") if do_debug
|
120
|
+
res
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
if __FILE__ == $0
|
127
|
+
|
128
|
+
gatkcommandline = <<LINE1
|
129
|
+
##GATKCommandLine=<ID=CombineVariants,Version=3.2-2-gec30cee,Date="Thu Oct 30 13:41:59 CET 2014",Epoch=1414672919266,CommandLineOptions="analysis_type=CombineVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/hpc/cog_bioinf/GENOMES/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 refactor_NDN_cigar_string=false fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false variant=[(RodBindingCollection [(RodBinding name=variant source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_snps.vcf)]), (RodBindingCollection [(RodBinding name=variant2 source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_indels.vcf)])] out=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub 
no_cmdline_in_header=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false excludeNonVariants=false setKey=set assumeIdenticalSamples=false minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
|
130
|
+
LINE1
|
131
|
+
|
132
|
+
h = {}
|
133
|
+
s = gatkcommandline.strip
|
134
|
+
# print s,"\n"
|
135
|
+
result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
|
136
|
+
# h[result['ID']] = result
|
137
|
+
# p result
|
138
|
+
|
139
|
+
lines = <<LINES
|
140
|
+
##FILTER=<ID=HaplotypeScoreHigh,Description="HaplotypeScore > 13.0">
|
141
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
142
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth",Extra="Yes?">
|
143
|
+
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
|
144
|
+
##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
|
145
|
+
##INFO=<ID=VP,Number=1,Type=String,Description="Variation Property. Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf",Source="dbsnp",Version="138">
|
146
|
+
##INFO=<ID=GENEINFO,Number=1,Type=String,Description="Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)">
|
147
|
+
##INFO=<ID=CLNHGVS,Number=.,Type=String,Description="Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical INFO tags.">
|
148
|
+
##INFO=<ID=CLNHGVS1,Number=.,Type=String,Description="Variant names from \\"HGVS\\". The order of these 'variants' corresponds to the order of the info in the other clinical INFO tags.">
|
149
|
+
##contig=<ID=XXXY12>
|
150
|
+
##contig=<ID=Y,length=59373566>
|
151
|
+
LINES
|
152
|
+
|
153
|
+
h = {}
|
154
|
+
lines.strip.split("\n").each { |s|
|
155
|
+
# print s,"\n"
|
156
|
+
result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
|
157
|
+
h[result['ID']] = result
|
158
|
+
p result
|
159
|
+
}
|
160
|
+
p h
|
161
|
+
|
162
|
+
raise "ERROR" if h != {"HaplotypeScoreHigh"=>{"ID"=>"HaplotypeScoreHigh", "Description"=>"HaplotypeScore > 13.0"}, "GT"=>{"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}, "DP"=>{"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth", "Extra"=>"Yes?"}, "DP4"=>{"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}, "PM"=>{"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}, "VP"=>{"ID"=>"VP", "Number"=>"1", "Type"=>"String", "Description"=>"Variation Property. Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf", "Source"=>"dbsnp", "Version"=>"138"}, "GENEINFO"=>{"ID"=>"GENEINFO", "Number"=>"1", "Type"=>"String", "Description"=>"Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)"}, "CLNHGVS"=>{"ID"=>"CLNHGVS", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical INFO tags."}, "CLNHGVS1"=>{"ID"=>"CLNHGVS1", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from \\\"HGVS\\\". The order of these 'variants' corresponds to the order of the info in the other clinical INFO tags."}, "XXXY12"=>{"ID"=>"XXXY12"}, "Y"=>{"ID"=>"Y", "length"=>59373566}}
|
163
|
+
|
164
|
+
|
165
|
+
end # test
|
data/ragel/generate.sh
ADDED
data/template/vcf2json.erb
CHANGED
@@ -1,8 +1,20 @@
|
|
1
|
+
=HEADER
|
2
|
+
<% require 'json' %>
|
1
3
|
{
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
4
|
+
"HEADER": {
|
5
|
+
"options": <%= options.to_h.to_json %>,
|
6
|
+
"files": <%= ARGV %>,
|
7
|
+
"version": "<%= BIOVCF_VERSION %>"
|
8
|
+
},
|
9
|
+
"BODY": [
|
10
|
+
=BODY
|
11
|
+
{
|
12
|
+
"seq:chr": "<%= rec.chrom %>",
|
13
|
+
"seq:pos": <%= rec.pos %>,
|
14
|
+
"seq:ref": "<%= rec.ref %>",
|
15
|
+
"seq:alt": "<%= rec.alt[0] %>",
|
16
|
+
"dp": <%= rec.info.dp %>
|
17
|
+
},
|
18
|
+
=FOOTER
|
19
|
+
]
|
20
|
+
}
|
@@ -0,0 +1,22 @@
|
|
1
|
+
=HEADER
|
2
|
+
<% require 'json' %>
|
3
|
+
{
|
4
|
+
"HEADER": {
|
5
|
+
"options": <%= options.to_h.to_json %>,
|
6
|
+
"files": <%= ARGV %>,
|
7
|
+
"version": "<%= BIOVCF_VERSION %>"
|
8
|
+
},
|
9
|
+
"COLUMNS": <%= header.column_names.to_json %>,
|
10
|
+
"META": <%= header.meta.to_json %>,
|
11
|
+
"BODY": [
|
12
|
+
=BODY
|
13
|
+
{
|
14
|
+
"seq:chr": "<%= rec.chrom %>" ,
|
15
|
+
"seq:pos": <%= rec.pos %> ,
|
16
|
+
"seq:ref": "<%= rec.ref %>" ,
|
17
|
+
"seq:alt": "<%= rec.alt[0] %>"
|
18
|
+
<% if rec.info.dp %> , "dp": <%= rec.info.dp %> <% end %>
|
19
|
+
},
|
20
|
+
=FOOTER
|
21
|
+
]
|
22
|
+
}
|
@@ -0,0 +1,41 @@
|
|
1
|
+
=HEADER
|
2
|
+
<% require 'json' %>
|
3
|
+
{
|
4
|
+
"HEADER": {
|
5
|
+
"options":<%= options.to_h.to_json %>,
|
6
|
+
"files": <%= ARGV %>,
|
7
|
+
"version": "<%= BIOVCF_VERSION %>"
|
8
|
+
},
|
9
|
+
"COLUMNS": <%= header.column_names.to_json %>,
|
10
|
+
"META": <%= header.meta.to_json %>,
|
11
|
+
"BODY": [
|
12
|
+
=BODY
|
13
|
+
<% sample_num = 0
|
14
|
+
sample_name = nil
|
15
|
+
sample_size = header.samples.size
|
16
|
+
%>
|
17
|
+
{
|
18
|
+
"seq:chr": "<%= rec.chrom %>" ,
|
19
|
+
"seq:pos": <%= rec.pos %> ,
|
20
|
+
"seq:ref": "<%= rec.ref %>" ,
|
21
|
+
"seq:alt": "<%= rec.alt[0] %>"
|
22
|
+
<% if rec.info.dp %> , "dp": <%= rec.info.dp %> <% end %>,
|
23
|
+
"samples" : {
|
24
|
+
<% rec.each_sample do |s| %>
|
25
|
+
<% if not s.empty?
|
26
|
+
sample_name = header.samples[sample_num]
|
27
|
+
%>
|
28
|
+
<%= (sample_num!=0 ? "," : "" ) %>
|
29
|
+
<% sample_num += 1%>
|
30
|
+
"<%= sample_name %>": {
|
31
|
+
<% header.meta['FORMAT'].each_key do |k| %>
|
32
|
+
"<%= k %>": <%= s[k].to_json %><%= (k==header.meta['FORMAT'].keys.last ? "" : "," ) %>
|
33
|
+
<% end %>
|
34
|
+
}
|
35
|
+
<% end %>
|
36
|
+
<% end %>
|
37
|
+
}
|
38
|
+
},
|
39
|
+
=FOOTER
|
40
|
+
]
|
41
|
+
}
|