bio-vcf 0.8.0 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +4 -5
  4. data/Gemfile.lock +28 -65
  5. data/LICENSE.txt +1 -1
  6. data/README.md +387 -107
  7. data/RELEASE_NOTES.md +20 -0
  8. data/RELEASE_NOTES.md~ +11 -0
  9. data/Rakefile +3 -40
  10. data/TAGS +115 -0
  11. data/VERSION +1 -1
  12. data/bin/bio-vcf +176 -109
  13. data/bio-vcf.gemspec +14 -70
  14. data/features/cli.feature +22 -4
  15. data/features/diff_count.feature +0 -1
  16. data/features/filter.feature +12 -0
  17. data/features/multisample.feature +25 -0
  18. data/features/somaticsniper.feature +2 -0
  19. data/features/step_definitions/cli-feature.rb +15 -6
  20. data/features/step_definitions/diff_count.rb +1 -1
  21. data/features/step_definitions/multisample.rb +19 -0
  22. data/features/step_definitions/somaticsniper.rb +9 -1
  23. data/features/step_definitions/vcf_header.rb +48 -0
  24. data/features/support/env.rb +0 -9
  25. data/features/vcf_header.feature +35 -0
  26. data/lib/bio-vcf.rb +2 -0
  27. data/lib/bio-vcf/bedfilter.rb +43 -0
  28. data/lib/bio-vcf/pcows.rb +303 -0
  29. data/lib/bio-vcf/template.rb +75 -0
  30. data/lib/bio-vcf/vcffile.rb +46 -0
  31. data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
  32. data/lib/bio-vcf/vcfheader.rb +146 -6
  33. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  34. data/lib/bio-vcf/vcfrecord.rb +56 -18
  35. data/lib/bio-vcf/vcfsample.rb +27 -3
  36. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  37. data/ragel/generate.sh +8 -0
  38. data/template/vcf2json.erb +19 -7
  39. data/template/vcf2json_full_header.erb +22 -0
  40. data/template/vcf2json_use_meta.erb +41 -0
  41. data/template/vcf2rdf_header.erb +24 -0
  42. data/test/data/input/empty.vcf +2 -0
  43. data/test/data/input/gatk_exome.vcf +237 -0
  44. data/test/data/input/gatk_wgs.vcf +1000 -0
  45. data/test/data/input/test.bed +632 -0
  46. data/test/data/regression/empty-stderr.new +12 -0
  47. data/test/data/regression/empty.new +2 -0
  48. data/test/data/regression/empty.ref +2 -0
  49. data/test/data/regression/eval_once-stderr.new +2 -0
  50. data/test/data/regression/eval_once.new +1 -0
  51. data/test/data/regression/eval_once.ref +1 -0
  52. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  53. data/test/data/regression/eval_r.info.dp.new +150 -0
  54. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  55. data/test/data/regression/ifilter_s.dp.new +31 -0
  56. data/test/data/regression/pass1-stderr.new +10 -0
  57. data/test/data/regression/pass1.new +88 -0
  58. data/test/data/regression/pass1.ref +88 -0
  59. data/test/data/regression/r.info.dp-stderr.new +4 -0
  60. data/test/data/regression/r.info.dp.new +114 -0
  61. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  62. data/test/data/regression/rewrite.info.sample.new +150 -0
  63. data/test/data/regression/s.dp-stderr.new +18 -0
  64. data/test/data/regression/s.dp.new +145 -0
  65. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  66. data/test/data/regression/seval_s.dp.new +36 -0
  67. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  68. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  69. data/test/data/regression/thread4-stderr.new +10 -0
  70. data/test/data/regression/thread4.new +150 -0
  71. data/test/data/regression/thread4_4-stderr.new +25 -0
  72. data/test/data/regression/thread4_4.new +130 -0
  73. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  74. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
  75. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  76. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  77. data/test/data/regression/vcf2json_full_header.new +225 -0
  78. data/test/data/regression/vcf2json_full_header.ref +225 -0
  79. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  80. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  81. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  82. data/test/performance/metrics.md +18 -1
  83. data/test/stress/stress_test.sh +15 -0
  84. data/test/tmp/test.vcf +12469 -0
  85. metadata +65 -64
@@ -13,38 +13,49 @@ module BioVcf
13
13
  end
14
14
  end
15
15
 
16
- # Set INFO fields (used by --rewrite)
17
- def []= k, v
18
- split_fields if not @h
19
- kupper = k.upcase
20
- @h[kupper] = v
21
- @original_key[kupper] = k
22
- end
23
-
24
- def method_missing(m, *args, &block)
16
+ def [] k
25
17
  # split_fields if not @h
26
18
  # /#{m}=(?<value>[^;])/.@info
19
+ kupper = k.upcase
27
20
  v = if @h
28
- @h[m.to_s.upcase]
21
+ @h[kupper]
29
22
  else
30
- @info =~ /#{m.to_s}=([^;]+)/i
23
+ @info =~ /[\A;]#{k}=([^;]+)/i
31
24
  value = $1
32
25
  # p [m,value]
33
26
  # m = @info.match(/#{m.to_s.upcase}=(?<value>[^;]+)/) slower!
34
27
  # value = m[:value]
35
28
  if value == nil
36
29
  split_fields # no option but to split
37
- @h[m.to_s.upcase]
30
+ @h[kupper]
38
31
  else
39
32
  value
40
33
  end
41
34
  end
42
35
  ConvertStringToValue::convert(v)
36
+ end
37
+
38
+ # Set INFO fields (used by --rewrite)
39
+ def []= k, v
40
+ split_fields if not @h
41
+ kupper = k.upcase
42
+ @h[kupper] = v
43
+ @original_key[kupper] = k
44
+ end
45
+
46
+ def fields
47
+ split_fields
48
+ @h.keys
49
+ end
50
+
51
+ def method_missing(m, *args, &block)
52
+ self[m.to_s]
43
53
  end
44
54
 
45
55
  private
46
56
 
47
57
  def split_fields
58
+ return @h if @h
48
59
  @h = {}
49
60
  @original_key = {}
50
61
  @info.split(/;/).each do |f|
@@ -151,6 +162,10 @@ module BioVcf
151
162
  @qual ||= @fields[5].to_f
152
163
  end
153
164
 
165
+ def filter
166
+ @filter ||= @fields[6]
167
+ end
168
+
154
169
  def info
155
170
  @info ||= VcfRecordParser.get_info(@fields[7])
156
171
  end
@@ -184,15 +199,21 @@ module BioVcf
184
199
  end
185
200
 
186
201
  def sample_by_index i
187
- # p @fields
188
202
  raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
189
203
  @sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,ref,alt)
190
204
  end
191
205
 
192
206
  # Walk the samples. list contains an Array of int (the index)
193
207
  def each_sample(list = nil)
194
- list = @header.samples_index_array() if not list
195
- list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i.to_i)) }
208
+ @header.sample_subset_index(list).each { |i|
209
+ yield VcfSample::Sample.new(self,sample_by_index(i))
210
+ }
211
+ end
212
+
213
+ def samples
214
+ list = []
215
+ each_sample { |s| list << s }
216
+ list
196
217
  end
197
218
 
198
219
  def missing_samples?
@@ -229,6 +250,7 @@ module BioVcf
229
250
  $stderr.print "RECORD ERROR!\n"
230
251
  $stderr.print [@fields],"\n"
231
252
  $stderr.print expr,"\n"
253
+ $stderr.print "To ignore this error use the -i switch!\n"
232
254
  end
233
255
  if ignore_missing_data
234
256
  $stderr.print e.message if not quiet
@@ -239,19 +261,19 @@ module BioVcf
239
261
  end
240
262
  end
241
263
 
242
- def filter expr, ignore_missing_data: true, quiet: false
264
+ def gfilter expr, ignore_missing_data: true, quiet: false
243
265
  begin
244
266
  if not respond_to?(:call_cached_filter)
245
267
  code =
246
268
  """
247
- def call_cached_filter(rec,fields)
269
+ def call_cached_gfilter(rec,fields)
248
270
  r = rec
249
271
  #{expr}
250
272
  end
251
273
  """
252
274
  self.class.class_eval(code)
253
275
  end
254
- res = call_cached_filter(self,@fields)
276
+ res = call_cached_gfilter(self,@fields)
255
277
  if res.kind_of?(Array)
256
278
  res.join("\t")
257
279
  else
@@ -262,6 +284,7 @@ module BioVcf
262
284
  $stderr.print "RECORD ERROR!\n"
263
285
  $stderr.print [@fields],"\n"
264
286
  $stderr.print expr,"\n"
287
+ $stderr.print "To ignore this error use the -i switch!\n"
265
288
  end
266
289
  if ignore_missing_data
267
290
  $stderr.print e.message if not quiet
@@ -272,6 +295,21 @@ module BioVcf
272
295
  end
273
296
  end
274
297
 
298
+ def add_to_filter_field str
299
+ filter = @fields[6]
300
+ if not filter or filter == '.' or filter == 'PASS'
301
+ filter = str
302
+ else
303
+ values = filter.split(/;/)
304
+ if not values.include?(str)
305
+ filter = filter +';'+str
306
+ end
307
+ end
308
+ filter = '.' if filter == nil or filter == ''
309
+ @fields[6] = filter
310
+ filter
311
+ end
312
+
275
313
  # Return the sample
276
314
  def method_missing(m, *args, &block)
277
315
  name = m.to_s
@@ -3,7 +3,7 @@ module BioVcf
3
3
 
4
4
  # Check whether a sample is empty (on the raw string value)
5
5
  def VcfSample::empty? s
6
- s==nil or s == './.' or s == '' or s[0..2]=='./.'
6
+ s==nil or s == './.' or s == '' or s[0..2]=='./.' or s[0..1] == '.:'
7
7
  end
8
8
 
9
9
  class Sample
@@ -40,9 +40,24 @@ module BioVcf
40
40
  # Split GT into index values
41
41
  def gti
42
42
  v = fetch_values("GT")
43
- v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
43
+ v = './.' if v == '.' #In case that you have a single missing value, make both as missing.
44
+ v.split(/[\/\|]/).map{ |v| (v=='.' ? nil : v.to_i) }
44
45
  end
45
46
 
47
+ def gtindex
48
+ v = fetch_values("GT")
49
+ return case v
50
+ when nil then nil
51
+ when '.' then nil
52
+ when './.' then nil
53
+ when '0/0' then 0
54
+ when '0/1' then 1
55
+ when '1/1' then 2
56
+ else
57
+ raise "Unknown genotype #{v}"
58
+ end
59
+ end
60
+
46
61
  # Split GT into a nucleotide sequence
47
62
  def gts
48
63
  gti.map { |i| (i ? @rec.get_gt(i) : nil) }
@@ -51,7 +66,16 @@ module BioVcf
51
66
  def cache_method(name, &block)
52
67
  self.class.send(:define_method, name, &block)
53
68
  end
54
-
69
+
70
+ def [] name
71
+ if @format[name]
72
+ v = fetch_values(name)
73
+ return nil if VcfValue::empty?(v)
74
+ return ConvertStringToValue::convert(v)
75
+ end
76
+ nil
77
+ end
78
+
55
79
  def method_missing(m, *args, &block)
56
80
  name = m.to_s.upcase
57
81
  # p [:here,name,m ,@values]
@@ -0,0 +1,165 @@
1
+ # Ragel lexer for VCF-header
2
+ #
3
+ # This is a compact parser/lexer for the VCF header format. Bio-vcf
4
+ # uses the parser to generate meta information that can be output to
5
+ # (for example) JSON format. The advantage of using ragel as a state
6
+ # engine is that it allows for easy parsing of key-value pairs with
7
+ # syntax checking and, for example, escaped quotes in quoted string
8
+ # values. This ragel parser/lexer generates valid Ruby; it should be
9
+ # fairly trivial to generate python/C/JAVA instead. Note that this
10
+ # edition validates ID and Number fields only. Other fields are
11
+ # dumped 'AS IS'.
12
+ #
13
+ # Note the .rb version is generated from ./ragel/gen_vcfheaderline_parser.rl
14
+ #
15
+ # by Pjotr Prins (c) 2014/2015
16
+
17
+ module BioVcf
18
+
19
+ module VcfHeaderParser
20
+
21
+ module RagelKeyValues
22
+
23
+ def self.debug msg
24
+ # nothing
25
+ # $stderr.print "DEBUG: ",msg,"\n"
26
+ end
27
+
28
+ =begin
29
+ %%{
30
+
31
+ machine simple_lexer;
32
+
33
+ action mark { ts=p }
34
+ action endquoted {
35
+ emit.call(:value,data,ts,p)
36
+ }
37
+
38
+ action kw {
39
+ emit.call(:kw,data,ts,p)
40
+ }
41
+
42
+ squote = "'";
43
+ dquote = '"';
44
+ not_squote_or_escape = [^'\\];
45
+ not_dquote_or_escape = [^"\\];
46
+ escaped_something = /\\./;
47
+ ss = squote ( not_squote_or_escape | escaped_something )* >mark %endquoted squote;
48
+ dd = dquote ( not_dquote_or_escape | escaped_something )* >mark %endquoted dquote;
49
+
50
+ integer = ('+'|'-')?digit+;
51
+ float = ('+'|'-')?digit+'.'digit+;
52
+ assignment = '=';
53
+ identifier = ( alnum (alnum|'.'|'_')* );
54
+ version = ( digit (alnum|'.'|'_'|'-')* );
55
+ str = (ss|dd)* ;
56
+ boolean = '.';
57
+ date = str;
58
+ key_word = ( ('Type'|'Description'|'Source'|identifier - ('ID'|'Number'|'length'|'Version'|'assembly'|'Date'|'CommandLineOptions')) >mark %{ emit.call(:key_word,data,ts,p) } );
59
+ any_value = ( str|( integer|float|boolean|identifier >mark %{ emit.call(:value,data,ts,p) } ));
60
+ id_value = ( identifier >mark %{ emit.call(:value,data,ts,p) } );
61
+
62
+ version_value = ( str| ( version >mark %{ emit.call(:value,data,ts,p) } ));
63
+ date_value = ( date );
64
+ gatk_value = ( str );
65
+ number_value = ( ( integer|boolean|'A'|'R'|'G' ) >mark %{ emit.call(:value,data,ts,p) } );
66
+
67
+ id_kv = ( ( ('ID'|'assembly') %kw '=' id_value ) %{ debug("ID FOUND") } @!{ error_code="Malformed ID"} );
68
+ version_kv = ( ( ('Version') %kw '=' version_value ) @!{ error_code="Version"} );
69
+ number_kv = ( ( ('Number'|'length') %kw '=' number_value ) @!{ error_code="Number"} );
70
+ date_kv = ( ( ('Date') %kw '=' date_value ) %{ debug("DATE FOUND") } @!{ error_code="Date"} );
71
+ gatk_kv = ( ( ('CommandLineOptions') %kw '=' gatk_value ) @!{ error_code="GATK"} );
72
+ key_value = ( id_kv | version_kv | date_kv | number_kv | gatk_kv | (key_word '=' any_value) ) %{ debug("KEY_VALUE found") } >mark @!{ error_code="unknown key-value " };
73
+
74
+ main := ( '##' ('FILTER'|'FORMAT'|'contig'|'INFO'|'ALT'|'GATKCommandLine') '=') (('<'|',') key_value )* '>';
75
+ }%%
76
+ =end
77
+
78
+ %% write data;
79
+ # %% this just fixes syntax highlighting...
80
+
81
+ def self.run_lexer(buf, options = {})
82
+ do_debug = (options[:debug] == true)
83
+ $stderr.print "---> ",buf,"\n" if do_debug
84
+ data = buf.unpack("c*") if(buf.is_a?(String))
85
+ eof = data.length
86
+ values = []
87
+ stack = []
88
+
89
+ emit = lambda { |type, data, ts, p|
90
+ # Print the type and text of the last read token
91
+ # p ts,p
92
+ $stderr.print "EMITTED: #{type}: #{data[ts...p].pack('c*')}\n" if do_debug
93
+ values << [type,data[ts...p].pack('c*')]
94
+ }
95
+
96
+ error_code = nil
97
+
98
+ %% write init;
99
+ %% write exec;
100
+
101
+ raise "ERROR: "+error_code+" in "+buf if error_code
102
+
103
+ begin
104
+ res = {}
105
+ # p values
106
+ values.each_slice(2) do | a,b |
107
+ $stderr.print '*',a,b if do_debug
108
+ keyword = a[1]
109
+ value = b[1]
110
+ value = value.to_i if ['length','Epoch'].index(keyword)
111
+ res[keyword] = value
112
+ # p h[:value] if h[:name]==:identifier or h[:name]==:value or h[:name]==:string
113
+ end
114
+ rescue
115
+ print "ERROR: "
116
+ p values
117
+ raise
118
+ end
119
+ $stderr.print(res,"\n") if do_debug
120
+ res
121
+ end
122
+ end
123
+ end
124
+ end
125
+
126
+ if __FILE__ == $0
127
+
128
+ gatkcommandline = <<LINE1
129
+ ##GATKCommandLine=<ID=CombineVariants,Version=3.2-2-gec30cee,Date="Thu Oct 30 13:41:59 CET 2014",Epoch=1414672919266,CommandLineOptions="analysis_type=CombineVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/hpc/cog_bioinf/GENOMES/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 refactor_NDN_cigar_string=false fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false variant=[(RodBindingCollection [(RodBinding name=variant source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_snps.vcf)]), (RodBindingCollection [(RodBinding name=variant2 source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_indels.vcf)])] out=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub 
no_cmdline_in_header=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false excludeNonVariants=false setKey=set assumeIdenticalSamples=false minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
130
+ LINE1
131
+
132
+ h = {}
133
+ s = gatkcommandline.strip
134
+ # print s,"\n"
135
+ result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
136
+ # h[result['ID']] = result
137
+ # p result
138
+
139
+ lines = <<LINES
140
+ ##FILTER=<ID=HaplotypeScoreHigh,Description="HaplotypeScore > 13.0">
141
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
142
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth",Extra="Yes?">
143
+ ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
144
+ ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
145
+ ##INFO=<ID=VP,Number=1,Type=String,Description="Variation Property. Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf",Source="dbsnp",Version="138">
146
+ ##INFO=<ID=GENEINFO,Number=1,Type=String,Description="Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)">
147
+ ##INFO=<ID=CLNHGVS,Number=.,Type=String,Description="Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical INFO tags.">
148
+ ##INFO=<ID=CLNHGVS1,Number=.,Type=String,Description="Variant names from \\"HGVS\\". The order of these 'variants' corresponds to the order of the info in the other clinical INFO tags.">
149
+ ##contig=<ID=XXXY12>
150
+ ##contig=<ID=Y,length=59373566>
151
+ LINES
152
+
153
+ h = {}
154
+ lines.strip.split("\n").each { |s|
155
+ # print s,"\n"
156
+ result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
157
+ h[result['ID']] = result
158
+ p result
159
+ }
160
+ p h
161
+
162
+ raise "ERROR" if h != {"HaplotypeScoreHigh"=>{"ID"=>"HaplotypeScoreHigh", "Description"=>"HaplotypeScore > 13.0"}, "GT"=>{"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}, "DP"=>{"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth", "Extra"=>"Yes?"}, "DP4"=>{"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}, "PM"=>{"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}, "VP"=>{"ID"=>"VP", "Number"=>"1", "Type"=>"String", "Description"=>"Variation Property. Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf", "Source"=>"dbsnp", "Version"=>"138"}, "GENEINFO"=>{"ID"=>"GENEINFO", "Number"=>"1", "Type"=>"String", "Description"=>"Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)"}, "CLNHGVS"=>{"ID"=>"CLNHGVS", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical INFO tags."}, "CLNHGVS1"=>{"ID"=>"CLNHGVS1", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from \\\"HGVS\\\". The order of these 'variants' corresponds to the order of the info in the other clinical INFO tags."}, "XXXY12"=>{"ID"=>"XXXY12"}, "Y"=>{"ID"=>"Y", "length"=>59373566}}
163
+
164
+
165
+ end # test
@@ -0,0 +1,8 @@
1
+ #! /bin/bash
2
+
3
+ ragel -R gen_vcfheaderline_parser.rl
4
+ [ $? -ne 0 ] && exit 1
5
+
6
+ ruby gen_vcfheaderline_parser.rb
7
+
8
+ cp gen_vcfheaderline_parser.rb ../lib/bio-vcf/vcfheader_line.rb
@@ -1,8 +1,20 @@
1
+ =HEADER
2
+ <% require 'json' %>
1
3
  {
2
- "seq:chr": "<%= rec.chrom %>" ,
3
- "seq:pos": <%= rec.pos %> ,
4
- "seq:ref": "<%= rec.ref %>" ,
5
- "seq:alt": "<%= rec.alt[0] %>" ,
6
- "seq:maf": <%= rec.info.maf[0] %> ,
7
- "dp": <%= rec.info.dp %> ,
8
- };
4
+ "HEADER": {
5
+ "options": <%= options.to_h.to_json %>,
6
+ "files": <%= ARGV %>,
7
+ "version": "<%= BIOVCF_VERSION %>"
8
+ },
9
+ "BODY": [
10
+ =BODY
11
+ {
12
+ "seq:chr": "<%= rec.chrom %>",
13
+ "seq:pos": <%= rec.pos %>,
14
+ "seq:ref": "<%= rec.ref %>",
15
+ "seq:alt": "<%= rec.alt[0] %>",
16
+ "dp": <%= rec.info.dp %>
17
+ },
18
+ =FOOTER
19
+ ]
20
+ }
@@ -0,0 +1,22 @@
1
+ =HEADER
2
+ <% require 'json' %>
3
+ {
4
+ "HEADER": {
5
+ "options": <%= options.to_h.to_json %>,
6
+ "files": <%= ARGV %>,
7
+ "version": "<%= BIOVCF_VERSION %>"
8
+ },
9
+ "COLUMNS": <%= header.column_names.to_json %>,
10
+ "META": <%= header.meta.to_json %>,
11
+ "BODY": [
12
+ =BODY
13
+ {
14
+ "seq:chr": "<%= rec.chrom %>" ,
15
+ "seq:pos": <%= rec.pos %> ,
16
+ "seq:ref": "<%= rec.ref %>" ,
17
+ "seq:alt": "<%= rec.alt[0] %>"
18
+ <% if rec.info.dp %> , "dp": <%= rec.info.dp %> <% end %>
19
+ },
20
+ =FOOTER
21
+ ]
22
+ }
@@ -0,0 +1,41 @@
1
+ =HEADER
2
+ <% require 'json' %>
3
+ {
4
+ "HEADER": {
5
+ "options":<%= options.to_h.to_json %>,
6
+ "files": <%= ARGV %>,
7
+ "version": "<%= BIOVCF_VERSION %>"
8
+ },
9
+ "COLUMNS": <%= header.column_names.to_json %>,
10
+ "META": <%= header.meta.to_json %>,
11
+ "BODY": [
12
+ =BODY
13
+ <% sample_num = 0
14
+ sample_name = nil
15
+ sample_size = header.samples.size
16
+ %>
17
+ {
18
+ "seq:chr": "<%= rec.chrom %>" ,
19
+ "seq:pos": <%= rec.pos %> ,
20
+ "seq:ref": "<%= rec.ref %>" ,
21
+ "seq:alt": "<%= rec.alt[0] %>"
22
+ <% if rec.info.dp %> , "dp": <%= rec.info.dp %> <% end %>,
23
+ "samples" : {
24
+ <% rec.each_sample do |s| %>
25
+ <% if not s.empty?
26
+ sample_name = header.samples[sample_num]
27
+ %>
28
+ <%= (sample_num!=0 ? "," : "" ) %>
29
+ <% sample_num += 1%>
30
+ "<%= sample_name %>": {
31
+ <% header.meta['FORMAT'].each_key do |k| %>
32
+ "<%= k %>": <%= s[k].to_json %><%= (k==header.meta['FORMAT'].keys.last ? "" : "," ) %>
33
+ <% end %>
34
+ }
35
+ <% end %>
36
+ <% end %>
37
+ }
38
+ },
39
+ =FOOTER
40
+ ]
41
+ }