bio-vcf 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,4 +14,9 @@ require 'bio-vcf'
14
14
 
15
15
  require 'rspec/expectations'
16
16
 
17
+ # Add the regression module if in the path (it can also be a gem)
18
+ rootdir = File.dirname(__FILE__) + '/../..'
19
+ $LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
20
+ require 'regressiontest'
21
+
17
22
  include BioVcf
data/lib/bio-vcf.rb CHANGED
@@ -8,7 +8,9 @@
8
8
  #
9
9
  # In this file only require other files. Avoid other source code.
10
10
 
11
+ require 'bio-vcf/utils'
11
12
  require 'bio-vcf/vcf'
13
+ require 'bio-vcf/vcfsample'
12
14
  require 'bio-vcf/vcfheader'
13
15
  require 'bio-vcf/vcfline'
14
16
  require 'bio-vcf/vcfgenotypefield'
@@ -0,0 +1,23 @@
1
module BioVcf

  # Helpers that turn the raw text of a VCF field into a Ruby number when
  # (and only when) the text is a plain decimal number.
  module ConvertStringToValue
    # True when +str+ can be read as an integer.
    #
    # Strings are parsed strictly in base 10, so prefixed forms such as
    # "0x1A" or "0b101" are NOT integers here (String#to_i would turn them
    # into 0). Non-string values are handed to Kernel#Integer unchanged.
    def self::integer?(str)
      str.is_a?(String) ? Integer(str, 10) : Integer(str)
      true
    rescue StandardError
      false
    end

    # True when +str+ can be read as a float, e.g. "150.268" or
    # "9.68463e-05". Hexadecimal notation is rejected: Kernel#Float accepts
    # it, but String#to_f does not.
    def self::float?(str)
      return false if str.is_a?(String) && str =~ /x/i
      Float(str)
      true
    rescue StandardError
      false
    end

    # Convert +v+ to an Integer or Float when it is a numeric string;
    # otherwise return +v+ untouched (this includes nil and values that are
    # already numbers).
    def self::convert v
      return v unless v.is_a?(String)
      if integer?(v) # the common case
        # Use the same parser as the predicate so both always agree
        Integer(v, 10)
      elsif float?(v)
        # 150.268 or 9.68463e-05
        Float(v)
      else
        v
      end
    end
  end

end
@@ -3,23 +3,28 @@ module BioVcf
3
3
  MAXINT=100_000
4
4
 
5
5
  # Helper class for a list of (variant) values, such as A,G.
6
- # The [] function does the hard work (see ./features for examples)
7
- class VcfNucleotides
6
+ # The [] function does the hard work. You can pass in an index (integer)
7
+ # or nucleotide which translates to an index.
8
+ # (see ./features for examples)
9
+ class VcfNucleotideCount4
8
10
  def initialize alt,list
9
11
  @alt = alt
10
- @list = list.map{|i| i.to_i}
12
+ @list = list.split(/,/).map{|i| i.to_i}
11
13
  end
12
14
 
13
15
  def [] idx
14
16
  if idx.kind_of?(Integer)
15
- @list[idx].to_i
17
+ # return a value
18
+ @list[idx]
16
19
  elsif idx.kind_of?(String)
17
- @list[["A","C","G","T"].index(idx)].to_i
20
+ # return a value
21
+ @list[["A","C","G","T"].index(idx)]
18
22
  else idx.kind_of?(Array)
23
+ # return a list of values
19
24
  idx.map { |nuc|
20
25
  idx2 = ["A","C","G","T"].index(nuc)
21
26
  # p [idx,nuc,idx2,@list]
22
- @list[idx2].to_i
27
+ @list[idx2]
23
28
  }
24
29
  end
25
30
  end
@@ -47,10 +52,11 @@ module BioVcf
47
52
 
48
53
  end
49
54
 
50
- class VcfAltInfo
55
+ # Handle info fields with multiple entries, possibly relating to ALT (single nucleotide only)
56
+ class VcfAltInfoList
51
57
  def initialize alt,list
52
58
  @alt = alt
53
- @list = list.map{|i| i.to_i}
59
+ @list = list.split(/,/).map{|i| i.to_i}
54
60
  end
55
61
 
56
62
  def [] idx
@@ -86,43 +92,78 @@ module BioVcf
86
92
  end
87
93
 
88
94
  class VcfGenotypeField
95
+
96
+ attr_reader :format, :values, :header
97
+
89
98
  def initialize s, format, header, alt
90
- @values = s.split(/:/)
99
+ @is_empty = (s == '' or s == nil or s == './.')
100
+ @original_s = s
91
101
  @format = format
92
102
  @header = header
93
103
  @alt = alt
94
104
  end
95
105
 
96
- def dp4
97
- @values[@format['DP4']].split(',').map{|i| i.to_i}
106
+ def values
107
+ @cache_values ||= @original_s.split(/:/)
98
108
  end
99
109
 
100
- def ad
101
- @values[@format['AD']].split(',').map{|i| i.to_i}
110
+ def empty?
111
+ @is_empty
102
112
  end
103
113
 
104
- def pl
105
- @values[@format['PL']].split(',').map{|i| i.to_i}
114
+ def valid?
115
+ !@is_empty
116
+ end
117
+
118
+ def dp4
119
+ ilist('DP4')
120
+ end
121
+ def ad
122
+ ilist('AD')
123
+ end
124
+ def pl
125
+ ilist('PL')
106
126
  end
107
127
 
108
128
  def bcount
109
- VcfNucleotides.new(@alt,@values[@format['BCOUNT']].split(','))
129
+ VcfNucleotideCount4.new(@alt,values[fetch('BCOUNT')])
110
130
  end
111
131
 
112
132
  def bq
113
- VcfAltInfo.new(@alt,@values[@format['BQ']].split(','))
133
+ VcfAltInfoList.new(@alt,values[fetch('BQ')])
114
134
  end
115
135
 
116
136
  def amq
117
- VcfAltInfo.new(@alt,@values[@format['AMQ']].split(','))
137
+ VcfAltInfoList.new(@alt,values[fetch('AMQ')])
138
+ end
139
+
140
+ def method_missing(m, *args, &block)
141
+ return nil if @is_empty
142
+ if m =~ /\?$/
143
+ # query if a value exists, e.g., r.info.dp?
144
+ v = values[fetch(m.to_s.upcase.chop)]
145
+ v != nil
146
+ else
147
+ v = values[fetch(m.to_s.upcase)]
148
+ v = v.to_i if v =~ /^\d+$/
149
+ v = v.to_f if v =~ /^\d+\.\d+$/
150
+ v
151
+ end
152
+ end
153
+
154
+ private
155
+
156
+ def fetch name
157
+ raise "ERROR: Field with name #{name} does not exist!" if !@format[name]
158
+ @format[name]
159
+ end
160
+
161
+ def ilist name
162
+ v = values[fetch(name)]
163
+ return nil if not v
164
+ v.split(',').map{|i| i.to_i}
118
165
  end
119
166
 
120
- def method_missing(m, *args, &block)
121
- v = @values[@format[m.to_s.upcase]]
122
- v = v.to_i if v =~ /^\d+$/
123
- v = v.to_f if v =~ /^\d+\.\d+$/
124
- v
125
- end
126
167
 
127
168
  end
128
169
 
@@ -134,17 +175,21 @@ module BioVcf
134
175
  @header = header
135
176
  @alt = alt
136
177
  @samples = {} # lazy cache
137
- @index = {}
138
- @header.samples.each_with_index { |k,i| @index[k] = i+9 ; @index[k.downcase] = i+9 }
178
+ @sample_index = @header.sample_index()
139
179
  end
140
180
 
141
181
  def [] name
142
- @samples[name] ||= VcfGenotypeField.new(@fields[@index[name]],@format,@header,@alt)
182
+ @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
143
183
  end
144
184
 
145
185
  def method_missing(m, *args, &block)
146
186
  name = m.to_s
147
- @samples[name] ||= VcfGenotypeField.new(@fields[@index[name]],@format,@header,@alt)
187
+ if name =~ /\?$/
188
+ # test for valid sample
189
+ return !VcfSample::empty?(@fields[@sample_index[name.chop]])
190
+ else
191
+ @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
192
+ end
148
193
  end
149
194
 
150
195
  end
@@ -42,6 +42,14 @@ module BioVcf
42
42
  def samples
43
43
  @samples ||= column_names[9..-1]
44
44
  end
45
+
46
# Map every sample name, and its lower-case form, to its column position
# in a record (the first sample sits at position 9). Built once and cached.
def sample_index
  @sample_index ||= begin
    lookup = {}
    samples.each_with_index do |sample_name, i|
      column = i + 9
      lookup[sample_name] = column
      lookup[sample_name.downcase] = column
    end
    lookup
  end
end
45
53
  end
46
54
 
47
55
  end
@@ -4,6 +4,7 @@ module BioVcf
4
4
  # Split a line into fields and check size
5
5
  def VcfLine.parse line,expected_size=nil
6
6
  fields = line.strip.split(/\t/)
7
+ raise "Unexpected line #{line}" if line.strip.size == 0 or fields.size < 6
7
8
  raise "Expected #{expected_size} fields but got #{fields.size} in "+fields.to_s if expected_size and fields.size != expected_size
8
9
  fields
9
10
  end
@@ -1,26 +1,72 @@
1
1
  module BioVcf
2
2
 
3
- class VcfRecordInfo
3
# The INFO column of a record ("KEY=value;KEY2=value2;FLAG").
#
# The raw string is kept as is for as long as possible: a lookup first
# tries a direct pattern match on it and only splits the whole string
# into a Hash when that fails or when a field is assigned.
class VcfRecordInfo
  def initialize s
    @info = s
  end

  # The INFO string. After the fields have been split (or modified) it is
  # rebuilt from the stored keys and values, keys in their original case.
  def to_s
    return @info unless @h
    @h.map { |k,v| (v ? "#{@original_key[k]}=#{v}" : @original_key[k]) }.join(';')
  end

  # Set INFO fields (used by --rewrite)
  def []= k, v
    split_fields if not @h
    kupper = k.upcase
    @h[kupper] = v
    @original_key[kupper] = k
  end

  # Look up an INFO field by method name (case-insensitive), e.g.
  # info.dp. Numeric text is converted to a number; a missing field or a
  # flag without a value gives nil.
  def method_missing(m, *args, &block)
    key = m.to_s.upcase
    v = if @h
          @h[key]
        else
          # Anchor on the start of the string or a ';' so that DP does not
          # match inside ADP=...; escape the key because it is caller input.
          value = @info[/(?:\A|;)#{Regexp.escape(key)}=([^;]+)/, 1]
          if value.nil?
            split_fields # no option but to split
            @h[key]
          else
            value
          end
        end
    ConvertStringToValue::convert(v)
  end

  private

  # Split the raw string into @h (upper-case key => value or nil) and
  # @original_key (upper-case key => key as written).
  def split_fields
    @h = {}
    @original_key = {}
    @info.split(/;/).each do |f|
      next if f.empty? # tolerate ';;'
      # Split on the first '=' only: values may themselves contain '='
      k, sep, v = f.partition('=')
      kupper = k.upcase
      @h[kupper] = (sep.empty? || v.empty?) ? nil : v
      @original_key[kupper] = k
    end
  end
end
17
57
 
18
58
  module VcfRecordParser
19
59
  # Parse the format field into a Hash
20
60
  def VcfRecordParser.get_format s
21
- h = {}
22
- s.split(/:/).each_with_index { |v,i| h[v] = i }
23
- h
61
+ if s==$cached_sample_format_s
62
+ $cached_sample_format
63
+ else
64
+ h = {}
65
+ s.split(/:/).each_with_index { |v,i| h[v] = i }
66
+ $cached_sample_format = h
67
+ $cached_sample_format_s = s
68
+ h
69
+ end
24
70
  end
25
71
  def VcfRecordParser.get_info s
26
72
  VcfRecordInfo.new(s)
@@ -36,6 +82,15 @@ module BioVcf
36
82
  ['A','C','G','T'][index()]
37
83
  end
38
84
 
85
+ # Get the GT when 0 is REF and >0 is ALT
86
# Get the GT when 0 is REF and >0 is ALT
def get_gt(index)
  return ref() if index == 0
  alt[index-1]
end
93
+
39
94
  def call_tumor_count
40
95
  tumor.bcount.to_ary[index()]
41
96
  end
@@ -63,11 +118,13 @@ module BioVcf
63
118
  @fields = fields
64
119
  @header = header
65
120
  end
66
-
121
+
67
122
  def chrom
68
123
  @fields[0]
69
124
  end
70
125
 
126
+ alias :chr :chrom
127
+
71
128
  def pos
72
129
  @pos ||= @fields[1].to_i
73
130
  end
@@ -114,5 +171,76 @@ module BioVcf
114
171
  def sample
115
172
  @sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
116
173
  end
174
+
175
# Look up one sample through the record's sample accessor.
def sample_by_name name
  all_samples = sample
  all_samples[name]
end
178
+
179
# Yield a VcfSample::Sample for each sample column named in the header.
# When +list+ is given, only samples whose zero-based position (as a
# string, e.g. "0") is found in it are yielded.
def each_sample(list = nil)
  names = @header.column_names[9..-1]
  raise "Empty sample list, can not execute query!" unless names
  names.each_with_index do |name, i|
    skip = list && !list.index(i.to_s)
    yield VcfSample::Sample.new(self,sample[name]) unless skip
  end
end
188
+
189
# True when at least one sample column (field 9 onwards) is empty
# according to VcfSample::empty?.
def missing_samples?
  @fields[9..-1].any? { |raw_sample| VcfSample::empty?(raw_sample) }
end
195
+
196
# A record is valid when it has as many fields as the header has columns.
def valid?
  expected = @header.column_names.size
  @fields.size == expected
end
199
+
200
# Evaluate the Ruby expression +expr+ against this record. Inside the
# expression the record is available as +rec+ and +r+, and the raw
# field list as +fields+. An Array result is returned joined by tabs.
#
# The expression is compiled into a method on the class and reused
# while the same expression is evaluated again; a different expression
# is recompiled instead of silently running the previous one.
#
# NOTE(review): +expr+ is executed as Ruby code, so it must come from a
# trusted source.
#
# expr                - String of Ruby code
# ignore_missing_data - when true a NoMethodError gives false instead
#                       of being re-raised
# quiet               - when true nothing is written to $stderr
def eval expr, ignore_missing_data, quiet
  klass = self.class
  unless respond_to?(:call_cached_eval) && klass.instance_variable_get(:@call_cached_eval_expr) == expr
    klass.class_eval(<<-CALL_CACHED_EVAL)
      def call_cached_eval(rec,fields)
        r = rec
        #{expr}
      end
    CALL_CACHED_EVAL
    # Remember which expression the compiled method belongs to
    klass.instance_variable_set(:@call_cached_eval_expr, expr)
  end
  res = call_cached_eval(self,@fields)
  if res.kind_of?(Array)
    res.join("\t")
  else
    res
  end
rescue NoMethodError => e
  if not quiet
    $stderr.print "RECORD ERROR!\n"
    $stderr.print [@fields],"\n"
    $stderr.print expr,"\n"
  end
  if ignore_missing_data
    $stderr.print e.message if not quiet
    return false
  else
    raise
  end
end
232
+
233
+ # Return the sample
234
# Treat an unknown method name as a sample name and return that sample.
# With a trailing '?' (e.g. rec.normal?) the answer is instead whether
# the raw sample column is non-empty.
def method_missing(m, *args, &block)
  name = m.to_s
  return sample[name] unless name =~ /\?$/
  # Query for empty sample name
  @sample_index ||= @header.sample_index
  !VcfSample::empty?(@fields[@sample_index[name.chop]])
end
244
+
117
245
  end
118
246
  end
@@ -0,0 +1,88 @@
1
module BioVcf
  module VcfSample

    # Check whether a sample is empty (on the raw string value). nil, a
    # blank string and './.' all count as empty.
    def VcfSample::empty? raw_sample
      return true if raw_sample.nil?
      s = raw_sample.strip
      s == './.' || s == ''
    end

    # One sample of a record, wrapping the record itself and the sample's
    # genotype field (an object that provides #format and #values).
    class Sample
      # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
      def initialize rec,sample
        @rec = rec
        @sample = sample
        @format = @sample.format
        @values = @sample.values
      end

      # Evaluate the Ruby expression +expr+ for this sample. Inside the
      # expression the record is available as +rec+ and +r+, and this
      # object as +sample+ and +s+.
      #
      # The expression is compiled into a method on the class and reused
      # while the same expression is evaluated again; a different
      # expression is recompiled instead of silently running the old one.
      #
      # NOTE(review): +expr+ is executed as Ruby code, so it must come
      # from a trusted source.
      #
      # With +ignore_missing_data+ a NoMethodError gives false instead of
      # being re-raised; +quiet+ suppresses the diagnostics on $stderr.
      def eval expr, ignore_missing_data, quiet
        klass = self.class
        unless respond_to?(:call_cached_eval) && klass.instance_variable_get(:@call_cached_eval_expr) == expr
          klass.class_eval(<<-CALL_CACHED_EVAL)
            def call_cached_eval(rec,sample)
              r = rec
              s = sample
              #{expr}
            end
          CALL_CACHED_EVAL
          # Remember which expression the compiled method belongs to
          klass.instance_variable_set(:@call_cached_eval_expr, expr)
        end
        call_cached_eval(@rec,self)
      rescue NoMethodError => e
        # Test the raw sample text; Array#to_s would never look empty
        empty = VcfSample::empty?(@values.join(':'))
        if not quiet
          $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if empty
          $stderr.print [@format,@values],"\n"
          $stderr.print expr,"\n"
        end
        if ignore_missing_data
          $stderr.print e.message if not quiet and not empty
          return false
        else
          raise
        end
      end

      # Split GT into index values; both the unphased '/' and the phased
      # '|' separator are accepted, and '.' becomes nil.
      def gti
        fetch_values("GT").split(/[\/|]/).map { |allele| allele == '.' ? nil : allele.to_i }
      end

      # Split GT into a nucleotide sequence (nil for a missing allele)
      def gts
        gti.map { |i| (i ? @rec.get_gt(i) : nil) }
      end

      # Define +name+ as a real method on the class so that later calls
      # no longer go through method_missing.
      def cache_method(name, &block)
        self.class.send(:define_method, name, &block)
      end

      # A method named after a FORMAT field (case-insensitive, e.g. s.dp)
      # returns that field's value, converted to a number when numeric.
      # The accessor is defined on first use.
      def method_missing(m, *args, &block)
        name = m.to_s.upcase
        if @format[name]
          cache_method(m) {
            ConvertStringToValue::convert(fetch_values(name))
          }
          self.send(m)
        else
          super(m, *args, &block)
        end
      end

      # Keep respond_to? in line with method_missing.
      def respond_to_missing?(m, include_private = false)
        (@format.respond_to?(:key?) && @format.key?(m.to_s.upcase)) || super
      end

    private

      def fetch_values name
        n = @format[name]
        raise "Unknown sample field <#{name}>" if not n
        @values[n]  # <-- save names with upcase!
      end

    end


  end
end