bio-vcf 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,4 +14,9 @@ require 'bio-vcf'
14
14
 
15
15
  require 'rspec/expectations'
16
16
 
17
+ # Add the regression module if in the path (it can also be a gem)
18
+ rootdir = File.dirname(__FILE__) + '/../..'
19
+ $LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
20
+ require 'regressiontest'
21
+
17
22
  include BioVcf
data/lib/bio-vcf.rb CHANGED
@@ -8,7 +8,9 @@
8
8
  #
9
9
  # In this file only require other files. Avoid other source code.
10
10
 
11
+ require 'bio-vcf/utils'
11
12
  require 'bio-vcf/vcf'
13
+ require 'bio-vcf/vcfsample'
12
14
  require 'bio-vcf/vcfheader'
13
15
  require 'bio-vcf/vcfline'
14
16
  require 'bio-vcf/vcfgenotypefield'
@@ -0,0 +1,23 @@
1
module BioVcf

  # Helpers for turning raw VCF field strings into Ruby numbers.
  #
  # Only plain base-10 notation is treated as numeric. Prefixed literals
  # such as "0x1A" or "0b11" are left as text, because Kernel#Integer would
  # accept them while String#to_i would silently turn them into 0.
  module ConvertStringToValue
    # Returns true when +str+ is an Integer or a base-10 integer string.
    def self.integer?(str)
      !parse_integer(str).nil?
    end

    # Returns true when +str+ is a Numeric or a decimal/scientific float
    # string (for example "150.268" or "9.68463e-05").
    def self.float?(str)
      !parse_float(str).nil?
    end

    # Convert +v+ to an Integer (the common case) or a Float when it looks
    # like one. Anything else, including nil and values that are not
    # Strings, is returned unchanged.
    def self.convert v
      return v unless v.is_a?(String)
      number = parse_integer(v)
      number = parse_float(v) if number.nil?
      number.nil? ? v : number
    end

    # Parse a base-10 integer; returns nil when +str+ is not one.
    def self.parse_integer(str)
      return str if str.is_a?(Integer)
      return nil unless str.is_a?(String)
      Integer(str, 10)
    rescue ArgumentError
      nil
    end

    # Parse a decimal float; returns nil when +str+ is not one.
    def self.parse_float(str)
      return str.to_f if str.is_a?(Numeric)
      return nil unless str.is_a?(String)
      # Kernel#Float accepts hexadecimal notation; VCF values never use it.
      return nil if str =~ /x/i
      Float(str)
    rescue ArgumentError
      nil
    end

    private_class_method :parse_integer, :parse_float
  end

end
@@ -3,23 +3,28 @@ module BioVcf
3
3
  MAXINT=100_000
4
4
 
5
5
  # Helper class for a list of (variant) values, such as A,G.
6
- # The [] function does the hard work (see ./features for examples)
7
- class VcfNucleotides
6
+ # The [] function does the hard work. You can pass in an index (integer)
7
+ # or nucleotide which translates to an index.
8
+ # (see ./features for examples)
9
+ class VcfNucleotideCount4
8
10
  def initialize alt,list
9
11
  @alt = alt
10
- @list = list.map{|i| i.to_i}
12
+ @list = list.split(/,/).map{|i| i.to_i}
11
13
  end
12
14
 
13
15
  def [] idx
14
16
  if idx.kind_of?(Integer)
15
- @list[idx].to_i
17
+ # return a value
18
+ @list[idx]
16
19
  elsif idx.kind_of?(String)
17
- @list[["A","C","G","T"].index(idx)].to_i
20
+ # return a value
21
+ @list[["A","C","G","T"].index(idx)]
18
22
  else idx.kind_of?(Array)
23
+ # return a list of values
19
24
  idx.map { |nuc|
20
25
  idx2 = ["A","C","G","T"].index(nuc)
21
26
  # p [idx,nuc,idx2,@list]
22
- @list[idx2].to_i
27
+ @list[idx2]
23
28
  }
24
29
  end
25
30
  end
@@ -47,10 +52,11 @@ module BioVcf
47
52
 
48
53
  end
49
54
 
50
- class VcfAltInfo
55
+ # Handle info fields with multiple entries, possibly relating to ALT (single nucleotide only)
56
+ class VcfAltInfoList
51
57
  def initialize alt,list
52
58
  @alt = alt
53
- @list = list.map{|i| i.to_i}
59
+ @list = list.split(/,/).map{|i| i.to_i}
54
60
  end
55
61
 
56
62
  def [] idx
@@ -86,43 +92,78 @@ module BioVcf
86
92
  end
87
93
 
88
94
  class VcfGenotypeField
95
+
96
+ attr_reader :format, :values, :header
97
+
89
98
  def initialize s, format, header, alt
90
- @values = s.split(/:/)
99
+ @is_empty = (s == '' or s == nil or s == './.')
100
+ @original_s = s
91
101
  @format = format
92
102
  @header = header
93
103
  @alt = alt
94
104
  end
95
105
 
96
- def dp4
97
- @values[@format['DP4']].split(',').map{|i| i.to_i}
106
+ def values
107
+ @cache_values ||= @original_s.split(/:/)
98
108
  end
99
109
 
100
- def ad
101
- @values[@format['AD']].split(',').map{|i| i.to_i}
110
+ def empty?
111
+ @is_empty
102
112
  end
103
113
 
104
- def pl
105
- @values[@format['PL']].split(',').map{|i| i.to_i}
114
+ def valid?
115
+ !@is_empty
116
+ end
117
+
118
+ def dp4
119
+ ilist('DP4')
120
+ end
121
+ def ad
122
+ ilist('AD')
123
+ end
124
+ def pl
125
+ ilist('PL')
106
126
  end
107
127
 
108
128
  def bcount
109
- VcfNucleotides.new(@alt,@values[@format['BCOUNT']].split(','))
129
+ VcfNucleotideCount4.new(@alt,values[fetch('BCOUNT')])
110
130
  end
111
131
 
112
132
  def bq
113
- VcfAltInfo.new(@alt,@values[@format['BQ']].split(','))
133
+ VcfAltInfoList.new(@alt,values[fetch('BQ')])
114
134
  end
115
135
 
116
136
  def amq
117
- VcfAltInfo.new(@alt,@values[@format['AMQ']].split(','))
137
+ VcfAltInfoList.new(@alt,values[fetch('AMQ')])
138
+ end
139
+
140
+ def method_missing(m, *args, &block)
141
+ return nil if @is_empty
142
+ if m =~ /\?$/
143
+ # query if a value exists, e.g., r.info.dp?
144
+ v = values[fetch(m.to_s.upcase.chop)]
145
+ v != nil
146
+ else
147
+ v = values[fetch(m.to_s.upcase)]
148
+ v = v.to_i if v =~ /^\d+$/
149
+ v = v.to_f if v =~ /^\d+\.\d+$/
150
+ v
151
+ end
152
+ end
153
+
154
+ private
155
+
156
+ def fetch name
157
+ raise "ERROR: Field with name #{name} does not exist!" if !@format[name]
158
+ @format[name]
159
+ end
160
+
161
+ def ilist name
162
+ v = values[fetch(name)]
163
+ return nil if not v
164
+ v.split(',').map{|i| i.to_i}
118
165
  end
119
166
 
120
- def method_missing(m, *args, &block)
121
- v = @values[@format[m.to_s.upcase]]
122
- v = v.to_i if v =~ /^\d+$/
123
- v = v.to_f if v =~ /^\d+\.\d+$/
124
- v
125
- end
126
167
 
127
168
  end
128
169
 
@@ -134,17 +175,21 @@ module BioVcf
134
175
  @header = header
135
176
  @alt = alt
136
177
  @samples = {} # lazy cache
137
- @index = {}
138
- @header.samples.each_with_index { |k,i| @index[k] = i+9 ; @index[k.downcase] = i+9 }
178
+ @sample_index = @header.sample_index()
139
179
  end
140
180
 
141
181
  def [] name
142
- @samples[name] ||= VcfGenotypeField.new(@fields[@index[name]],@format,@header,@alt)
182
+ @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
143
183
  end
144
184
 
145
185
  def method_missing(m, *args, &block)
146
186
  name = m.to_s
147
- @samples[name] ||= VcfGenotypeField.new(@fields[@index[name]],@format,@header,@alt)
187
+ if name =~ /\?$/
188
+ # test for valid sample
189
+ return !VcfSample::empty?(@fields[@sample_index[name.chop]])
190
+ else
191
+ @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
192
+ end
148
193
  end
149
194
 
150
195
  end
@@ -42,6 +42,14 @@ module BioVcf
42
42
  def samples
43
43
  @samples ||= column_names[9..-1]
44
44
  end
45
+
46
# Map every sample name, and its lower-case form, to its column position
# in a record (samples start at column 9). Built once and memoized.
def sample_index
  @sample_index ||= samples.each_with_index.each_with_object({}) do |(name, offset), lookup|
    column = offset + 9
    lookup[name] = column
    lookup[name.downcase] = column
  end
end
45
53
  end
46
54
 
47
55
  end
@@ -4,6 +4,7 @@ module BioVcf
4
4
  # Split a line into fields and check size
5
5
  def VcfLine.parse line,expected_size=nil
6
6
  fields = line.strip.split(/\t/)
7
+ raise "Unexpected line #{line}" if line.strip.size == 0 or fields.size < 6
7
8
  raise "Expected #{expected_size} fields but got #{fields.size} in "+fields.to_s if expected_size and fields.size != expected_size
8
9
  fields
9
10
  end
@@ -1,26 +1,72 @@
1
1
  module BioVcf
2
2
 
3
# Lazy accessor for the INFO column of a VCF record.
#
# The raw string is kept untouched until a lookup cannot be answered with a
# single regexp match, or a value is assigned; only then is it split into a
# Hash keyed by upper-cased field name.
class VcfRecordInfo
  def initialize s
    @info = s
  end

  # Render the INFO column. Returns the original string when nothing was
  # split yet; otherwise rebuilds it using the original key spelling.
  def to_s
    return @info unless @h
    @h.map { |k,v| (v ? "#{@original_key[k]}=#{v}" : @original_key[k]) }.join(';')
  end

  # Set INFO fields (used by --rewrite)
  def []= k, v
    split_fields if not @h
    kupper = k.upcase
    @h[kupper] = v
    @original_key[kupper] = k
  end

  # Field lookup by method name, e.g. +info.dp+. Returns the value
  # converted through ConvertStringToValue, or nil for flags and unknown
  # fields.
  def method_missing(m, *args, &block)
    key = m.to_s.upcase
    v = if @h
          @h[key]
        else
          # Fast path: match the whole key at the start of the string or
          # right after a ';', so that DP does not pick up ADP=...
          found = @info.match(/(?:\A|;)#{Regexp.escape(key)}=([^;]+)/)
          if found
            found[1]
          else
            split_fields # no option but to split
            @h[key]
          end
        end
    ConvertStringToValue::convert(v)
  end

  private

  # Split the raw INFO string into @h (upper-cased key => value) and
  # @original_key (upper-cased key => key as written).
  def split_fields
    @h = {}
    @original_key = {}
    @info.split(/;/).each do |f|
      next if f.empty? # tolerate stray ';;'
      # Limit 2 keeps '=' characters that are part of the value
      k,v = f.split(/=/, 2)
      v = nil if v && v.empty?
      kupper = k.upcase
      @h[kupper] = v
      @original_key[kupper] = k
    end
  end
end
17
57
 
18
58
  module VcfRecordParser
19
59
  # Parse the format field into a Hash
20
60
  def VcfRecordParser.get_format s
21
- h = {}
22
- s.split(/:/).each_with_index { |v,i| h[v] = i }
23
- h
61
+ if s==$cached_sample_format_s
62
+ $cached_sample_format
63
+ else
64
+ h = {}
65
+ s.split(/:/).each_with_index { |v,i| h[v] = i }
66
+ $cached_sample_format = h
67
+ $cached_sample_format_s = s
68
+ h
69
+ end
24
70
  end
25
71
  def VcfRecordParser.get_info s
26
72
  VcfRecordInfo.new(s)
@@ -36,6 +82,15 @@ module BioVcf
36
82
  ['A','C','G','T'][index()]
37
83
  end
38
84
 
85
# Translate a genotype index into its allele: 0 is REF, any other
# index selects ALT[index-1].
def get_gt(index)
  return ref() if index == 0
  alt[index-1]
end
93
+
39
94
  def call_tumor_count
40
95
  tumor.bcount.to_ary[index()]
41
96
  end
@@ -63,11 +118,13 @@ module BioVcf
63
118
  @fields = fields
64
119
  @header = header
65
120
  end
66
-
121
+
67
122
  def chrom
68
123
  @fields[0]
69
124
  end
70
125
 
126
+ alias :chr :chrom
127
+
71
128
  def pos
72
129
  @pos ||= @fields[1].to_i
73
130
  end
@@ -114,5 +171,76 @@ module BioVcf
114
171
  def sample
115
172
  @sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
116
173
  end
174
+
175
# Fetch the genotype field of the sample called +name+.
def sample_by_name name
  genotypes = sample
  genotypes[name]
end
178
+
179
# Yield a VcfSample::Sample for every sample column of the header.
# When +list+ is given, only samples whose 0-based position (as a string)
# appears in it are yielded. Raises when the header has no sample columns.
def each_sample(list = nil)
  names = @header.column_names[9..-1]
  raise "Empty sample list, can not execute query!" unless names
  names.each_with_index do |name, i|
    wanted = !list || list.index(i.to_s)
    yield VcfSample::Sample.new(self,sample[name]) if wanted
  end
end
188
+
189
# True when at least one sample column of this record is empty.
def missing_samples?
  @fields[9..-1].any? { |raw| VcfSample::empty?(raw) }
end
195
+
196
# A record is valid when it has exactly one field per header column.
def valid?
  expected = @header.column_names.size
  expected == @fields.size
end
199
+
200
# Evaluate the Ruby expression +expr+ against this record. Inside the
# expression +rec+/+r+ refer to the record and +fields+ to its raw fields.
#
# The expression is compiled once into #call_cached_eval on the class and
# reused while the same +expr+ is passed; a different +expr+ triggers a
# recompile instead of silently re-running the previous one.
#
# expr                - String of Ruby code. SECURITY: it is executed
#                       verbatim, never pass untrusted input.
# ignore_missing_data - when true a NoMethodError yields false instead of
#                       being re-raised
# quiet               - suppress diagnostics on $stderr
#
# Returns the expression result; an Array result is joined with tabs.
def eval expr, ignore_missing_data, quiet
  begin
    klass = self.class
    current = klass.instance_variable_defined?(:@cached_eval_expr) &&
              klass.instance_variable_get(:@cached_eval_expr) == expr
    if not current or not respond_to?(:call_cached_eval)
      code = <<-CODE
        def call_cached_eval(rec,fields)
          r = rec
          #{expr}
        end
      CODE
      klass.class_eval(code)
      klass.instance_variable_set(:@cached_eval_expr, expr)
    end
    res = call_cached_eval(self,@fields)
    if res.kind_of?(Array)
      res.join("\t")
    else
      res
    end
  rescue NoMethodError => e
    if not quiet
      $stderr.print "RECORD ERROR!\n"
      $stderr.print [@fields],"\n"
      $stderr.print expr,"\n"
    end
    if ignore_missing_data
      $stderr.print e.message if not quiet
      return false
    else
      raise
    end
  end
end
232
+
233
# Return the sample named after the called method. A trailing '?'
# (e.g. +rec.normal?+) instead tests that the raw sample column is not
# empty.
def method_missing(m, *args, &block)
  name = m.to_s
  return sample[name] unless name =~ /\?$/
  # Query for empty sample name
  @sample_index ||= @header.sample_index
  !VcfSample::empty?(@fields[@sample_index[name.chop]])
end
244
+
117
245
  end
118
246
  end
@@ -0,0 +1,88 @@
1
module BioVcf
  module VcfSample

    # Check whether a sample is empty (on the raw string value). A missing
    # (nil) column counts as empty.
    def VcfSample::empty? raw_sample
      return true if raw_sample.nil?
      s = raw_sample.strip
      s == './.' || s == ''
    end

    # Wraps one sample of a record for expression evaluation. +sample+ is
    # expected to respond to #format (field name => index, e.g.
    # {"GT"=>0, "AD"=>1, "DP"=>2}) and #values (the split sample values,
    # e.g. ["0/0", "151,8", "159"]).
    class Sample
      def initialize rec,sample
        @rec = rec
        @sample = sample
        @format = @sample.format
        @values = @sample.values
      end

      # Evaluate the Ruby expression +expr+ with +rec+/+r+ bound to the
      # record and +sample+/+s+ bound to this sample.
      #
      # The expression is compiled once into #call_cached_eval on the class
      # and reused while the same +expr+ is passed; a different +expr+
      # triggers a recompile instead of silently re-running the old one.
      # SECURITY: +expr+ is executed verbatim, never pass untrusted input.
      #
      # With +ignore_missing_data+ a NoMethodError yields false, otherwise
      # it is re-raised. +quiet+ suppresses part of the diagnostics.
      def eval expr, ignore_missing_data, quiet
        begin
          klass = self.class
          current = klass.instance_variable_defined?(:@cached_eval_expr) &&
                    klass.instance_variable_get(:@cached_eval_expr) == expr
          if not current or not respond_to?(:call_cached_eval)
            code = <<-CODE
              def call_cached_eval(rec,sample)
                r = rec
                s = sample
                #{expr}
              end
            CODE
            klass.class_eval(code)
            klass.instance_variable_set(:@cached_eval_expr, expr)
          end
          call_cached_eval(@rec,self)
        rescue NoMethodError => e
          # NOTE(review): values is passed through to_s before the empty
          # check; if it is an Array the result never equals './.' --
          # confirm the intended behaviour of this diagnostic.
          empty = VcfSample::empty?(@sample.values.to_s)
          $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty
          if not quiet
            $stderr.print [@format,@values],"\n"
            $stderr.print expr,"\n"
          end
          if ignore_missing_data
            $stderr.print e.message if not quiet and not empty
            return false
          else
            raise
          end
        end
      end

      # Split GT into index values. Both the unphased ('/') and the phased
      # ('|') separator are understood; '.' becomes nil.
      def gti
        fetch_values("GT").split(/[\/|]/).map { |allele| (allele == '.' ? nil : allele.to_i) }
      end

      # Split GT into a nucleotide sequence
      def gts
        gti.map { |i| (i ? @rec.get_gt(i) : nil) }
      end

      # Define +name+ as a real method on the class so later calls skip
      # method_missing.
      def cache_method(name, &block)
        self.class.send(:define_method, name, &block)
      end

      # Field access by method name (e.g. +s.dp+): the first call defines
      # an accessor for the field and then invokes it. Names that are not
      # part of the format fall through to the default behaviour.
      def method_missing(m, *args, &block)
        name = m.to_s.upcase
        if @format[name]
          cache_method(m) {
            ConvertStringToValue::convert(fetch_values(name))
          }
          self.send(m)
        else
          super(m, *args, &block)
        end
      end

      # Keep respond_to? in line with method_missing.
      def respond_to_missing?(m, include_private = false)
        (@format && @format[m.to_s.upcase]) ? true : super
      end

      private

      # Raw value of field +name+; raises when the field is not in the
      # format.
      def fetch_values name
        n = @format[name]
        raise "Unknown sample field <#{name}>" if not n
        @values[n] # <-- save names with upcase!
      end

    end


  end
end