bio-vcf 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/Gemfile.lock +8 -0
- data/README.md +376 -11
- data/VERSION +1 -1
- data/bin/bio-vcf +172 -39
- data/bio-vcf.gemspec +18 -3
- data/features/cli.feature +32 -0
- data/features/multisample.feature +28 -10
- data/features/step_definitions/cli-feature.rb +12 -0
- data/features/step_definitions/multisample.rb +64 -18
- data/features/support/env.rb +5 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/utils.rb +23 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +73 -28
- data/lib/bio-vcf/vcfheader.rb +8 -0
- data/lib/bio-vcf/vcfline.rb +1 -0
- data/lib/bio-vcf/vcfrecord.rb +142 -14
- data/lib/bio-vcf/vcfsample.rb +88 -0
- data/test/data/input/dbsnp.vcf +200 -0
- data/test/data/input/multisample.vcf +2 -2
- data/test/data/regression/eval_r.info.dp.ref +150 -0
- data/test/data/regression/r.info.dp.ref +147 -0
- data/test/data/regression/rewrite.info.sample.ref +150 -0
- data/test/data/regression/s.dp.ref +145 -0
- data/test/data/regression/seval_s.dp.ref +36 -0
- data/test/data/regression/sfilter001.ref +145 -0
- data/test/performance/metrics.md +98 -0
- metadata +28 -2
data/features/support/env.rb
CHANGED
@@ -14,4 +14,9 @@ require 'bio-vcf'
|
|
14
14
|
|
15
15
|
require 'rspec/expectations'
|
16
16
|
|
17
|
+
# Add the regression module if in the path (it can also be a gem)
|
18
|
+
rootdir = File.dirname(__FILE__) + '/../..'
|
19
|
+
$LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
|
20
|
+
require 'regressiontest'
|
21
|
+
|
17
22
|
include BioVcf
|
data/lib/bio-vcf.rb
CHANGED
@@ -0,0 +1,23 @@
|
|
1
|
+
module BioVcf

  # Helpers for turning raw VCF field strings into Ruby numbers.
  module ConvertStringToValue
    # Float() accepts hexadecimal notation, String#to_f does not; such
    # strings must not be reported as floats or they would convert to 0.0.
    HEX_PREFIX = /\A\s*[-+]?0x/i

    # Returns true when +str+ can be read as an integer. Strings are parsed
    # strictly as base 10 so that the answer agrees with String#to_i
    # ("0x1A" and "0b11" are rejected, "010" means ten). Never raises.
    def self.integer?(str)
      str.is_a?(String) ? Integer(str, 10) : Integer(str)
      true
    rescue StandardError
      false
    end

    # Returns true when +str+ can be read as a decimal float, e.g.
    # 150.268 or 9.68463e-05. Never raises.
    def self.float?(str)
      return false if str.is_a?(String) && str =~ HEX_PREFIX
      Float(str)
      true
    rescue StandardError
      false
    end

    # Convert a field value to an Integer or Float when it looks like one.
    # Anything else (including nil and values that are not strings) is
    # returned unchanged.
    def self.convert v
      return v unless v.is_a?(String)
      if integer?(v) # the common case
        v.to_i
      elsif float?(v)
        # 150.268 or 9.68463e-05
        v.to_f
      else
        v
      end
    end
  end

end
|
@@ -3,23 +3,28 @@ module BioVcf
|
|
3
3
|
MAXINT=100_000
|
4
4
|
|
5
5
|
# Helper class for a list of (variant) values, such as A,G.
|
6
|
-
# The [] function does the hard work
|
7
|
-
|
6
|
+
# The [] function does the hard work. You can pass in an index (integer)
|
7
|
+
# or nucleotide which translates to an index.
|
8
|
+
# (see ./features for examples)
|
9
|
+
class VcfNucleotideCount4
|
8
10
|
def initialize alt,list
|
9
11
|
@alt = alt
|
10
|
-
@list = list.map{|i| i.to_i}
|
12
|
+
@list = list.split(/,/).map{|i| i.to_i}
|
11
13
|
end
|
12
14
|
|
13
15
|
# Fetch counts from the list.
#
# idx may be
# * an Integer: returns the value at that position,
# * a String nucleotide ("A", "C", "G" or "T"): returns the value at the
#   position of that nucleotide in A,C,G,T order,
# * a list of nucleotides: returns a list of values.
#
# A nucleotide outside A,C,G,T yields nil instead of raising a TypeError.
def [] idx
  nucleotides = ["A","C","G","T"]
  value_for = lambda { |nuc|
    pos = nucleotides.index(nuc)
    pos && @list[pos]
  }
  case idx
  when Integer
    # return a value
    @list[idx]
  when String
    # return a value
    value_for.call(idx)
  else
    # return a list of values
    idx.map { |nuc| value_for.call(nuc) }
  end
end
|
@@ -47,10 +52,11 @@ module BioVcf
|
|
47
52
|
|
48
53
|
end
|
49
54
|
|
50
|
-
|
55
|
+
# Handle info fields with multiple entries, possibly relating to ALT (single nucleotide only)
|
56
|
+
class VcfAltInfoList
|
51
57
|
def initialize alt,list
|
52
58
|
@alt = alt
|
53
|
-
@list = list.map{|i| i.to_i}
|
59
|
+
@list = list.split(/,/).map{|i| i.to_i}
|
54
60
|
end
|
55
61
|
|
56
62
|
def [] idx
|
@@ -86,43 +92,78 @@ module BioVcf
|
|
86
92
|
end
|
87
93
|
|
88
94
|
# One sample (genotype) column of a VCF record.
#
# The raw column string is kept and only split on ':' when a value is first
# requested. +format+ maps a FORMAT tag name to the position of its value.
class VcfGenotypeField

  attr_reader :format, :header

  # s is the raw sample string (nil, '' and './.' count as empty), format the
  # tag => position Hash, header and alt are stored for later use.
  def initialize s, format, header, alt
    @is_empty = (s.nil? || s == '' || s == './.')
    @original_s = s
    @format = format
    @header = header
    @alt = alt
  end

  # The ':'-separated values of the sample, split once and cached. A nil
  # sample gives an empty list rather than raising.
  def values
    @cache_values ||= @original_s.to_s.split(/:/)
  end

  def empty?
    @is_empty
  end

  def valid?
    !@is_empty
  end

  def dp4
    ilist('DP4')
  end

  def ad
    ilist('AD')
  end

  def pl
    ilist('PL')
  end

  def bcount
    VcfNucleotideCount4.new(@alt,values[fetch('BCOUNT')])
  end

  def bq
    VcfAltInfoList.new(@alt,values[fetch('BQ')])
  end

  def amq
    VcfAltInfoList.new(@alt,values[fetch('AMQ')])
  end

  # Any other method name is treated as a FORMAT tag (case-insensitive).
  # name? tells whether a value is present, name returns the value, as an
  # Integer or Float when it is a plain (optionally negative) decimal number.
  # Returns nil for an empty sample; raises when the tag is not in FORMAT.
  def method_missing(m, *args, &block)
    return nil if @is_empty
    name = m.to_s
    if name =~ /\?\z/
      # query if a value exists, e.g., r.info.dp?
      !values[fetch(name.chop.upcase)].nil?
    else
      v = values[fetch(name.upcase)]
      # Only ever match the regexes against the original value: running a
      # second =~ on the already converted Integer fails on Ruby >= 3.2.
      case v
      when /\A-?\d+\z/ then v.to_i
      when /\A-?\d+\.\d+\z/ then v.to_f
      else v
      end
    end
  end

  private

  # Position of the named tag in the sample; raises when FORMAT lacks it.
  def fetch name
    raise "ERROR: Field with name #{name} does not exist!" if !@format[name]
    @format[name]
  end

  # Comma-separated integer list for the named tag, nil when no value.
  def ilist name
    v = values[fetch(name)]
    return nil if not v
    v.split(',').map{|i| i.to_i}
  end

end
|
128
169
|
|
@@ -134,17 +175,21 @@ module BioVcf
|
|
134
175
|
@header = header
|
135
176
|
@alt = alt
|
136
177
|
@samples = {} # lazy cache
|
137
|
-
@
|
138
|
-
@header.samples.each_with_index { |k,i| @index[k] = i+9 ; @index[k.downcase] = i+9 }
|
178
|
+
@sample_index = @header.sample_index()
|
139
179
|
end
|
140
180
|
|
141
181
|
def [] name
|
142
|
-
@samples[name] ||= VcfGenotypeField.new(@fields[@
|
182
|
+
@samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
|
143
183
|
end
|
144
184
|
|
145
185
|
def method_missing(m, *args, &block)
|
146
186
|
name = m.to_s
|
147
|
-
|
187
|
+
if name =~ /\?$/
|
188
|
+
# test for valid sample
|
189
|
+
return !VcfSample::empty?(@fields[@sample_index[name.chop]])
|
190
|
+
else
|
191
|
+
@samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
|
192
|
+
end
|
148
193
|
end
|
149
194
|
|
150
195
|
end
|
data/lib/bio-vcf/vcfheader.rb
CHANGED
@@ -42,6 +42,14 @@ module BioVcf
|
|
42
42
|
def samples
|
43
43
|
@samples ||= column_names[9..-1]
|
44
44
|
end
|
45
|
+
|
46
|
+
# Map every sample name, and its lower-case form, to its column position
# (the sample's position in the samples list plus 9). Built once and cached.
def sample_index
  @sample_index ||= samples.each_with_index.each_with_object({}) do |(name, i), index|
    column = i + 9
    index[name] = column
    index[name.downcase] = column
  end
end
|
45
53
|
end
|
46
54
|
|
47
55
|
end
|
data/lib/bio-vcf/vcfline.rb
CHANGED
@@ -4,6 +4,7 @@ module BioVcf
|
|
4
4
|
# Split a line into fields and check size
|
5
5
|
# Returns the tab-separated fields of the stripped line. Raises when the line
# is blank or has fewer than 6 fields, and when expected_size is given and
# the number of fields differs from it.
def VcfLine.parse line,expected_size=nil
  stripped = line.strip
  fields = stripped.split(/\t/)
  raise "Unexpected line #{line}" if stripped.empty? || fields.size < 6
  if expected_size && fields.size != expected_size
    raise "Expected #{expected_size} fields but got #{fields.size} in "+fields.to_s
  end
  fields
end
|
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -1,26 +1,72 @@
|
|
1
1
|
module BioVcf
|
2
2
|
|
3
|
-
class VcfRecordInfo
|
3
|
+
# Accessor for the INFO column of a VCF record.
#
# The raw string is kept as is. A value is first looked up with a regular
# expression; the string is only split into a Hash when that lookup fails or
# when a field is assigned. Keys are matched case-insensitively by storing
# them upper-cased, the original spelling is remembered for output.
class VcfRecordInfo
  def initialize s
    @info = s
  end

  # The INFO string; rebuilt from the Hash once the fields have been split.
  # Fields without a value are written as the bare key.
  def to_s
    return @info unless @h
    @h.map { |k,v|
      # Interpolate so that values assigned as numbers are written too
      v ? "#{@original_key[k]}=#{v}" : @original_key[k]
    }.join(';')
  end

  # Set INFO fields (used by --rewrite)
  def []= k, v
    split_fields if not @h
    kupper = k.upcase
    @h[kupper] = v
    @original_key[kupper] = k
  end

  # Any method name is treated as an INFO key, e.g. info.dp. Returns the
  # value converted by ConvertStringToValue, or nil when the key is absent.
  def method_missing(m, *args, &block)
    key = m.to_s.upcase
    v = if @h
          @h[key]
        else
          # The key has to start a field (start of string or after ';'),
          # otherwise DP would also match inside MDP=...
          found = /(?:\A|;)#{Regexp.escape(key)}=([^;]+)/.match(@info)
          if found
            found[1]
          else
            split_fields # no option but to split
            @h[key]
          end
        end
    ConvertStringToValue::convert(v)
  end

  private

  # Split the raw INFO string into @h (upper-cased key => value) and
  # @original_key (upper-cased key => key as written).
  def split_fields
    @h = {}
    @original_key = {}
    @info.split(/;/).each do |f|
      next if f.empty? # tolerate stray ';;'
      # Split on the first '=' only, a value may itself contain '='
      k,v = f.split('=', 2)
      kupper = k.upcase
      @h[kupper] = v
      @original_key[kupper] = k
    end
  end
end
|
17
57
|
|
18
58
|
module VcfRecordParser
|
19
59
|
# Parse the format field into a Hash
|
20
60
|
def VcfRecordParser.get_format s
  # Same FORMAT string as on the previous call: reuse the Hash built then
  return $cached_sample_format if s == $cached_sample_format_s

  positions = {}
  s.split(/:/).each_with_index { |tag,i| positions[tag] = i }
  $cached_sample_format = positions
  $cached_sample_format_s = s
  positions
end
|
25
71
|
def VcfRecordParser.get_info s
|
26
72
|
VcfRecordInfo.new(s)
|
@@ -36,6 +82,15 @@ module BioVcf
|
|
36
82
|
['A','C','G','T'][index()]
|
37
83
|
end
|
38
84
|
|
85
|
+
# Translate a genotype index into an allele: index 0 gives REF, index n > 0
# gives the n-th ALT entry.
def get_gt(index)
  return ref() if index == 0
  alt[index-1]
end
|
93
|
+
|
39
94
|
def call_tumor_count
|
40
95
|
tumor.bcount.to_ary[index()]
|
41
96
|
end
|
@@ -63,11 +118,13 @@ module BioVcf
|
|
63
118
|
@fields = fields
|
64
119
|
@header = header
|
65
120
|
end
|
66
|
-
|
121
|
+
|
67
122
|
def chrom
|
68
123
|
@fields[0]
|
69
124
|
end
|
70
125
|
|
126
|
+
alias :chr :chrom
|
127
|
+
|
71
128
|
def pos
|
72
129
|
@pos ||= @fields[1].to_i
|
73
130
|
end
|
@@ -114,5 +171,76 @@ module BioVcf
|
|
114
171
|
def sample
|
115
172
|
@sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
|
116
173
|
end
|
174
|
+
|
175
|
+
def sample_by_name name
|
176
|
+
sample[name]
|
177
|
+
end
|
178
|
+
|
179
|
+
# Yield a VcfSample::Sample for every sample column named in the header.
# When list is given, only samples whose position (as a String, counting
# from "0") occurs in list are yielded.
def each_sample(list = nil)
  names = @header.column_names[9..-1]
  raise "Empty sample list, can not execute query!" unless names
  names.each_with_index do |name, i|
    wanted = !list || list.index(i.to_s)
    yield VcfSample::Sample.new(self,sample[name]) if wanted
  end
end
|
188
|
+
|
189
|
+
# True when at least one sample column of this record (field 10 onwards) is
# empty according to VcfSample::empty?
def missing_samples?
  @fields[9..-1].any? { |raw| VcfSample::empty?(raw) }
end
|
195
|
+
|
196
|
+
def valid?
|
197
|
+
@fields.size == @header.column_names.size
|
198
|
+
end
|
199
|
+
|
200
|
+
def eval expr, ignore_missing_data, quiet
|
201
|
+
begin
|
202
|
+
if not respond_to?(:call_cached_eval)
|
203
|
+
code =
|
204
|
+
"""
|
205
|
+
def call_cached_eval(rec,fields)
|
206
|
+
r = rec
|
207
|
+
#{expr}
|
208
|
+
end
|
209
|
+
"""
|
210
|
+
self.class.class_eval(code)
|
211
|
+
end
|
212
|
+
res = call_cached_eval(self,@fields)
|
213
|
+
if res.kind_of?(Array)
|
214
|
+
res.join("\t")
|
215
|
+
else
|
216
|
+
res
|
217
|
+
end
|
218
|
+
rescue NoMethodError => e
|
219
|
+
if not quiet
|
220
|
+
$stderr.print "RECORD ERROR!\n"
|
221
|
+
$stderr.print [@fields],"\n"
|
222
|
+
$stderr.print expr,"\n"
|
223
|
+
end
|
224
|
+
if ignore_missing_data
|
225
|
+
$stderr.print e.message if not quiet
|
226
|
+
return false
|
227
|
+
else
|
228
|
+
raise
|
229
|
+
end
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
# Treat an unknown method as a sample name: rec.name returns that sample,
# rec.name? tells whether the sample's column holds data (is not empty).
def method_missing(m, *args, &block)
  name = m.to_s
  return sample[name] unless name =~ /\?$/

  # Query for empty sample name
  @sample_index ||= @header.sample_index
  !VcfSample::empty?(@fields[@sample_index[name.chop]])
end
|
244
|
+
|
117
245
|
end
|
118
246
|
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module BioVcf
  module VcfSample

    # Check whether a sample is empty (on the raw string value). nil, a
    # blank string and './.' all count as empty.
    def VcfSample::empty? raw_sample
      # nil has to be tested before calling strip on it
      return true if raw_sample.nil?
      s = raw_sample.strip
      s == './.' || s == ''
    end

    # A sample of one record, wrapping the record and its genotype field so
    # that FORMAT values can be read as methods (s.dp, s.gq, ...).
    class Sample
      # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
      def initialize rec,sample
        @rec = rec
        @sample = sample
        @format = @sample.format
        @values = @sample.values
      end

      # Evaluate the Ruby expression expr with r bound to the record and s to
      # this sample. The expression is compiled into a method on the class
      # the first time only; later calls reuse that method whatever expr is.
      #
      # On a NoMethodError the error is reported on stderr (unless quiet) and
      # false is returned when ignore_missing_data is set, otherwise the
      # error is raised again.
      #
      # NOTE: expr is executed as Ruby code, only pass trusted expressions.
      def eval expr, ignore_missing_data, quiet
        begin
          if not respond_to?(:call_cached_eval)
            code =
            """
            def call_cached_eval(rec,sample)
              r = rec
              s = sample
              #{expr}
            end
            """
            self.class.class_eval(code)
          end
          call_cached_eval(@rec,self)
        rescue NoMethodError => e
          # NOTE(review): values.to_s is the printed form of the values list,
          # which looks like it can never equal './.' - confirm the intent.
          empty = VcfSample::empty?(@sample.values.to_s)
          $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty
          if not quiet
            $stderr.print [@format,@values],"\n"
            $stderr.print expr,"\n"
          end
          if ignore_missing_data
            $stderr.print e.message if not quiet and not empty
            return false
          else
            raise
          end
        end
      end

      # Split GT into index values, nil for a missing allele ('.'). Both the
      # unphased '/' and the phased '|' separator are understood.
      def gti
        fetch_values("GT").split(/[\/|]/).map { |allele| allele == '.' ? nil : allele.to_i }
      end

      # Split GT into a nucleotide sequence
      def gts
        gti.map { |i| (i ? @rec.get_gt(i) : nil) }
      end

      # Define name as a real method on the class so the next call does not
      # go through method_missing.
      def cache_method(name, &block)
        self.class.send(:define_method, name, &block)
      end

      # A method named after a FORMAT tag (case-insensitive) returns the
      # converted value of that tag; anything else is a NoMethodError.
      def method_missing(m, *args, &block)
        name = m.to_s.upcase
        if @format[name]
          cache_method(m) {
            ConvertStringToValue::convert(fetch_values(name))
          }
          self.send(m)
        else
          super(m, *args, &block)
        end
      end

    private

      # Raw value of the named (upper-case) FORMAT tag
      def fetch_values name
        n = @format[name]
        raise "Unknown sample field <#{name}>" if not n
        @values[n]  # <-- save names with upcase!
      end

    end

  end
end
|