bio-vcf 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/Gemfile.lock +8 -0
- data/README.md +376 -11
- data/VERSION +1 -1
- data/bin/bio-vcf +172 -39
- data/bio-vcf.gemspec +18 -3
- data/features/cli.feature +32 -0
- data/features/multisample.feature +28 -10
- data/features/step_definitions/cli-feature.rb +12 -0
- data/features/step_definitions/multisample.rb +64 -18
- data/features/support/env.rb +5 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/utils.rb +23 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +73 -28
- data/lib/bio-vcf/vcfheader.rb +8 -0
- data/lib/bio-vcf/vcfline.rb +1 -0
- data/lib/bio-vcf/vcfrecord.rb +142 -14
- data/lib/bio-vcf/vcfsample.rb +88 -0
- data/test/data/input/dbsnp.vcf +200 -0
- data/test/data/input/multisample.vcf +2 -2
- data/test/data/regression/eval_r.info.dp.ref +150 -0
- data/test/data/regression/r.info.dp.ref +147 -0
- data/test/data/regression/rewrite.info.sample.ref +150 -0
- data/test/data/regression/s.dp.ref +145 -0
- data/test/data/regression/seval_s.dp.ref +36 -0
- data/test/data/regression/sfilter001.ref +145 -0
- data/test/performance/metrics.md +98 -0
- metadata +28 -2
data/features/support/env.rb
CHANGED
@@ -14,4 +14,9 @@ require 'bio-vcf'
|
|
14
14
|
|
15
15
|
require 'rspec/expectations'
|
16
16
|
|
17
|
+
# Add the regression module if in the path (it can also be a gem)
|
18
|
+
rootdir = File.dirname(__FILE__) + '/../..'
|
19
|
+
$LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
|
20
|
+
require 'regressiontest'
|
21
|
+
|
17
22
|
include BioVcf
|
data/lib/bio-vcf.rb
CHANGED
@@ -0,0 +1,23 @@
|
|
1
|
+
module BioVcf

  # Helpers that turn the raw text of a VCF field into a Ruby number when,
  # and only when, that text is a plain decimal number. Anything else is
  # handed back untouched.
  module ConvertStringToValue
    # Hexadecimal notation such as 0x1A: Kernel#Float parses it, but
    # String#to_f yields 0.0, so it must not be classified as a number.
    HEX_PREFIX = /\A\s*[+-]?0[xX]/

    # True when +str+ is an Integer, or a String holding a base-10 integer.
    def self::integer?(str)
      return true if str.kind_of?(Integer)
      return false unless str.kind_of?(String)
      # Base 10 is forced so radix literals (0x1A, 0b11, 0o17) are rejected
      # and a leading zero is not read as octal - matching String#to_i.
      Integer(str, 10)
      true
    rescue ArgumentError
      false
    end

    # True when +str+ is an Integer/Float, or a String holding a decimal
    # floating point number (150.268 or 9.68463e-05).
    def self::float?(str)
      return true if str.kind_of?(Integer) or str.kind_of?(Float)
      return false unless str.kind_of?(String)
      return false if str =~ HEX_PREFIX
      Float(str)
      true
    rescue ArgumentError
      false
    end

    # Convert +v+ to an Integer or Float when it looks like one.
    #
    # Returns the converted number; nil, non-numeric text and any other
    # object are returned unchanged. A Float argument is no longer truncated
    # to an Integer.
    def self::convert v
      if integer?(v) # the common case
        v.to_i
      elsif float?(v)
        # 150.268 or 9.68463e-05
        v.to_f
      else
        v
      end
    end
  end

end
|
@@ -3,23 +3,28 @@ module BioVcf
|
|
3
3
|
MAXINT=100_000
|
4
4
|
|
5
5
|
# Helper class for a list of (variant) values, such as A,G.
|
6
|
-
# The [] function does the hard work
|
7
|
-
|
6
|
+
# The [] function does the hard work. You can pass in an index (integer)
|
7
|
+
# or nucleotide which translates to an index.
|
8
|
+
# (see ./features for examples)
|
9
|
+
class VcfNucleotideCount4
|
8
10
|
# Store the ALT value and the list of counts.
#
# alt::  ALT value of the record, kept as given
# list:: the counts, either as comma-separated text ("0,3,0,12") or - as the
#        previous release of this method expected - an Array of values.
#        nil (a missing field) gives an empty list instead of a crash.
#
# Every entry is coerced with to_i.
def initialize alt,list
  @alt = alt
  entries = list.kind_of?(Array) ? list : list.to_s.split(/,/)
  @list = entries.map{|i| i.to_i}
end
|
12
14
|
|
13
15
|
# Look up one or more values from the list.
#
# idx:: an Integer position in the list, a nucleotide String ("A", "C", "G"
#       or "T", upper or lower case) which is translated to position 0..3,
#       or a collection of nucleotides
#
# Returns a single value for an Integer or String, and an Array of values
# for a collection. An unknown nucleotide still raises (there is no position
# to translate it to).
def [] idx
  nucleotides = ["A","C","G","T"]
  if idx.kind_of?(Integer)
    # return a value
    @list[idx]
  elsif idx.kind_of?(String)
    # return a value
    @list[nucleotides.index(idx.upcase)]
  else
    # anything else (normally an Array) is mapped: return a list of values
    idx.map { |nuc| @list[nucleotides.index(nuc.to_s.upcase)] }
  end
end
|
@@ -47,10 +52,11 @@ module BioVcf
|
|
47
52
|
|
48
53
|
end
|
49
54
|
|
50
|
-
|
55
|
+
# Handle info fields with multiple entries, possibly relating to ALT (single nucleotide only)
|
56
|
+
class VcfAltInfoList
|
51
57
|
# Store the ALT value and the list of entries.
#
# alt::  ALT value of the record, kept as given
# list:: the entries, either as comma-separated text ("31,40") or - as the
#        previous release of this method expected - an Array of values.
#        nil (a missing field) gives an empty list instead of a crash.
#
# Every entry is coerced with to_i.
def initialize alt,list
  @alt = alt
  entries = list.kind_of?(Array) ? list : list.to_s.split(/,/)
  @list = entries.map{|i| i.to_i}
end
|
55
61
|
|
56
62
|
def [] idx
|
@@ -86,43 +92,78 @@ module BioVcf
|
|
86
92
|
end
|
87
93
|
|
88
94
|
# One sample column of a record. The raw text is kept as is and only split
# on ':' when a value is first asked for. +format+ maps a field name
# (e.g. "DP") to its position in the split values.
class VcfGenotypeField

  # :values is deliberately not listed here - the lazy #values method below
  # is the reader (listing both redefined the method and read an unset ivar).
  attr_reader :format, :header

  # s::      raw sample text (nil, '' and './.' count as empty)
  # format:: Hash of field name => position
  # header:: stored and exposed through #header
  # alt::    ALT value, passed on to the list helpers
  def initialize s, format, header, alt
    @is_empty = (s == '' or s == nil or s == './.')
    @original_s = s
    @format = format
    @header = header
    @alt = alt
  end

  # The sample text split on ':' (computed once). A nil sample gives [].
  def values
    @cache_values ||= @original_s.to_s.split(/:/)
  end

  def empty?
    @is_empty
  end

  def valid?
    !@is_empty
  end

  def dp4
    ilist('DP4')
  end

  def ad
    ilist('AD')
  end

  def pl
    ilist('PL')
  end

  def bcount
    VcfNucleotideCount4.new(@alt,values[fetch('BCOUNT')])
  end

  def bq
    VcfAltInfoList.new(@alt,values[fetch('BQ')])
  end

  def amq
    VcfAltInfoList.new(@alt,values[fetch('AMQ')])
  end

  # Any other method name is treated as a field name (case-insensitive).
  # A trailing '?' asks whether the field has a value. Returns nil for an
  # empty sample; raises (see #fetch) when the field is not in the format.
  def method_missing(m, *args, &block)
    return nil if @is_empty
    name = m.to_s.upcase
    if name =~ /\?$/
      # query if a value exists, e.g., r.info.dp?
      v = values[fetch(name.chop)]
      v != nil
    else
      v = values[fetch(name)]
      # One branch per type: once converted, the value is no longer a
      # String and must not be matched against the next pattern.
      if v =~ /^\d+$/
        v.to_i
      elsif v =~ /^\d+\.\d+$/
        v.to_f
      else
        v
      end
    end
  end

  # Keep respond_to? in line with method_missing: only names of fields that
  # are in the format are reported as available.
  def respond_to_missing?(m, include_private = false)
    key = m.to_s.upcase.sub(/\?\z/, '')
    (@format and @format[key]) ? true : super
  end

  private

  # Position of field +name+ in the values; raises when it is not defined.
  def fetch name
    raise "ERROR: Field with name #{name} does not exist!" if !@format[name]
    @format[name]
  end

  # The comma-separated field +name+ as a list of integers, or nil when the
  # sample has no value for it.
  def ilist name
    v = values[fetch(name)]
    return nil if not v
    v.split(',').map{|i| i.to_i}
  end

end
|
128
169
|
|
@@ -134,17 +175,21 @@ module BioVcf
|
|
134
175
|
@header = header
|
135
176
|
@alt = alt
|
136
177
|
@samples = {} # lazy cache
|
137
|
-
@
|
138
|
-
@header.samples.each_with_index { |k,i| @index[k] = i+9 ; @index[k.downcase] = i+9 }
|
178
|
+
@sample_index = @header.sample_index()
|
139
179
|
end
|
140
180
|
|
141
181
|
# Return the genotype field of sample +name+, building it on first use and
# keeping it in the @samples cache afterwards.
def [] name
  cached = @samples[name]
  return cached if cached
  @samples[name] = VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
end
|
144
184
|
|
145
185
|
# Treat an unknown method name as a sample name. With a trailing '?' the
# answer is whether that sample's raw column is non-empty; otherwise the
# (cached) genotype field of the sample is returned.
def method_missing(m, *args, &block)
  name = m.to_s
  unless name =~ /\?$/
    return @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@alt)
  end
  # test for valid sample
  column = @fields[@sample_index[name.chop]]
  !VcfSample::empty?(column)
end
|
149
194
|
|
150
195
|
end
|
data/lib/bio-vcf/vcfheader.rb
CHANGED
@@ -42,6 +42,14 @@ module BioVcf
|
|
42
42
|
def samples
|
43
43
|
@samples ||= column_names[9..-1]
|
44
44
|
end
|
45
|
+
|
46
|
+
# Map every sample name, and its lower-case form, to its column number
# (position in the sample list plus 9). Built once and memoized.
def sample_index
  @sample_index ||= begin
    lookup = {}
    samples.each_with_index do |name, position|
      column = position + 9
      lookup[name] = column
      lookup[name.downcase] = column
    end
    lookup
  end
end
|
45
53
|
end
|
46
54
|
|
47
55
|
end
|
data/lib/bio-vcf/vcfline.rb
CHANGED
@@ -4,6 +4,7 @@ module BioVcf
|
|
4
4
|
# Split a tab-delimited line into fields. Raises when the line is blank or
# has fewer than 6 fields, and - when expected_size is given - when the
# number of fields differs from it. Returns the list of fields.
def VcfLine.parse line,expected_size=nil
  stripped = line.strip
  fields = stripped.split(/\t/)
  raise "Unexpected line #{line}" if stripped.size == 0 or fields.size < 6
  if expected_size and fields.size != expected_size
    raise "Expected #{expected_size} fields but got #{fields.size} in "+fields.to_s
  end
  fields
end
|
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -1,26 +1,72 @@
|
|
1
1
|
module BioVcf
|
2
2
|
|
3
|
-
class VcfRecordInfo
|
3
|
+
# Lazy view on the INFO text of a record ("KEY=value;FLAG;KEY2=value2").
# The text is only split into a Hash when there is no other option (a
# value is set, a flag is queried, or a direct text match fails).
class VcfRecordInfo
  def initialize s
    @info = s
  end

  # The INFO text. After a split it is rebuilt from the stored fields, using
  # the original spelling of every key; fields without value print as key.
  def to_s
    return @info unless @h
    # Interpolation, so values set through []= need not be Strings
    @h.map { |k,v| (v ? "#{@original_key[k]}=#{v}" : @original_key[k]) }.join(';')
  end

  # Set INFO fields (used by --rewrite)
  def []= k, v
    split_fields if not @h
    kupper = k.upcase
    @h[kupper] = v
    @original_key[kupper] = k
  end

  # Any method name is read as an INFO key (case-insensitive). Returns the
  # value converted by ConvertStringToValue::convert, or nil when absent.
  # A trailing '?' asks whether the key is present (true/false), the same
  # convention the genotype fields use.
  def method_missing(m, *args, &block)
    name = m.to_s.upcase
    if name =~ /\?\z/
      split_fields if not @h
      return @h.key?(name.chop)
    end
    v = if @h
          @h[name]
        else
          # Fast path on the raw text. The key is escaped and must start the
          # text or follow a ';' - otherwise DP would also match inside ADP=.
          found = @info.match(/(?:\A|;)#{Regexp.escape(name)}=([^;]+)/)
          if found
            found[1]
          else
            split_fields # no option but to split
            @h[name]
          end
        end
    ConvertStringToValue::convert(v)
  end

  private

  # Build the upper-cased key => value Hash plus the table of original keys
  def split_fields
    @h = {}
    @original_key = {}
    @info.split(/;/).each do |f|
      next if f.empty? # tolerate ';;'
      # limit 2: keep any further '=' as part of the value
      k,v = f.split(/=/, 2)
      kupper = k.upcase
      @h[kupper] = v
      @original_key[kupper] = k
    end
  end
end
|
17
57
|
|
18
58
|
module VcfRecordParser
|
19
59
|
# Parse the format field into a Hash
|
20
60
|
# Turn the FORMAT text ("GT:AD:DP") into a Hash of field name => position.
# The result for the most recently seen text is kept in two globals and
# returned as is when the same text comes in again.
def VcfRecordParser.get_format s
  return $cached_sample_format if s==$cached_sample_format_s
  positions = {}
  s.split(/:/).each_with_index { |field,i| positions[field] = i }
  $cached_sample_format = positions
  $cached_sample_format_s = s
  positions
end
|
25
71
|
def VcfRecordParser.get_info s
|
26
72
|
VcfRecordInfo.new(s)
|
@@ -36,6 +82,15 @@ module BioVcf
|
|
36
82
|
['A','C','G','T'][index()]
|
37
83
|
end
|
38
84
|
|
85
|
+
# Translate a GT index: 0 gives REF, any other index picks from ALT
# (index 1 is the first ALT entry)
def get_gt(index)
  return ref() if index == 0
  alt[index-1]
end
|
93
|
+
|
39
94
|
def call_tumor_count
|
40
95
|
tumor.bcount.to_ary[index()]
|
41
96
|
end
|
@@ -63,11 +118,13 @@ module BioVcf
|
|
63
118
|
@fields = fields
|
64
119
|
@header = header
|
65
120
|
end
|
66
|
-
|
121
|
+
|
67
122
|
def chrom
|
68
123
|
@fields[0]
|
69
124
|
end
|
70
125
|
|
126
|
+
alias :chr :chrom
|
127
|
+
|
71
128
|
def pos
|
72
129
|
@pos ||= @fields[1].to_i
|
73
130
|
end
|
@@ -114,5 +171,76 @@ module BioVcf
|
|
114
171
|
def sample
|
115
172
|
@sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
|
116
173
|
end
|
174
|
+
|
175
|
+
def sample_by_name name
|
176
|
+
sample[name]
|
177
|
+
end
|
178
|
+
|
179
|
+
# Yield a VcfSample::Sample for every sample column named in the header.
#
# list:: optional selection; when given, only samples whose position
#        (0-based, as a String such as "0") occurs in it are yielded
#
# Raises when the header has no sample columns.
def each_sample(list = nil)
  names = @header.column_names[9..-1]
  raise "Empty sample list, can not execute query!" if not names
  names.each_with_index do |name, i|
    selected = (!list or list.index(i.to_s))
    yield VcfSample::Sample.new(self,sample[name]) if selected
  end
end
|
188
|
+
|
189
|
+
# True when at least one sample column (column 10 onwards) of this record
# is empty according to VcfSample::empty?.
#
# A record that stops before the sample columns has no samples to miss, so
# the answer is false (the slice is nil in that case and used to crash).
def missing_samples?
  columns = @fields[9..-1] || []
  columns.any? { |raw| VcfSample::empty?(raw) }
end
|
195
|
+
|
196
|
+
def valid?
|
197
|
+
@fields.size == @header.column_names.size
|
198
|
+
end
|
199
|
+
|
200
|
+
# Evaluate the Ruby expression +expr+ against this record.
#
# Inside the expression +rec+ and +r+ are this record and +fields+ is the
# raw field list. An Array result is joined with tabs; anything else is
# returned as is.
#
# expr::                Ruby source text. NOTE(review): it is compiled and
#                       executed verbatim, so it must come from a trusted
#                       source (the command line), never from file content.
# ignore_missing_data:: when true a NoMethodError raised by the expression
#                       makes this method return false instead of raising
# quiet::               when true nothing is written to stderr
#
# Each distinct expression is compiled once into a method on the class and
# reused for all records. (Before, only the very first expression was ever
# compiled and every later one silently ran that first expression.)
def eval expr, ignore_missing_data, quiet
  begin
    compiled = self.class.instance_variable_get(:@cached_eval_methods)
    compiled = self.class.instance_variable_set(:@cached_eval_methods, {}) if not compiled
    method_name = compiled[expr]
    if not method_name
      method_name = "call_cached_eval_#{compiled.size}"
      code = "def #{method_name}(rec,fields)\n  r = rec\n  #{expr}\nend\n"
      self.class.class_eval(code)
      # only remember the name once the code compiled
      compiled[expr] = method_name
    end
    res = send(method_name,self,@fields)
    if res.kind_of?(Array)
      res.join("\t")
    else
      res
    end
  rescue NoMethodError => e
    if not quiet
      $stderr.print "RECORD ERROR!\n"
      $stderr.print [@fields],"\n"
      $stderr.print expr,"\n"
    end
    if ignore_missing_data
      $stderr.print e.message if not quiet
      return false
    else
      raise
    end
  end
end
|
232
|
+
|
233
|
+
# Treat an unknown method name as a sample name and return that sample.
# With a trailing '?' the answer is whether the sample's raw column is
# non-empty.
def method_missing(m, *args, &block)
  name = m.to_s
  return sample[name] unless name =~ /\?$/
  # Query for empty sample name
  @sample_index ||= @header.sample_index
  column = @fields[@sample_index[name.chop]]
  !VcfSample::empty?(column)
end
|
244
|
+
|
117
245
|
end
|
118
246
|
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module BioVcf
  module VcfSample

    # Check whether a sample is empty (on the raw string value). nil, blank
    # text and './.' (surrounding whitespace ignored) count as empty.
    def VcfSample::empty? raw_sample
      # nil first: it can not be stripped
      return true if raw_sample == nil
      s = raw_sample.strip
      s == './.' or s == ''
    end

    # A sample of a record, wrapping the record and its genotype field, e.g.
    # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
    class Sample
      # rec::    the record the sample belongs to (used for #gts and as +r+
      #          in evaluated expressions)
      # sample:: object answering #format (field name => position) and
      #          #values (the split sample values)
      def initialize rec,sample
        @rec = rec
        @sample = sample
        @format = @sample.format
        @values = @sample.values
      end

      # Evaluate the Ruby expression +expr+ against this sample. Inside the
      # expression +rec+ and +r+ are the record, +sample+ and +s+ are this
      # object. Returns the value of the expression.
      #
      # expr::                Ruby source text. NOTE(review): compiled and
      #                       executed verbatim - trusted input only.
      # ignore_missing_data:: when true a NoMethodError raised by the
      #                       expression gives false instead of raising
      # quiet::               when true the field dump and the expression
      #                       are not written to stderr
      #
      # Each distinct expression is compiled once (see #cached_eval_method).
      def eval expr, ignore_missing_data, quiet
        begin
          send(cached_eval_method(expr),@rec,self)
        rescue NoMethodError => e
          # Test the sample text itself: the printed form of the values
          # Array (["./."]) never equals './.'
          empty = VcfSample::empty?(@values.join(':'))
          $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty
          if not quiet
            $stderr.print [@format,@values],"\n"
            $stderr.print expr,"\n"
          end
          if ignore_missing_data
            $stderr.print e.message if not quiet and not empty
            return false
          else
            raise
          end
        end
      end

      # Split GT into index values. Both the unphased ('/') and the phased
      # ('|') separator are recognised; a '.' allele becomes nil.
      def gti
        fetch_values("GT").split(/[\/|]/).map{ |allele| (allele=='.' ? nil : allele.to_i) }
      end

      # Split GT into a nucleotide sequence (nil for a '.' allele)
      def gts
        gti.map { |i| (i ? @rec.get_gt(i) : nil) }
      end

      # Define +name+ as a method on the class, so later calls skip
      # method_missing.
      def cache_method(name, &block)
        self.class.send(:define_method, name, &block)
      end

      # A method named after a field in the format (case-insensitive)
      # returns the converted value of that field; the accessor is defined
      # on first use. Other names go to the default handling.
      def method_missing(m, *args, &block)
        name = m.to_s.upcase
        if @format[name]
          cache_method(m) {
            ConvertStringToValue::convert(fetch_values(name))
          }
          self.send(m)
        else
          super(m, *args, &block)
        end
      end

      # Keep respond_to? in line with method_missing
      def respond_to_missing?(m, include_private = false)
        @format[m.to_s.upcase] ? true : super
      end

      private

      # Raw value of field +name+; raises when the format does not have it
      def fetch_values name
        n = @format[name]
        raise "Unknown sample field <#{name}>" if not n
        @values[n]  # <-- save names with upcase!
      end

      # Name of the method compiled for +expr+. The table lives on the class,
      # one entry per distinct expression, so a second expression no longer
      # re-runs the code of the first.
      def cached_eval_method expr
        klass = self.class
        compiled = klass.instance_variable_get(:@cached_eval_methods)
        compiled = klass.instance_variable_set(:@cached_eval_methods, {}) if not compiled
        name = compiled[expr]
        if not name
          name = "call_cached_eval_#{compiled.size}"
          klass.class_eval("def #{name}(rec,sample)\n  r = rec\n  s = sample\n  #{expr}\nend\n")
          # only remember the name once the code compiled
          compiled[expr] = name
        end
        name
      end

    end

  end
end
|