bio-vcf 0.7.0 → 0.7.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +3 -2
- data/Gemfile +2 -5
- data/Gemfile.lock +3 -3
- data/README.md +101 -23
- data/Rakefile +4 -2
- data/VERSION +1 -1
- data/bin/bio-vcf +133 -73
- data/bio-vcf.gemspec +13 -10
- data/features/cli.feature +9 -1
- data/features/multisample.feature +4 -4
- data/features/sfilter.feature +1 -1
- data/features/step_definitions/cli-feature.rb +4 -0
- data/features/step_definitions/multisample.rb +24 -12
- data/features/step_definitions/sfilter.rb +80 -31
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +45 -9
- data/lib/bio-vcf/vcfheader.rb +1 -1
- data/lib/bio-vcf/vcfrecord.rb +14 -8
- data/lib/bio-vcf/vcfsample.rb +101 -152
- data/lib/bio-vcf/vcfstatistics.rb +28 -0
- data/test/data/regression/ifilter_s.dp.ref +31 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -0
- metadata +16 -12
data/lib/bio-vcf/vcfheader.rb
CHANGED
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -159,19 +159,24 @@ module BioVcf
|
|
159
159
|
@format ||= VcfRecordParser.get_format(@fields[8])
|
160
160
|
end
|
161
161
|
|
162
|
+
# Return the first (single) sample (used in one sample VCF)
|
163
|
+
def first
|
164
|
+
@first ||= VcfGenotypeField.new(@fields[9],format,@header,ref,alt)
|
165
|
+
end
|
166
|
+
|
162
167
|
# Return the normal sample (used in two sample VCF)
|
163
168
|
def normal
|
164
|
-
|
169
|
+
first
|
165
170
|
end
|
166
171
|
|
167
172
|
# Return the tumor sample (used in two sample VCF)
|
168
173
|
def tumor
|
169
|
-
@tumor ||= VcfGenotypeField.new(@fields[10],format,@header,alt)
|
174
|
+
@tumor ||= VcfGenotypeField.new(@fields[10],format,@header,ref,alt)
|
170
175
|
end
|
171
176
|
|
172
177
|
# Return the sample as a named hash
|
173
178
|
def sample
|
174
|
-
@sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
|
179
|
+
@sample ||= VcfGenotypeFields.new(@fields,format,@header,ref,alt)
|
175
180
|
end
|
176
181
|
|
177
182
|
def sample_by_name name
|
@@ -179,14 +184,15 @@ module BioVcf
|
|
179
184
|
end
|
180
185
|
|
181
186
|
def sample_by_index i
|
182
|
-
# p
|
183
|
-
|
187
|
+
# p @fields
|
188
|
+
raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
|
189
|
+
@sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,ref,alt)
|
184
190
|
end
|
185
191
|
|
186
192
|
# Walk the samples. list contains an Array of int (the index)
|
187
193
|
def each_sample(list = nil)
|
188
194
|
list = @header.samples_index_array() if not list
|
189
|
-
list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i)) }
|
195
|
+
list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i.to_i)) }
|
190
196
|
end
|
191
197
|
|
192
198
|
def missing_samples?
|
@@ -200,7 +206,7 @@ module BioVcf
|
|
200
206
|
@fields.size == @header.column_names.size
|
201
207
|
end
|
202
208
|
|
203
|
-
def eval expr, ignore_missing_data, quiet
|
209
|
+
def eval expr, ignore_missing_data: true, quiet: false
|
204
210
|
begin
|
205
211
|
if not respond_to?(:call_cached_eval)
|
206
212
|
code =
|
@@ -233,7 +239,7 @@ module BioVcf
|
|
233
239
|
end
|
234
240
|
end
|
235
241
|
|
236
|
-
def filter expr, ignore_missing_data, quiet
|
242
|
+
def filter expr, ignore_missing_data: true, quiet: false
|
237
243
|
begin
|
238
244
|
if not respond_to?(:call_cached_filter)
|
239
245
|
code =
|
data/lib/bio-vcf/vcfsample.rb
CHANGED
@@ -1,177 +1,126 @@
|
|
1
1
|
module BioVcf
|
2
2
|
module VcfSample
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
end
|
8
|
-
|
9
|
-
class Sample
|
10
|
-
# #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
|
11
|
-
def initialize rec,sample
|
12
|
-
@rec = rec
|
13
|
-
@sample = sample
|
14
|
-
@format = @sample.format
|
15
|
-
@values = @sample.values
|
4
|
+
# Check whether a sample is empty (on the raw string value)
|
5
|
+
def VcfSample::empty? s
|
6
|
+
s == './.' or s == '' or s == nil
|
16
7
|
end
|
17
8
|
|
18
|
-
|
19
|
-
|
20
|
-
|
9
|
+
class Sample
|
10
|
+
# Initialize sample with rec and genotypefield
|
11
|
+
#
|
12
|
+
# #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
|
13
|
+
def initialize rec,genotypefield
|
14
|
+
@rec = rec
|
15
|
+
@sample = genotypefield
|
16
|
+
@format = @sample.format
|
17
|
+
@values = @sample.values
|
18
|
+
end
|
21
19
|
|
22
|
-
|
23
|
-
|
24
|
-
if not respond_to?(:call_cached_eval)
|
25
|
-
code =
|
26
|
-
"""
|
27
|
-
def call_cached_eval(rec,sample)
|
28
|
-
r = rec
|
29
|
-
s = sample
|
30
|
-
#{expr}
|
31
|
-
end
|
32
|
-
"""
|
33
|
-
self.class.class_eval(code)
|
34
|
-
end
|
35
|
-
call_cached_eval(@rec,self)
|
36
|
-
rescue NoMethodError => e
|
37
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
38
|
-
if not quiet
|
39
|
-
$stderr.print [@format,@values],"\n"
|
40
|
-
$stderr.print expr,"\n"
|
41
|
-
end
|
42
|
-
if ignore_missing_data
|
43
|
-
$stderr.print e.message if not quiet and not empty?
|
44
|
-
return false
|
45
|
-
else
|
46
|
-
raise
|
47
|
-
end
|
20
|
+
def empty?
|
21
|
+
cache_empty ||= VcfSample::empty?(@sample.to_s)
|
48
22
|
end
|
49
|
-
end
|
50
23
|
|
51
|
-
|
52
|
-
|
53
|
-
if not respond_to?(:call_cached_sfilter)
|
54
|
-
code =
|
55
|
-
"""
|
56
|
-
def call_cached_sfilter(rec,sample)
|
57
|
-
r = rec
|
58
|
-
s = sample
|
59
|
-
#{expr}
|
60
|
-
end
|
61
|
-
"""
|
62
|
-
self.class.class_eval(code)
|
63
|
-
end
|
64
|
-
call_cached_sfilter(@rec,self)
|
65
|
-
rescue NoMethodError => e
|
66
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
67
|
-
if not quiet
|
68
|
-
$stderr.print [@format,@values],"\n"
|
69
|
-
$stderr.print expr,"\n"
|
70
|
-
end
|
71
|
-
if ignore_missing_data
|
72
|
-
$stderr.print e.message if not quiet and not empty?
|
73
|
-
return false
|
74
|
-
else
|
75
|
-
raise
|
76
|
-
end
|
24
|
+
def eval expr, ignore_missing_data: false, quiet: false, do_cache: true
|
25
|
+
caching_eval :eval, :call_cached_eval, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
77
26
|
end
|
78
|
-
end
|
79
27
|
|
80
|
-
|
81
|
-
|
82
|
-
if not respond_to?(:call_cached_ifilter)
|
83
|
-
code =
|
84
|
-
"""
|
85
|
-
def call_cached_ifilter(rec,sample)
|
86
|
-
r = rec
|
87
|
-
s = sample
|
88
|
-
#{expr}
|
89
|
-
end
|
90
|
-
"""
|
91
|
-
self.class.class_eval(code)
|
92
|
-
end
|
93
|
-
call_cached_ifilter(@rec,self)
|
94
|
-
rescue NoMethodError => e
|
95
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
96
|
-
if not quiet
|
97
|
-
$stderr.print [@format,@values],"\n"
|
98
|
-
$stderr.print expr,"\n"
|
99
|
-
end
|
100
|
-
if ignore_missing_data
|
101
|
-
$stderr.print e.message if not quiet and not empty?
|
102
|
-
return false
|
103
|
-
else
|
104
|
-
raise
|
105
|
-
end
|
28
|
+
def sfilter expr, ignore_missing_data: false, quiet: true, do_cache: true
|
29
|
+
caching_eval :sfilter, :call_cached_sfilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
106
30
|
end
|
107
|
-
end
|
108
31
|
|
109
|
-
|
110
|
-
|
111
|
-
if not respond_to?(:call_cached_efilter)
|
112
|
-
code =
|
113
|
-
"""
|
114
|
-
def call_cached_efilter(rec,sample)
|
115
|
-
r = rec
|
116
|
-
s = sample
|
117
|
-
#{expr}
|
118
|
-
end
|
119
|
-
"""
|
120
|
-
self.class.class_eval(code)
|
121
|
-
end
|
122
|
-
call_cached_efilter(@rec,self)
|
123
|
-
rescue NoMethodError => e
|
124
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
125
|
-
if not quiet
|
126
|
-
$stderr.print [@format,@values],"\n"
|
127
|
-
$stderr.print expr,"\n"
|
128
|
-
end
|
129
|
-
if ignore_missing_data
|
130
|
-
$stderr.print e.message if not quiet and not empty?
|
131
|
-
return false
|
132
|
-
else
|
133
|
-
raise
|
134
|
-
end
|
32
|
+
def ifilter expr, ignore_missing_data: false, quiet: false, do_cache: true
|
33
|
+
caching_eval :ifilter, :call_cached_ifilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
135
34
|
end
|
136
|
-
end
|
137
35
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
|
142
|
-
end
|
36
|
+
def efilter expr, ignore_missing_data: false, quiet: false, do_cache: true
|
37
|
+
caching_eval :efilter, :call_cached_efilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
38
|
+
end
|
143
39
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
40
|
+
# Split GT into index values
|
41
|
+
def gti
|
42
|
+
v = fetch_values("GT")
|
43
|
+
v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
|
44
|
+
end
|
148
45
|
|
149
|
-
|
150
|
-
|
151
|
-
|
46
|
+
# Split GT into a nucleotide sequence
|
47
|
+
def gts
|
48
|
+
gti.map { |i| (i ? @rec.get_gt(i) : nil) }
|
49
|
+
end
|
152
50
|
|
153
|
-
|
154
|
-
|
155
|
-
if @format[name]
|
156
|
-
cache_method(m) {
|
157
|
-
ConvertStringToValue::convert(fetch_values(name))
|
158
|
-
}
|
159
|
-
self.send(m)
|
160
|
-
else
|
161
|
-
super(m, *args, &block)
|
51
|
+
def cache_method(name, &block)
|
52
|
+
self.class.send(:define_method, name, &block)
|
162
53
|
end
|
163
|
-
|
54
|
+
|
55
|
+
def method_missing(m, *args, &block)
|
56
|
+
name = m.to_s.upcase
|
57
|
+
# p [:here,name,m ,@values]
|
58
|
+
# p [:respond_to_call_cached_eval,respond_to?(:call_cached_eval)]
|
59
|
+
if name =~ /\?$/
|
60
|
+
# test for valid field
|
61
|
+
return !VcfValue::empty?(fetch_values(name.chop))
|
62
|
+
else
|
63
|
+
if @format[name]
|
64
|
+
cache_method(m) {
|
65
|
+
v = fetch_values(name)
|
66
|
+
return nil if VcfValue::empty?(v)
|
67
|
+
ConvertStringToValue::convert(v)
|
68
|
+
}
|
69
|
+
self.send(m)
|
70
|
+
else
|
71
|
+
super(m, *args, &block)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
164
75
|
|
165
|
-
private
|
76
|
+
private
|
166
77
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
78
|
+
def fetch_values name
|
79
|
+
n = @format[name]
|
80
|
+
raise "Unknown sample field <#{name}>" if not n
|
81
|
+
@values[n] # <-- save names with upcase!
|
82
|
+
end
|
172
83
|
|
173
|
-
|
84
|
+
def caching_eval method, cached_method, expr, ignore_missing_data: false, quiet: false, do_cache: true
|
85
|
+
begin
|
86
|
+
if do_cache
|
87
|
+
if not respond_to?(cached_method)
|
88
|
+
code =
|
89
|
+
"""
|
90
|
+
def #{cached_method}(rec,sample)
|
91
|
+
r = rec
|
92
|
+
s = sample
|
93
|
+
#{expr}
|
94
|
+
end
|
95
|
+
"""
|
96
|
+
self.class.class_eval(code)
|
97
|
+
end
|
98
|
+
self.send(cached_method,@rec,self)
|
99
|
+
else
|
100
|
+
# This is used for testing mostly
|
101
|
+
print "WARNING: NOT CACHING #{method}\n"
|
102
|
+
self.class.class_eval { undef :call_cached_eval } if respond_to?(:call_cached_eval)
|
103
|
+
self.class.class_eval { undef :call_cached_sfilter } if respond_to?(:call_cached_sfilter)
|
104
|
+
r = @rec
|
105
|
+
s = @sample
|
106
|
+
eval(expr)
|
107
|
+
end
|
108
|
+
rescue NoMethodError => e
|
109
|
+
$stderr.print "\n#{method} trying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
110
|
+
if not quiet
|
111
|
+
$stderr.print [:format,@format,:sample,@values],"\n"
|
112
|
+
$stderr.print [:filter,expr],"\n"
|
113
|
+
end
|
114
|
+
if ignore_missing_data
|
115
|
+
$stderr.print e.message if not quiet and not empty?
|
116
|
+
return false
|
117
|
+
else
|
118
|
+
raise NoMethodError.new(e.message + ". Can not evaluate empty sample data by default: test for s.empty? or use the -i switch!")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
174
122
|
|
123
|
+
end
|
175
124
|
|
176
125
|
end
|
177
126
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module BioVcf
|
2
|
+
|
3
|
+
class VcfStatistics
|
4
|
+
|
5
|
+
def initialize
|
6
|
+
@count = 0
|
7
|
+
@ref_alt_count = {}
|
8
|
+
end
|
9
|
+
|
10
|
+
def add rec
|
11
|
+
@count += 1
|
12
|
+
s = rec.ref+">"+rec.alt[0]
|
13
|
+
@ref_alt_count[s] ||= 0
|
14
|
+
@ref_alt_count[s] += 1
|
15
|
+
end
|
16
|
+
|
17
|
+
def print
|
18
|
+
puts "## ==== Statistics =================================="
|
19
|
+
@ref_alt_count.sort_by {|k,v| v}.reverse.each do |k,v|
|
20
|
+
printf k+"\t%d\t%2.0d%%\n",v,(v.to_f/@count*100).round
|
21
|
+
end
|
22
|
+
puts "Total\t#{@count}"
|
23
|
+
puts "## =================================================="
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
@@ -0,0 +1,31 @@
|
|
1
|
+
1 10257 159 242 249 249 186 212 218
|
2
|
+
1 10291 165 249 249 247 161 163 189
|
3
|
+
1 10297 182 246 250 246 165 158 183
|
4
|
+
1 10303 198 247 248 248 172 157 182
|
5
|
+
1 10315 212 246 242 245 190 157 189
|
6
|
+
1 10321 218 246 248 248 193 164 196
|
7
|
+
1 10327 237 238 229 237 209 183 210
|
8
|
+
1 12783 58 164 144 182 126 103 158
|
9
|
+
1 13116 32 131 102 152 104 88 109
|
10
|
+
1 13118 34 129 101 145 99 85 108
|
11
|
+
1 13178 52 172 137 172 129 119 148
|
12
|
+
1 13302 36 136 99 146 90 65 117
|
13
|
+
1 13757 53 201 181 250 152 130 182
|
14
|
+
1 13868 75 192 182 224 142 111 167
|
15
|
+
1 13896 62 135 143 175 112 81 121
|
16
|
+
1 14354 43 158 115 145 72 119
|
17
|
+
1 14464 51 155 141 150 83 89 140
|
18
|
+
1 14673 36 142 117 157 95 76 131
|
19
|
+
1 14699 43 128 109 147 98 78 114
|
20
|
+
1 14907 57 216 162 205 153 118 158
|
21
|
+
1 14930 68 216 170 210 136 125 164
|
22
|
+
1 14933 68 216 169 212 132 128 164
|
23
|
+
1 14948 63 192 181 211 129 121 153
|
24
|
+
1 14976 56 166 161 196 109 116 135
|
25
|
+
1 15118 46 198 129 230 113 126 158
|
26
|
+
1 15190 53 208 170 200 126 145 179
|
27
|
+
1 15211 54 183 161 171 120 134 168
|
28
|
+
1 15274 37 121 102 137 71 67 98
|
29
|
+
1 15447 46 242 183 226 137 173 175
|
30
|
+
1 15688 37 182 147 184 100 101 148
|
31
|
+
1 16103 50 79 86 106 60 61 84
|
@@ -0,0 +1 @@
|
|
1
|
+
unexpected return
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-vcf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.7.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-09-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -42,31 +42,32 @@ dependencies:
|
|
42
42
|
name: jeweler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 2.0.1
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 2.0.1
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: regressiontest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: 0.0.3
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
69
|
-
description: Smart parser for VCF format
|
68
|
+
version: 0.0.3
|
69
|
+
description: Smart lazy multi-threaded parser for VCF format with useful filtering
|
70
|
+
and output rewriting
|
70
71
|
email: pjotr.public01@thebird.nl
|
71
72
|
executables:
|
72
73
|
- bio-vcf
|
@@ -106,10 +107,12 @@ files:
|
|
106
107
|
- lib/bio-vcf/vcfrdf.rb
|
107
108
|
- lib/bio-vcf/vcfrecord.rb
|
108
109
|
- lib/bio-vcf/vcfsample.rb
|
110
|
+
- lib/bio-vcf/vcfstatistics.rb
|
109
111
|
- test/data/input/dbsnp.vcf
|
110
112
|
- test/data/input/multisample.vcf
|
111
113
|
- test/data/input/somaticsniper.vcf
|
112
114
|
- test/data/regression/eval_r.info.dp.ref
|
115
|
+
- test/data/regression/ifilter_s.dp.ref
|
113
116
|
- test/data/regression/r.info.dp.ref
|
114
117
|
- test/data/regression/rewrite.info.sample.ref
|
115
118
|
- test/data/regression/s.dp.ref
|
@@ -117,6 +120,7 @@ files:
|
|
117
120
|
- test/data/regression/sfilter_seval_s.dp.ref
|
118
121
|
- test/data/regression/thread4.ref
|
119
122
|
- test/data/regression/thread4_4.ref
|
123
|
+
- test/data/regression/thread4_4_failed_filter-stderr.ref
|
120
124
|
- test/performance/metrics.md
|
121
125
|
homepage: http://github.com/pjotrp/bioruby-vcf
|
122
126
|
licenses:
|
@@ -141,5 +145,5 @@ rubyforge_project:
|
|
141
145
|
rubygems_version: 2.0.3
|
142
146
|
signing_key:
|
143
147
|
specification_version: 4
|
144
|
-
summary: VCF parser
|
148
|
+
summary: Fast multi-threaded VCF parser
|
145
149
|
test_files: []
|