bio-vcf 0.7.0 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -2
- data/Gemfile +2 -5
- data/Gemfile.lock +3 -3
- data/README.md +101 -23
- data/Rakefile +4 -2
- data/VERSION +1 -1
- data/bin/bio-vcf +133 -73
- data/bio-vcf.gemspec +13 -10
- data/features/cli.feature +9 -1
- data/features/multisample.feature +4 -4
- data/features/sfilter.feature +1 -1
- data/features/step_definitions/cli-feature.rb +4 -0
- data/features/step_definitions/multisample.rb +24 -12
- data/features/step_definitions/sfilter.rb +80 -31
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +45 -9
- data/lib/bio-vcf/vcfheader.rb +1 -1
- data/lib/bio-vcf/vcfrecord.rb +14 -8
- data/lib/bio-vcf/vcfsample.rb +101 -152
- data/lib/bio-vcf/vcfstatistics.rb +28 -0
- data/test/data/regression/ifilter_s.dp.ref +31 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -0
- metadata +16 -12
data/lib/bio-vcf/vcfheader.rb
CHANGED
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -159,19 +159,24 @@ module BioVcf
|
|
159
159
|
@format ||= VcfRecordParser.get_format(@fields[8])
|
160
160
|
end
|
161
161
|
|
162
|
+
# Return the first (single) sample (used in one sample VCF)
|
163
|
+
def first
|
164
|
+
@first ||= VcfGenotypeField.new(@fields[9],format,@header,ref,alt)
|
165
|
+
end
|
166
|
+
|
162
167
|
# Return the normal sample (used in two sample VCF)
|
163
168
|
def normal
|
164
|
-
|
169
|
+
first
|
165
170
|
end
|
166
171
|
|
167
172
|
# Return the tumor sample (used in two sample VCF)
|
168
173
|
def tumor
|
169
|
-
@tumor ||= VcfGenotypeField.new(@fields[10],format,@header,alt)
|
174
|
+
@tumor ||= VcfGenotypeField.new(@fields[10],format,@header,ref,alt)
|
170
175
|
end
|
171
176
|
|
172
177
|
# Return the sample as a named hash
|
173
178
|
def sample
|
174
|
-
@sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
|
179
|
+
@sample ||= VcfGenotypeFields.new(@fields,format,@header,ref,alt)
|
175
180
|
end
|
176
181
|
|
177
182
|
def sample_by_name name
|
@@ -179,14 +184,15 @@ module BioVcf
|
|
179
184
|
end
|
180
185
|
|
181
186
|
def sample_by_index i
|
182
|
-
# p
|
183
|
-
|
187
|
+
# p @fields
|
188
|
+
raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
|
189
|
+
@sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,ref,alt)
|
184
190
|
end
|
185
191
|
|
186
192
|
# Walk the samples. list contains an Array of int (the index)
|
187
193
|
def each_sample(list = nil)
|
188
194
|
list = @header.samples_index_array() if not list
|
189
|
-
list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i)) }
|
195
|
+
list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i.to_i)) }
|
190
196
|
end
|
191
197
|
|
192
198
|
def missing_samples?
|
@@ -200,7 +206,7 @@ module BioVcf
|
|
200
206
|
@fields.size == @header.column_names.size
|
201
207
|
end
|
202
208
|
|
203
|
-
def eval expr, ignore_missing_data, quiet
|
209
|
+
def eval expr, ignore_missing_data: true, quiet: false
|
204
210
|
begin
|
205
211
|
if not respond_to?(:call_cached_eval)
|
206
212
|
code =
|
@@ -233,7 +239,7 @@ module BioVcf
|
|
233
239
|
end
|
234
240
|
end
|
235
241
|
|
236
|
-
def filter expr, ignore_missing_data, quiet
|
242
|
+
def filter expr, ignore_missing_data: true, quiet: false
|
237
243
|
begin
|
238
244
|
if not respond_to?(:call_cached_filter)
|
239
245
|
code =
|
data/lib/bio-vcf/vcfsample.rb
CHANGED
@@ -1,177 +1,126 @@
|
|
1
1
|
module BioVcf
|
2
2
|
module VcfSample
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
end
|
8
|
-
|
9
|
-
class Sample
|
10
|
-
# #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
|
11
|
-
def initialize rec,sample
|
12
|
-
@rec = rec
|
13
|
-
@sample = sample
|
14
|
-
@format = @sample.format
|
15
|
-
@values = @sample.values
|
4
|
+
# Check whether a sample is empty (on the raw string value)
|
5
|
+
def VcfSample::empty? s
|
6
|
+
s == './.' or s == '' or s == nil
|
16
7
|
end
|
17
8
|
|
18
|
-
|
19
|
-
|
20
|
-
|
9
|
+
class Sample
|
10
|
+
# Initialized sample with rec and genotypefield
|
11
|
+
#
|
12
|
+
# #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
|
13
|
+
def initialize rec,genotypefield
|
14
|
+
@rec = rec
|
15
|
+
@sample = genotypefield
|
16
|
+
@format = @sample.format
|
17
|
+
@values = @sample.values
|
18
|
+
end
|
21
19
|
|
22
|
-
|
23
|
-
|
24
|
-
if not respond_to?(:call_cached_eval)
|
25
|
-
code =
|
26
|
-
"""
|
27
|
-
def call_cached_eval(rec,sample)
|
28
|
-
r = rec
|
29
|
-
s = sample
|
30
|
-
#{expr}
|
31
|
-
end
|
32
|
-
"""
|
33
|
-
self.class.class_eval(code)
|
34
|
-
end
|
35
|
-
call_cached_eval(@rec,self)
|
36
|
-
rescue NoMethodError => e
|
37
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
38
|
-
if not quiet
|
39
|
-
$stderr.print [@format,@values],"\n"
|
40
|
-
$stderr.print expr,"\n"
|
41
|
-
end
|
42
|
-
if ignore_missing_data
|
43
|
-
$stderr.print e.message if not quiet and not empty?
|
44
|
-
return false
|
45
|
-
else
|
46
|
-
raise
|
47
|
-
end
|
20
|
+
def empty?
|
21
|
+
cache_empty ||= VcfSample::empty?(@sample.to_s)
|
48
22
|
end
|
49
|
-
end
|
50
23
|
|
51
|
-
|
52
|
-
|
53
|
-
if not respond_to?(:call_cached_sfilter)
|
54
|
-
code =
|
55
|
-
"""
|
56
|
-
def call_cached_sfilter(rec,sample)
|
57
|
-
r = rec
|
58
|
-
s = sample
|
59
|
-
#{expr}
|
60
|
-
end
|
61
|
-
"""
|
62
|
-
self.class.class_eval(code)
|
63
|
-
end
|
64
|
-
call_cached_sfilter(@rec,self)
|
65
|
-
rescue NoMethodError => e
|
66
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
67
|
-
if not quiet
|
68
|
-
$stderr.print [@format,@values],"\n"
|
69
|
-
$stderr.print expr,"\n"
|
70
|
-
end
|
71
|
-
if ignore_missing_data
|
72
|
-
$stderr.print e.message if not quiet and not empty?
|
73
|
-
return false
|
74
|
-
else
|
75
|
-
raise
|
76
|
-
end
|
24
|
+
def eval expr, ignore_missing_data: false, quiet: false, do_cache: true
|
25
|
+
caching_eval :eval, :call_cached_eval, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
77
26
|
end
|
78
|
-
end
|
79
27
|
|
80
|
-
|
81
|
-
|
82
|
-
if not respond_to?(:call_cached_ifilter)
|
83
|
-
code =
|
84
|
-
"""
|
85
|
-
def call_cached_ifilter(rec,sample)
|
86
|
-
r = rec
|
87
|
-
s = sample
|
88
|
-
#{expr}
|
89
|
-
end
|
90
|
-
"""
|
91
|
-
self.class.class_eval(code)
|
92
|
-
end
|
93
|
-
call_cached_ifilter(@rec,self)
|
94
|
-
rescue NoMethodError => e
|
95
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
96
|
-
if not quiet
|
97
|
-
$stderr.print [@format,@values],"\n"
|
98
|
-
$stderr.print expr,"\n"
|
99
|
-
end
|
100
|
-
if ignore_missing_data
|
101
|
-
$stderr.print e.message if not quiet and not empty?
|
102
|
-
return false
|
103
|
-
else
|
104
|
-
raise
|
105
|
-
end
|
28
|
+
def sfilter expr, ignore_missing_data: false, quiet: true, do_cache: true
|
29
|
+
caching_eval :sfilter, :call_cached_sfilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
106
30
|
end
|
107
|
-
end
|
108
31
|
|
109
|
-
|
110
|
-
|
111
|
-
if not respond_to?(:call_cached_efilter)
|
112
|
-
code =
|
113
|
-
"""
|
114
|
-
def call_cached_efilter(rec,sample)
|
115
|
-
r = rec
|
116
|
-
s = sample
|
117
|
-
#{expr}
|
118
|
-
end
|
119
|
-
"""
|
120
|
-
self.class.class_eval(code)
|
121
|
-
end
|
122
|
-
call_cached_efilter(@rec,self)
|
123
|
-
rescue NoMethodError => e
|
124
|
-
$stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
125
|
-
if not quiet
|
126
|
-
$stderr.print [@format,@values],"\n"
|
127
|
-
$stderr.print expr,"\n"
|
128
|
-
end
|
129
|
-
if ignore_missing_data
|
130
|
-
$stderr.print e.message if not quiet and not empty?
|
131
|
-
return false
|
132
|
-
else
|
133
|
-
raise
|
134
|
-
end
|
32
|
+
def ifilter expr, ignore_missing_data: false, quiet: false, do_cache: true
|
33
|
+
caching_eval :ifilter, :call_cached_ifilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
135
34
|
end
|
136
|
-
end
|
137
35
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
|
142
|
-
end
|
36
|
+
def efilter expr, ignore_missing_data: false, quiet: false, do_cache: true
|
37
|
+
caching_eval :efilter, :call_cached_efilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
|
38
|
+
end
|
143
39
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
40
|
+
# Split GT into index values
|
41
|
+
def gti
|
42
|
+
v = fetch_values("GT")
|
43
|
+
v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
|
44
|
+
end
|
148
45
|
|
149
|
-
|
150
|
-
|
151
|
-
|
46
|
+
# Split GT into into a nucleode sequence
|
47
|
+
def gts
|
48
|
+
gti.map { |i| (i ? @rec.get_gt(i) : nil) }
|
49
|
+
end
|
152
50
|
|
153
|
-
|
154
|
-
|
155
|
-
if @format[name]
|
156
|
-
cache_method(m) {
|
157
|
-
ConvertStringToValue::convert(fetch_values(name))
|
158
|
-
}
|
159
|
-
self.send(m)
|
160
|
-
else
|
161
|
-
super(m, *args, &block)
|
51
|
+
def cache_method(name, &block)
|
52
|
+
self.class.send(:define_method, name, &block)
|
162
53
|
end
|
163
|
-
|
54
|
+
|
55
|
+
def method_missing(m, *args, &block)
|
56
|
+
name = m.to_s.upcase
|
57
|
+
# p [:here,name,m ,@values]
|
58
|
+
# p [:respond_to_call_cached_eval,respond_to?(:call_cached_eval)]
|
59
|
+
if name =~ /\?$/
|
60
|
+
# test for valid field
|
61
|
+
return !VcfValue::empty?(fetch_values(name.chop))
|
62
|
+
else
|
63
|
+
if @format[name]
|
64
|
+
cache_method(m) {
|
65
|
+
v = fetch_values(name)
|
66
|
+
return nil if VcfValue::empty?(v)
|
67
|
+
ConvertStringToValue::convert(v)
|
68
|
+
}
|
69
|
+
self.send(m)
|
70
|
+
else
|
71
|
+
super(m, *args, &block)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
164
75
|
|
165
|
-
private
|
76
|
+
private
|
166
77
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
78
|
+
def fetch_values name
|
79
|
+
n = @format[name]
|
80
|
+
raise "Unknown sample field <#{name}>" if not n
|
81
|
+
@values[n] # <-- save names with upcase!
|
82
|
+
end
|
172
83
|
|
173
|
-
|
84
|
+
def caching_eval method, cached_method, expr, ignore_missing_data: false, quiet: false, do_cache: true
|
85
|
+
begin
|
86
|
+
if do_cache
|
87
|
+
if not respond_to?(cached_method)
|
88
|
+
code =
|
89
|
+
"""
|
90
|
+
def #{cached_method}(rec,sample)
|
91
|
+
r = rec
|
92
|
+
s = sample
|
93
|
+
#{expr}
|
94
|
+
end
|
95
|
+
"""
|
96
|
+
self.class.class_eval(code)
|
97
|
+
end
|
98
|
+
self.send(cached_method,@rec,self)
|
99
|
+
else
|
100
|
+
# This is used for testing mostly
|
101
|
+
print "WARNING: NOT CACHING #{method}\n"
|
102
|
+
self.class.class_eval { undef :call_cached_eval } if respond_to?(:call_cached_eval)
|
103
|
+
self.class.class_eval { undef :call_cached_sfilter } if respond_to?(:call_cached_sfilter)
|
104
|
+
r = @rec
|
105
|
+
s = @sample
|
106
|
+
eval(expr)
|
107
|
+
end
|
108
|
+
rescue NoMethodError => e
|
109
|
+
$stderr.print "\n#{method} trying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
|
110
|
+
if not quiet
|
111
|
+
$stderr.print [:format,@format,:sample,@values],"\n"
|
112
|
+
$stderr.print [:filter,expr],"\n"
|
113
|
+
end
|
114
|
+
if ignore_missing_data
|
115
|
+
$stderr.print e.message if not quiet and not empty?
|
116
|
+
return false
|
117
|
+
else
|
118
|
+
raise NoMethodError.new(e.message + ". Can not evaluate empty sample data by default: test for s.empty? or use the -i switch!")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
174
122
|
|
123
|
+
end
|
175
124
|
|
176
125
|
end
|
177
126
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module BioVcf
|
2
|
+
|
3
|
+
class VcfStatistics
|
4
|
+
|
5
|
+
def initialize
|
6
|
+
@count = 0
|
7
|
+
@ref_alt_count = {}
|
8
|
+
end
|
9
|
+
|
10
|
+
def add rec
|
11
|
+
@count += 1
|
12
|
+
s = rec.ref+">"+rec.alt[0]
|
13
|
+
@ref_alt_count[s] ||= 0
|
14
|
+
@ref_alt_count[s] += 1
|
15
|
+
end
|
16
|
+
|
17
|
+
def print
|
18
|
+
puts "## ==== Statistics =================================="
|
19
|
+
@ref_alt_count.sort_by {|k,v| v}.reverse.each do |k,v|
|
20
|
+
printf k+"\t%d\t%2.0d%%\n",v,(v.to_f/@count*100).round
|
21
|
+
end
|
22
|
+
puts "Total\t#{@count}"
|
23
|
+
puts "## =================================================="
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
@@ -0,0 +1,31 @@
|
|
1
|
+
1 10257 159 242 249 249 186 212 218
|
2
|
+
1 10291 165 249 249 247 161 163 189
|
3
|
+
1 10297 182 246 250 246 165 158 183
|
4
|
+
1 10303 198 247 248 248 172 157 182
|
5
|
+
1 10315 212 246 242 245 190 157 189
|
6
|
+
1 10321 218 246 248 248 193 164 196
|
7
|
+
1 10327 237 238 229 237 209 183 210
|
8
|
+
1 12783 58 164 144 182 126 103 158
|
9
|
+
1 13116 32 131 102 152 104 88 109
|
10
|
+
1 13118 34 129 101 145 99 85 108
|
11
|
+
1 13178 52 172 137 172 129 119 148
|
12
|
+
1 13302 36 136 99 146 90 65 117
|
13
|
+
1 13757 53 201 181 250 152 130 182
|
14
|
+
1 13868 75 192 182 224 142 111 167
|
15
|
+
1 13896 62 135 143 175 112 81 121
|
16
|
+
1 14354 43 158 115 145 72 119
|
17
|
+
1 14464 51 155 141 150 83 89 140
|
18
|
+
1 14673 36 142 117 157 95 76 131
|
19
|
+
1 14699 43 128 109 147 98 78 114
|
20
|
+
1 14907 57 216 162 205 153 118 158
|
21
|
+
1 14930 68 216 170 210 136 125 164
|
22
|
+
1 14933 68 216 169 212 132 128 164
|
23
|
+
1 14948 63 192 181 211 129 121 153
|
24
|
+
1 14976 56 166 161 196 109 116 135
|
25
|
+
1 15118 46 198 129 230 113 126 158
|
26
|
+
1 15190 53 208 170 200 126 145 179
|
27
|
+
1 15211 54 183 161 171 120 134 168
|
28
|
+
1 15274 37 121 102 137 71 67 98
|
29
|
+
1 15447 46 242 183 226 137 173 175
|
30
|
+
1 15688 37 182 147 184 100 101 148
|
31
|
+
1 16103 50 79 86 106 60 61 84
|
@@ -0,0 +1 @@
|
|
1
|
+
unexpected return
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-vcf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-09-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -42,31 +42,32 @@ dependencies:
|
|
42
42
|
name: jeweler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 2.0.1
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 2.0.1
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: regressiontest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: 0.0.3
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
69
|
-
description: Smart parser for VCF format
|
68
|
+
version: 0.0.3
|
69
|
+
description: Smart lazy multi-threaded parser for VCF format with useful filtering
|
70
|
+
and output rewriting
|
70
71
|
email: pjotr.public01@thebird.nl
|
71
72
|
executables:
|
72
73
|
- bio-vcf
|
@@ -106,10 +107,12 @@ files:
|
|
106
107
|
- lib/bio-vcf/vcfrdf.rb
|
107
108
|
- lib/bio-vcf/vcfrecord.rb
|
108
109
|
- lib/bio-vcf/vcfsample.rb
|
110
|
+
- lib/bio-vcf/vcfstatistics.rb
|
109
111
|
- test/data/input/dbsnp.vcf
|
110
112
|
- test/data/input/multisample.vcf
|
111
113
|
- test/data/input/somaticsniper.vcf
|
112
114
|
- test/data/regression/eval_r.info.dp.ref
|
115
|
+
- test/data/regression/ifilter_s.dp.ref
|
113
116
|
- test/data/regression/r.info.dp.ref
|
114
117
|
- test/data/regression/rewrite.info.sample.ref
|
115
118
|
- test/data/regression/s.dp.ref
|
@@ -117,6 +120,7 @@ files:
|
|
117
120
|
- test/data/regression/sfilter_seval_s.dp.ref
|
118
121
|
- test/data/regression/thread4.ref
|
119
122
|
- test/data/regression/thread4_4.ref
|
123
|
+
- test/data/regression/thread4_4_failed_filter-stderr.ref
|
120
124
|
- test/performance/metrics.md
|
121
125
|
homepage: http://github.com/pjotrp/bioruby-vcf
|
122
126
|
licenses:
|
@@ -141,5 +145,5 @@ rubyforge_project:
|
|
141
145
|
rubygems_version: 2.0.3
|
142
146
|
signing_key:
|
143
147
|
specification_version: 4
|
144
|
-
summary: VCF parser
|
148
|
+
summary: Fast multi-threaded VCF parser
|
145
149
|
test_files: []
|