bio-vcf 0.7.0 → 0.7.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -64,7 +64,7 @@ module BioVcf
64
64
  end
65
65
 
66
66
  def samples_index_array
67
- @all_samples_index ||= column_names[9..-1].fill{|i| i}
67
+ @all_samples_index ||= column_names[9..-1].fill{|i| i}
68
68
  end
69
69
 
70
70
  def sample_index
@@ -159,19 +159,24 @@ module BioVcf
159
159
  @format ||= VcfRecordParser.get_format(@fields[8])
160
160
  end
161
161
 
162
+ # Return the first (single) sample (used in one sample VCF)
163
+ def first
164
+ @first ||= VcfGenotypeField.new(@fields[9],format,@header,ref,alt)
165
+ end
166
+
162
167
  # Return the normal sample (used in two sample VCF)
163
168
  def normal
164
- @normal ||= VcfGenotypeField.new(@fields[9],format,@header,alt)
169
+ first
165
170
  end
166
171
 
167
172
  # Return the tumor sample (used in two sample VCF)
168
173
  def tumor
169
- @tumor ||= VcfGenotypeField.new(@fields[10],format,@header,alt)
174
+ @tumor ||= VcfGenotypeField.new(@fields[10],format,@header,ref,alt)
170
175
  end
171
176
 
172
177
  # Return the sample as a named hash
173
178
  def sample
174
- @sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
179
+ @sample ||= VcfGenotypeFields.new(@fields,format,@header,ref,alt)
175
180
  end
176
181
 
177
182
  def sample_by_name name
@@ -179,14 +184,15 @@ module BioVcf
179
184
  end
180
185
 
181
186
  def sample_by_index i
182
- # p [i,@fields[i+9]]
183
- @sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,alt)
187
+ # p @fields
188
+ raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
189
+ @sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,ref,alt)
184
190
  end
185
191
 
186
192
  # Walk the samples. list contains an Array of int (the index)
187
193
  def each_sample(list = nil)
188
194
  list = @header.samples_index_array() if not list
189
- list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i)) }
195
+ list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i.to_i)) }
190
196
  end
191
197
 
192
198
  def missing_samples?
@@ -200,7 +206,7 @@ module BioVcf
200
206
  @fields.size == @header.column_names.size
201
207
  end
202
208
 
203
- def eval expr, ignore_missing_data, quiet
209
+ def eval expr, ignore_missing_data: true, quiet: false
204
210
  begin
205
211
  if not respond_to?(:call_cached_eval)
206
212
  code =
@@ -233,7 +239,7 @@ module BioVcf
233
239
  end
234
240
  end
235
241
 
236
- def filter expr, ignore_missing_data, quiet
242
+ def filter expr, ignore_missing_data: true, quiet: false
237
243
  begin
238
244
  if not respond_to?(:call_cached_filter)
239
245
  code =
@@ -1,177 +1,126 @@
1
1
  module BioVcf
2
2
  module VcfSample
3
3
 
4
- # Check whether a sample is empty (on the raw string value)
5
- def VcfSample::empty? s
6
- s == './.' or s == '' or s == nil
7
- end
8
-
9
- class Sample
10
- # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
11
- def initialize rec,sample
12
- @rec = rec
13
- @sample = sample
14
- @format = @sample.format
15
- @values = @sample.values
4
+ # Check whether a sample is empty (on the raw string value)
5
+ def VcfSample::empty? s
6
+ s == './.' or s == '' or s == nil
16
7
  end
17
8
 
18
- def empty?
19
- cache_empty ||= VcfSample::empty?(@sample.to_s)
20
- end
9
+ class Sample
10
+ # Initialize sample with rec and genotypefield
11
+ #
12
+ # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
13
+ def initialize rec,genotypefield
14
+ @rec = rec
15
+ @sample = genotypefield
16
+ @format = @sample.format
17
+ @values = @sample.values
18
+ end
21
19
 
22
- def eval expr, ignore_missing_data, quiet
23
- begin
24
- if not respond_to?(:call_cached_eval)
25
- code =
26
- """
27
- def call_cached_eval(rec,sample)
28
- r = rec
29
- s = sample
30
- #{expr}
31
- end
32
- """
33
- self.class.class_eval(code)
34
- end
35
- call_cached_eval(@rec,self)
36
- rescue NoMethodError => e
37
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
38
- if not quiet
39
- $stderr.print [@format,@values],"\n"
40
- $stderr.print expr,"\n"
41
- end
42
- if ignore_missing_data
43
- $stderr.print e.message if not quiet and not empty?
44
- return false
45
- else
46
- raise
47
- end
20
+ def empty?
21
+ cache_empty ||= VcfSample::empty?(@sample.to_s)
48
22
  end
49
- end
50
23
 
51
- def sfilter expr, ignore_missing_data, quiet
52
- begin
53
- if not respond_to?(:call_cached_sfilter)
54
- code =
55
- """
56
- def call_cached_sfilter(rec,sample)
57
- r = rec
58
- s = sample
59
- #{expr}
60
- end
61
- """
62
- self.class.class_eval(code)
63
- end
64
- call_cached_sfilter(@rec,self)
65
- rescue NoMethodError => e
66
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
67
- if not quiet
68
- $stderr.print [@format,@values],"\n"
69
- $stderr.print expr,"\n"
70
- end
71
- if ignore_missing_data
72
- $stderr.print e.message if not quiet and not empty?
73
- return false
74
- else
75
- raise
76
- end
24
+ def eval expr, ignore_missing_data: false, quiet: false, do_cache: true
25
+ caching_eval :eval, :call_cached_eval, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
77
26
  end
78
- end
79
27
 
80
- def ifilter expr, ignore_missing_data, quiet
81
- begin
82
- if not respond_to?(:call_cached_ifilter)
83
- code =
84
- """
85
- def call_cached_ifilter(rec,sample)
86
- r = rec
87
- s = sample
88
- #{expr}
89
- end
90
- """
91
- self.class.class_eval(code)
92
- end
93
- call_cached_ifilter(@rec,self)
94
- rescue NoMethodError => e
95
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
96
- if not quiet
97
- $stderr.print [@format,@values],"\n"
98
- $stderr.print expr,"\n"
99
- end
100
- if ignore_missing_data
101
- $stderr.print e.message if not quiet and not empty?
102
- return false
103
- else
104
- raise
105
- end
28
+ def sfilter expr, ignore_missing_data: false, quiet: true, do_cache: true
29
+ caching_eval :sfilter, :call_cached_sfilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
106
30
  end
107
- end
108
31
 
109
- def efilter expr, ignore_missing_data, quiet
110
- begin
111
- if not respond_to?(:call_cached_efilter)
112
- code =
113
- """
114
- def call_cached_efilter(rec,sample)
115
- r = rec
116
- s = sample
117
- #{expr}
118
- end
119
- """
120
- self.class.class_eval(code)
121
- end
122
- call_cached_efilter(@rec,self)
123
- rescue NoMethodError => e
124
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
125
- if not quiet
126
- $stderr.print [@format,@values],"\n"
127
- $stderr.print expr,"\n"
128
- end
129
- if ignore_missing_data
130
- $stderr.print e.message if not quiet and not empty?
131
- return false
132
- else
133
- raise
134
- end
32
+ def ifilter expr, ignore_missing_data: false, quiet: false, do_cache: true
33
+ caching_eval :ifilter, :call_cached_ifilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
135
34
  end
136
- end
137
35
 
138
- # Split GT into index values
139
- def gti
140
- v = fetch_values("GT")
141
- v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
142
- end
36
+ def efilter expr, ignore_missing_data: false, quiet: false, do_cache: true
37
+ caching_eval :efilter, :call_cached_efilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
38
+ end
143
39
 
144
- # Split GT into into a nucleode sequence
145
- def gts
146
- gti.map { |i| (i ? @rec.get_gt(i) : nil) }
147
- end
40
+ # Split GT into index values
41
+ def gti
42
+ v = fetch_values("GT")
43
+ v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
44
+ end
148
45
 
149
- def cache_method(name, &block)
150
- self.class.send(:define_method, name, &block)
151
- end
46
+ # Split GT into a nucleotide sequence
47
+ def gts
48
+ gti.map { |i| (i ? @rec.get_gt(i) : nil) }
49
+ end
152
50
 
153
- def method_missing(m, *args, &block)
154
- name = m.to_s.upcase
155
- if @format[name]
156
- cache_method(m) {
157
- ConvertStringToValue::convert(fetch_values(name))
158
- }
159
- self.send(m)
160
- else
161
- super(m, *args, &block)
51
+ def cache_method(name, &block)
52
+ self.class.send(:define_method, name, &block)
162
53
  end
163
- end
54
+
55
+ def method_missing(m, *args, &block)
56
+ name = m.to_s.upcase
57
+ # p [:here,name,m ,@values]
58
+ # p [:respond_to_call_cached_eval,respond_to?(:call_cached_eval)]
59
+ if name =~ /\?$/
60
+ # test for valid field
61
+ return !VcfValue::empty?(fetch_values(name.chop))
62
+ else
63
+ if @format[name]
64
+ cache_method(m) {
65
+ v = fetch_values(name)
66
+ return nil if VcfValue::empty?(v)
67
+ ConvertStringToValue::convert(v)
68
+ }
69
+ self.send(m)
70
+ else
71
+ super(m, *args, &block)
72
+ end
73
+ end
74
+ end
164
75
 
165
- private
76
+ private
166
77
 
167
- def fetch_values name
168
- n = @format[name]
169
- raise "Unknown sample field <#{name}>" if not n
170
- @values[n] # <-- save names with upcase!
171
- end
78
+ def fetch_values name
79
+ n = @format[name]
80
+ raise "Unknown sample field <#{name}>" if not n
81
+ @values[n] # <-- save names with upcase!
82
+ end
172
83
 
173
- end
84
+ def caching_eval method, cached_method, expr, ignore_missing_data: false, quiet: false, do_cache: true
85
+ begin
86
+ if do_cache
87
+ if not respond_to?(cached_method)
88
+ code =
89
+ """
90
+ def #{cached_method}(rec,sample)
91
+ r = rec
92
+ s = sample
93
+ #{expr}
94
+ end
95
+ """
96
+ self.class.class_eval(code)
97
+ end
98
+ self.send(cached_method,@rec,self)
99
+ else
100
+ # This is used for testing mostly
101
+ print "WARNING: NOT CACHING #{method}\n"
102
+ self.class.class_eval { undef :call_cached_eval } if respond_to?(:call_cached_eval)
103
+ self.class.class_eval { undef :call_cached_sfilter } if respond_to?(:call_cached_sfilter)
104
+ r = @rec
105
+ s = @sample
106
+ eval(expr)
107
+ end
108
+ rescue NoMethodError => e
109
+ $stderr.print "\n#{method} trying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
110
+ if not quiet
111
+ $stderr.print [:format,@format,:sample,@values],"\n"
112
+ $stderr.print [:filter,expr],"\n"
113
+ end
114
+ if ignore_missing_data
115
+ $stderr.print e.message if not quiet and not empty?
116
+ return false
117
+ else
118
+ raise NoMethodError.new(e.message + ". Can not evaluate empty sample data by default: test for s.empty? or use the -i switch!")
119
+ end
120
+ end
121
+ end
174
122
 
123
+ end
175
124
 
176
125
  end
177
126
  end
@@ -0,0 +1,28 @@
1
+ module BioVcf
2
+
3
+ class VcfStatistics
4
+
5
+ def initialize
6
+ @count = 0
7
+ @ref_alt_count = {}
8
+ end
9
+
10
+ def add rec
11
+ @count += 1
12
+ s = rec.ref+">"+rec.alt[0]
13
+ @ref_alt_count[s] ||= 0
14
+ @ref_alt_count[s] += 1
15
+ end
16
+
17
+ def print
18
+ puts "## ==== Statistics =================================="
19
+ @ref_alt_count.sort_by {|k,v| v}.reverse.each do |k,v|
20
+ printf k+"\t%d\t%2.0d%%\n",v,(v.to_f/@count*100).round
21
+ end
22
+ puts "Total\t#{@count}"
23
+ puts "## =================================================="
24
+ end
25
+ end
26
+
27
+ end
28
+
@@ -0,0 +1,31 @@
1
+ 1 10257 159 242 249 249 186 212 218
2
+ 1 10291 165 249 249 247 161 163 189
3
+ 1 10297 182 246 250 246 165 158 183
4
+ 1 10303 198 247 248 248 172 157 182
5
+ 1 10315 212 246 242 245 190 157 189
6
+ 1 10321 218 246 248 248 193 164 196
7
+ 1 10327 237 238 229 237 209 183 210
8
+ 1 12783 58 164 144 182 126 103 158
9
+ 1 13116 32 131 102 152 104 88 109
10
+ 1 13118 34 129 101 145 99 85 108
11
+ 1 13178 52 172 137 172 129 119 148
12
+ 1 13302 36 136 99 146 90 65 117
13
+ 1 13757 53 201 181 250 152 130 182
14
+ 1 13868 75 192 182 224 142 111 167
15
+ 1 13896 62 135 143 175 112 81 121
16
+ 1 14354 43 158 115 145 72 119
17
+ 1 14464 51 155 141 150 83 89 140
18
+ 1 14673 36 142 117 157 95 76 131
19
+ 1 14699 43 128 109 147 98 78 114
20
+ 1 14907 57 216 162 205 153 118 158
21
+ 1 14930 68 216 170 210 136 125 164
22
+ 1 14933 68 216 169 212 132 128 164
23
+ 1 14948 63 192 181 211 129 121 153
24
+ 1 14976 56 166 161 196 109 116 135
25
+ 1 15118 46 198 129 230 113 126 158
26
+ 1 15190 53 208 170 200 126 145 179
27
+ 1 15211 54 183 161 171 120 134 168
28
+ 1 15274 37 121 102 137 71 67 98
29
+ 1 15447 46 242 183 226 137 173 175
30
+ 1 15688 37 182 147 184 100 101 148
31
+ 1 16103 50 79 86 106 60 61 84
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-vcf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Pjotr Prins
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-24 00:00:00.000000000 Z
11
+ date: 2014-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -42,31 +42,32 @@ dependencies:
42
42
  name: jeweler
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: 2.0.1
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: 2.0.1
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: regressiontest
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ">="
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0'
61
+ version: 0.0.3
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ">="
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
69
- description: Smart parser for VCF format
68
+ version: 0.0.3
69
+ description: Smart lazy multi-threaded parser for VCF format with useful filtering
70
+ and output rewriting
70
71
  email: pjotr.public01@thebird.nl
71
72
  executables:
72
73
  - bio-vcf
@@ -106,10 +107,12 @@ files:
106
107
  - lib/bio-vcf/vcfrdf.rb
107
108
  - lib/bio-vcf/vcfrecord.rb
108
109
  - lib/bio-vcf/vcfsample.rb
110
+ - lib/bio-vcf/vcfstatistics.rb
109
111
  - test/data/input/dbsnp.vcf
110
112
  - test/data/input/multisample.vcf
111
113
  - test/data/input/somaticsniper.vcf
112
114
  - test/data/regression/eval_r.info.dp.ref
115
+ - test/data/regression/ifilter_s.dp.ref
113
116
  - test/data/regression/r.info.dp.ref
114
117
  - test/data/regression/rewrite.info.sample.ref
115
118
  - test/data/regression/s.dp.ref
@@ -117,6 +120,7 @@ files:
117
120
  - test/data/regression/sfilter_seval_s.dp.ref
118
121
  - test/data/regression/thread4.ref
119
122
  - test/data/regression/thread4_4.ref
123
+ - test/data/regression/thread4_4_failed_filter-stderr.ref
120
124
  - test/performance/metrics.md
121
125
  homepage: http://github.com/pjotrp/bioruby-vcf
122
126
  licenses:
@@ -141,5 +145,5 @@ rubyforge_project:
141
145
  rubygems_version: 2.0.3
142
146
  signing_key:
143
147
  specification_version: 4
144
- summary: VCF parser
148
+ summary: Fast multi-threaded VCF parser
145
149
  test_files: []