bio-vcf 0.7.0 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,7 +64,7 @@ module BioVcf
64
64
  end
65
65
 
66
66
  def samples_index_array
67
- @all_samples_index ||= column_names[9..-1].fill{|i| i}
67
+ @all_samples_index ||= column_names[9..-1].fill{|i| i}
68
68
  end
69
69
 
70
70
  def sample_index
@@ -159,19 +159,24 @@ module BioVcf
159
159
  @format ||= VcfRecordParser.get_format(@fields[8])
160
160
  end
161
161
 
162
+ # Return the first (single) sample (used in one sample VCF)
163
+ def first
164
+ @first ||= VcfGenotypeField.new(@fields[9],format,@header,ref,alt)
165
+ end
166
+
162
167
  # Return the normal sample (used in two sample VCF)
163
168
  def normal
164
- @normal ||= VcfGenotypeField.new(@fields[9],format,@header,alt)
169
+ first
165
170
  end
166
171
 
167
172
  # Return the tumor sample (used in two sample VCF)
168
173
  def tumor
169
- @tumor ||= VcfGenotypeField.new(@fields[10],format,@header,alt)
174
+ @tumor ||= VcfGenotypeField.new(@fields[10],format,@header,ref,alt)
170
175
  end
171
176
 
172
177
  # Return the sample as a named hash
173
178
  def sample
174
- @sample ||= VcfGenotypeFields.new(@fields,format,@header,alt)
179
+ @sample ||= VcfGenotypeFields.new(@fields,format,@header,ref,alt)
175
180
  end
176
181
 
177
182
  def sample_by_name name
@@ -179,14 +184,15 @@ module BioVcf
179
184
  end
180
185
 
181
186
  def sample_by_index i
182
- # p [i,@fields[i+9]]
183
- @sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,alt)
187
+ # p @fields
188
+ raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
189
+ @sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,ref,alt)
184
190
  end
185
191
 
186
192
  # Walk the samples. list contains an Array of int (the index)
187
193
  def each_sample(list = nil)
188
194
  list = @header.samples_index_array() if not list
189
- list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i)) }
195
+ list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i.to_i)) }
190
196
  end
191
197
 
192
198
  def missing_samples?
@@ -200,7 +206,7 @@ module BioVcf
200
206
  @fields.size == @header.column_names.size
201
207
  end
202
208
 
203
- def eval expr, ignore_missing_data, quiet
209
+ def eval expr, ignore_missing_data: true, quiet: false
204
210
  begin
205
211
  if not respond_to?(:call_cached_eval)
206
212
  code =
@@ -233,7 +239,7 @@ module BioVcf
233
239
  end
234
240
  end
235
241
 
236
- def filter expr, ignore_missing_data, quiet
242
+ def filter expr, ignore_missing_data: true, quiet: false
237
243
  begin
238
244
  if not respond_to?(:call_cached_filter)
239
245
  code =
@@ -1,177 +1,126 @@
1
1
  module BioVcf
2
2
  module VcfSample
3
3
 
4
- # Check whether a sample is empty (on the raw string value)
5
- def VcfSample::empty? s
6
- s == './.' or s == '' or s == nil
7
- end
8
-
9
- class Sample
10
- # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
11
- def initialize rec,sample
12
- @rec = rec
13
- @sample = sample
14
- @format = @sample.format
15
- @values = @sample.values
4
+ # Check whether a sample is empty (on the raw string value)
5
+ def VcfSample::empty? s
6
+ s == './.' or s == '' or s == nil
16
7
  end
17
8
 
18
- def empty?
19
- cache_empty ||= VcfSample::empty?(@sample.to_s)
20
- end
9
+ class Sample
10
+ # Initialize the sample with rec and genotypefield
11
+ #
12
+ # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
13
+ def initialize rec,genotypefield
14
+ @rec = rec
15
+ @sample = genotypefield
16
+ @format = @sample.format
17
+ @values = @sample.values
18
+ end
21
19
 
22
- def eval expr, ignore_missing_data, quiet
23
- begin
24
- if not respond_to?(:call_cached_eval)
25
- code =
26
- """
27
- def call_cached_eval(rec,sample)
28
- r = rec
29
- s = sample
30
- #{expr}
31
- end
32
- """
33
- self.class.class_eval(code)
34
- end
35
- call_cached_eval(@rec,self)
36
- rescue NoMethodError => e
37
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
38
- if not quiet
39
- $stderr.print [@format,@values],"\n"
40
- $stderr.print expr,"\n"
41
- end
42
- if ignore_missing_data
43
- $stderr.print e.message if not quiet and not empty?
44
- return false
45
- else
46
- raise
47
- end
20
+ def empty?
21
+ cache_empty ||= VcfSample::empty?(@sample.to_s)
48
22
  end
49
- end
50
23
 
51
- def sfilter expr, ignore_missing_data, quiet
52
- begin
53
- if not respond_to?(:call_cached_sfilter)
54
- code =
55
- """
56
- def call_cached_sfilter(rec,sample)
57
- r = rec
58
- s = sample
59
- #{expr}
60
- end
61
- """
62
- self.class.class_eval(code)
63
- end
64
- call_cached_sfilter(@rec,self)
65
- rescue NoMethodError => e
66
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
67
- if not quiet
68
- $stderr.print [@format,@values],"\n"
69
- $stderr.print expr,"\n"
70
- end
71
- if ignore_missing_data
72
- $stderr.print e.message if not quiet and not empty?
73
- return false
74
- else
75
- raise
76
- end
24
+ def eval expr, ignore_missing_data: false, quiet: false, do_cache: true
25
+ caching_eval :eval, :call_cached_eval, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
77
26
  end
78
- end
79
27
 
80
- def ifilter expr, ignore_missing_data, quiet
81
- begin
82
- if not respond_to?(:call_cached_ifilter)
83
- code =
84
- """
85
- def call_cached_ifilter(rec,sample)
86
- r = rec
87
- s = sample
88
- #{expr}
89
- end
90
- """
91
- self.class.class_eval(code)
92
- end
93
- call_cached_ifilter(@rec,self)
94
- rescue NoMethodError => e
95
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
96
- if not quiet
97
- $stderr.print [@format,@values],"\n"
98
- $stderr.print expr,"\n"
99
- end
100
- if ignore_missing_data
101
- $stderr.print e.message if not quiet and not empty?
102
- return false
103
- else
104
- raise
105
- end
28
+ def sfilter expr, ignore_missing_data: false, quiet: true, do_cache: true
29
+ caching_eval :sfilter, :call_cached_sfilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
106
30
  end
107
- end
108
31
 
109
- def efilter expr, ignore_missing_data, quiet
110
- begin
111
- if not respond_to?(:call_cached_efilter)
112
- code =
113
- """
114
- def call_cached_efilter(rec,sample)
115
- r = rec
116
- s = sample
117
- #{expr}
118
- end
119
- """
120
- self.class.class_eval(code)
121
- end
122
- call_cached_efilter(@rec,self)
123
- rescue NoMethodError => e
124
- $stderr.print "\nTrying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
125
- if not quiet
126
- $stderr.print [@format,@values],"\n"
127
- $stderr.print expr,"\n"
128
- end
129
- if ignore_missing_data
130
- $stderr.print e.message if not quiet and not empty?
131
- return false
132
- else
133
- raise
134
- end
32
+ def ifilter expr, ignore_missing_data: false, quiet: false, do_cache: true
33
+ caching_eval :ifilter, :call_cached_ifilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
135
34
  end
136
- end
137
35
 
138
- # Split GT into index values
139
- def gti
140
- v = fetch_values("GT")
141
- v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
142
- end
36
+ def efilter expr, ignore_missing_data: false, quiet: false, do_cache: true
37
+ caching_eval :efilter, :call_cached_efilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
38
+ end
143
39
 
144
- # Split GT into a nucleotide sequence
145
- def gts
146
- gti.map { |i| (i ? @rec.get_gt(i) : nil) }
147
- end
40
+ # Split GT into index values
41
+ def gti
42
+ v = fetch_values("GT")
43
+ v.split(/\//).map{ |v| (v=='.' ? nil : v.to_i) }
44
+ end
148
45
 
149
- def cache_method(name, &block)
150
- self.class.send(:define_method, name, &block)
151
- end
46
+ # Split GT into a nucleotide sequence
47
+ def gts
48
+ gti.map { |i| (i ? @rec.get_gt(i) : nil) }
49
+ end
152
50
 
153
- def method_missing(m, *args, &block)
154
- name = m.to_s.upcase
155
- if @format[name]
156
- cache_method(m) {
157
- ConvertStringToValue::convert(fetch_values(name))
158
- }
159
- self.send(m)
160
- else
161
- super(m, *args, &block)
51
+ def cache_method(name, &block)
52
+ self.class.send(:define_method, name, &block)
162
53
  end
163
- end
54
+
55
+ def method_missing(m, *args, &block)
56
+ name = m.to_s.upcase
57
+ # p [:here,name,m ,@values]
58
+ # p [:respond_to_call_cached_eval,respond_to?(:call_cached_eval)]
59
+ if name =~ /\?$/
60
+ # test for valid field
61
+ return !VcfValue::empty?(fetch_values(name.chop))
62
+ else
63
+ if @format[name]
64
+ cache_method(m) {
65
+ v = fetch_values(name)
66
+ return nil if VcfValue::empty?(v)
67
+ ConvertStringToValue::convert(v)
68
+ }
69
+ self.send(m)
70
+ else
71
+ super(m, *args, &block)
72
+ end
73
+ end
74
+ end
164
75
 
165
- private
76
+ private
166
77
 
167
- def fetch_values name
168
- n = @format[name]
169
- raise "Unknown sample field <#{name}>" if not n
170
- @values[n] # <-- save names with upcase!
171
- end
78
+ def fetch_values name
79
+ n = @format[name]
80
+ raise "Unknown sample field <#{name}>" if not n
81
+ @values[n] # <-- save names with upcase!
82
+ end
172
83
 
173
- end
84
+ def caching_eval method, cached_method, expr, ignore_missing_data: false, quiet: false, do_cache: true
85
+ begin
86
+ if do_cache
87
+ if not respond_to?(cached_method)
88
+ code =
89
+ """
90
+ def #{cached_method}(rec,sample)
91
+ r = rec
92
+ s = sample
93
+ #{expr}
94
+ end
95
+ """
96
+ self.class.class_eval(code)
97
+ end
98
+ self.send(cached_method,@rec,self)
99
+ else
100
+ # This is used for testing mostly
101
+ print "WARNING: NOT CACHING #{method}\n"
102
+ self.class.class_eval { undef :call_cached_eval } if respond_to?(:call_cached_eval)
103
+ self.class.class_eval { undef :call_cached_sfilter } if respond_to?(:call_cached_sfilter)
104
+ r = @rec
105
+ s = @sample
106
+ eval(expr)
107
+ end
108
+ rescue NoMethodError => e
109
+ $stderr.print "\n#{method} trying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
110
+ if not quiet
111
+ $stderr.print [:format,@format,:sample,@values],"\n"
112
+ $stderr.print [:filter,expr],"\n"
113
+ end
114
+ if ignore_missing_data
115
+ $stderr.print e.message if not quiet and not empty?
116
+ return false
117
+ else
118
+ raise NoMethodError.new(e.message + ". Can not evaluate empty sample data by default: test for s.empty? or use the -i switch!")
119
+ end
120
+ end
121
+ end
174
122
 
123
+ end
175
124
 
176
125
  end
177
126
  end
@@ -0,0 +1,28 @@
1
+ module BioVcf
2
+
3
+ class VcfStatistics
4
+
5
+ def initialize
6
+ @count = 0
7
+ @ref_alt_count = {}
8
+ end
9
+
10
+ def add rec
11
+ @count += 1
12
+ s = rec.ref+">"+rec.alt[0]
13
+ @ref_alt_count[s] ||= 0
14
+ @ref_alt_count[s] += 1
15
+ end
16
+
17
+ def print
18
+ puts "## ==== Statistics =================================="
19
+ @ref_alt_count.sort_by {|k,v| v}.reverse.each do |k,v|
20
+ printf k+"\t%d\t%2.0d%%\n",v,(v.to_f/@count*100).round
21
+ end
22
+ puts "Total\t#{@count}"
23
+ puts "## =================================================="
24
+ end
25
+ end
26
+
27
+ end
28
+
@@ -0,0 +1,31 @@
1
+ 1 10257 159 242 249 249 186 212 218
2
+ 1 10291 165 249 249 247 161 163 189
3
+ 1 10297 182 246 250 246 165 158 183
4
+ 1 10303 198 247 248 248 172 157 182
5
+ 1 10315 212 246 242 245 190 157 189
6
+ 1 10321 218 246 248 248 193 164 196
7
+ 1 10327 237 238 229 237 209 183 210
8
+ 1 12783 58 164 144 182 126 103 158
9
+ 1 13116 32 131 102 152 104 88 109
10
+ 1 13118 34 129 101 145 99 85 108
11
+ 1 13178 52 172 137 172 129 119 148
12
+ 1 13302 36 136 99 146 90 65 117
13
+ 1 13757 53 201 181 250 152 130 182
14
+ 1 13868 75 192 182 224 142 111 167
15
+ 1 13896 62 135 143 175 112 81 121
16
+ 1 14354 43 158 115 145 72 119
17
+ 1 14464 51 155 141 150 83 89 140
18
+ 1 14673 36 142 117 157 95 76 131
19
+ 1 14699 43 128 109 147 98 78 114
20
+ 1 14907 57 216 162 205 153 118 158
21
+ 1 14930 68 216 170 210 136 125 164
22
+ 1 14933 68 216 169 212 132 128 164
23
+ 1 14948 63 192 181 211 129 121 153
24
+ 1 14976 56 166 161 196 109 116 135
25
+ 1 15118 46 198 129 230 113 126 158
26
+ 1 15190 53 208 170 200 126 145 179
27
+ 1 15211 54 183 161 171 120 134 168
28
+ 1 15274 37 121 102 137 71 67 98
29
+ 1 15447 46 242 183 226 137 173 175
30
+ 1 15688 37 182 147 184 100 101 148
31
+ 1 16103 50 79 86 106 60 61 84
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-vcf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Pjotr Prins
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-24 00:00:00.000000000 Z
11
+ date: 2014-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -42,31 +42,32 @@ dependencies:
42
42
  name: jeweler
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: 2.0.1
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: 2.0.1
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: regressiontest
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ">="
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0'
61
+ version: 0.0.3
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ">="
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
69
- description: Smart parser for VCF format
68
+ version: 0.0.3
69
+ description: Smart lazy multi-threaded parser for VCF format with useful filtering
70
+ and output rewriting
70
71
  email: pjotr.public01@thebird.nl
71
72
  executables:
72
73
  - bio-vcf
@@ -106,10 +107,12 @@ files:
106
107
  - lib/bio-vcf/vcfrdf.rb
107
108
  - lib/bio-vcf/vcfrecord.rb
108
109
  - lib/bio-vcf/vcfsample.rb
110
+ - lib/bio-vcf/vcfstatistics.rb
109
111
  - test/data/input/dbsnp.vcf
110
112
  - test/data/input/multisample.vcf
111
113
  - test/data/input/somaticsniper.vcf
112
114
  - test/data/regression/eval_r.info.dp.ref
115
+ - test/data/regression/ifilter_s.dp.ref
113
116
  - test/data/regression/r.info.dp.ref
114
117
  - test/data/regression/rewrite.info.sample.ref
115
118
  - test/data/regression/s.dp.ref
@@ -117,6 +120,7 @@ files:
117
120
  - test/data/regression/sfilter_seval_s.dp.ref
118
121
  - test/data/regression/thread4.ref
119
122
  - test/data/regression/thread4_4.ref
123
+ - test/data/regression/thread4_4_failed_filter-stderr.ref
120
124
  - test/performance/metrics.md
121
125
  homepage: http://github.com/pjotrp/bioruby-vcf
122
126
  licenses:
@@ -141,5 +145,5 @@ rubyforge_project:
141
145
  rubygems_version: 2.0.3
142
146
  signing_key:
143
147
  specification_version: 4
144
- summary: VCF parser
148
+ summary: Fast multi-threaded VCF parser
145
149
  test_files: []