bio-vcf 0.0.3 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bio-vcf"
8
- s.version = "0.0.3"
8
+ s.version = "0.7.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pjotr Prins"]
12
- s.date = "2014-05-24"
12
+ s.date = "2014-06-24"
13
13
  s.description = "Smart parser for VCF format"
14
14
  s.email = "pjotr.public01@thebird.nl"
15
15
  s.executables = ["bio-vcf"]
@@ -30,11 +30,13 @@ Gem::Specification.new do |s|
30
30
  "features/cli.feature",
31
31
  "features/diff_count.feature",
32
32
  "features/multisample.feature",
33
+ "features/sfilter.feature",
33
34
  "features/somaticsniper.feature",
34
35
  "features/step_definitions/bio-vcf_steps.rb",
35
36
  "features/step_definitions/cli-feature.rb",
36
37
  "features/step_definitions/diff_count.rb",
37
38
  "features/step_definitions/multisample.rb",
39
+ "features/step_definitions/sfilter.rb",
38
40
  "features/step_definitions/somaticsniper.rb",
39
41
  "features/support/env.rb",
40
42
  "lib/bio-vcf.rb",
@@ -55,7 +57,9 @@ Gem::Specification.new do |s|
55
57
  "test/data/regression/rewrite.info.sample.ref",
56
58
  "test/data/regression/s.dp.ref",
57
59
  "test/data/regression/seval_s.dp.ref",
58
- "test/data/regression/sfilter001.ref",
60
+ "test/data/regression/sfilter_seval_s.dp.ref",
61
+ "test/data/regression/thread4.ref",
62
+ "test/data/regression/thread4_4.ref",
59
63
  "test/performance/metrics.md"
60
64
  ]
61
65
  s.homepage = "http://github.com/pjotrp/bioruby-vcf"
@@ -8,6 +8,16 @@ Feature: Command-line interface (CLI)
8
8
  When I execute "./bin/bio-vcf -i --filter 'r.info.dp>100'"
9
9
  Then I expect the named output to match the named output "r.info.dp"
10
10
 
11
+ Scenario: Test the info filter using dp and threads
12
+ Given I have input file(s) named "test/data/input/multisample.vcf"
13
+ When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
14
+ Then I expect the named output to match the named output "thread4"
15
+
16
+ Scenario: Test the info filter using dp and threads with lines
17
+ Given I have input file(s) named "test/data/input/multisample.vcf"
18
+ When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
19
+ Then I expect the named output to match the named output "thread4_4"
20
+
11
21
  Scenario: Test the sample filter using dp
12
22
  Given I have input file(s) named "test/data/input/multisample.vcf"
13
23
  When I execute "./bin/bio-vcf -i --sfilter 's.dp>20'"
@@ -23,6 +33,12 @@ Feature: Command-line interface (CLI)
23
33
  When I execute "./bin/bio-vcf -i --seval 's.dp'"
24
34
  Then I expect the named output to match the named output "seval_s.dp"
25
35
 
36
+ Scenario: Test the sample filter + eval using dp
37
+ Given I have input file(s) named "test/data/input/multisample.vcf"
38
+ When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
39
+ Then I expect the named output to match the named output "sfilter_seval_s.dp"
40
+
41
+
26
42
  Scenario: Rewrite an info field
27
43
  Given I have input file(s) named "test/data/input/multisample.vcf"
28
44
  When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
@@ -36,11 +36,21 @@ Feature: Multi-sample VCF
36
36
  And I expect rec.sample.original.gt to be [0,1]
37
37
  And I expect rec.sample.s3t2.pl to be [20,0,522]
38
38
  # And the even better
39
+ And I expect r.original.gt? to be true
39
40
  And I expect rec.original.gt to be [0,1]
40
41
  And I expect rec.s3t2.pl to be [20,0,522]
41
42
  # Check for missing data
42
43
  And I expect test rec.missing_samples? to be false
43
44
  And I expect test rec.original? to be true
45
+ # Special functions
46
+ And I expect r.original? to be true
47
+ And I expect r.original.gti? to be true
48
+ And I expect r.original.gti to be [0,1]
49
+ And I expect r.original.gti[1] to be 1
50
+ And I expect r.original.gts? to be true
51
+ And I expect r.original.gts to be ["C","T"]
52
+ And I expect r.original.gts[1] to be "T"
53
+
44
54
  Given multisample vcf line
45
55
  """
46
56
  1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL ./. ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
@@ -0,0 +1,60 @@
1
+ @sfilter
2
+ Feature: Sample filters
3
+
4
+ Bio-vcf supports sample filters, where every sample is evaluated
5
+ independently, though they have the rec information (chrom, pos, info)
6
+ available.
7
+
8
+ Scenario: Example of a sample
9
+
10
+ Given the VCF line
11
+ """
12
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL
13
+ """
14
+ When I evaluate '0/0:6,0:6:3:0,3,33'
15
+ Then I expect s.empty? to be false
16
+ Then I expect s.dp? to be true
17
+ Then I expect s.dp to be 6
18
+ And sfilter 's.dp>4' to be true
19
+
20
+ # Scenario: Sample with missing data
21
+ When I evaluate missing '0/0:6,0:.:3:0,3,33'
22
+ Then I expect s.empty? to be false
23
+ Then I expect s.dp? to be false
24
+ Then I expect s.dp to be nil
25
+ And sfilter 's.dp>4' to throw an error
26
+
27
+ # Scenario: Sample with missing data with ignore missing set
28
+ When I evaluate missing '0/0:6,0:.:3:0,3,33' with ignore missing
29
+ Then I expect s.empty? to be false
30
+ Then I expect s.dp? to be false
31
+ Then I expect s.dp to be nil
32
+ And sfilter 's.dp>4' to be false
33
+
34
+ # Scenario: Missing sample
35
+ When I evaluate empty './.'
36
+ Then I expect s.empty? to be true
37
+ Then I expect s.dp? to be false
38
+ Then I expect s.dp to throw an error
39
+ And sfilter 's.dp>4' to throw an error
40
+
41
+ # Scenario: Missing sample with ignore missing set
42
+ When I evaluate empty './.' with ignore missing
43
+ Then I expect s.empty? to be true
44
+ Then I expect s.dp? to be false
45
+ Then I expect s.dp to be nil
46
+ And sfilter 's.dp>4' to be false
47
+
48
+ # Scenario: Wrong field name in sample
49
+ When I evaluate '0/0:6,0:6:3:0,3,33'
50
+ Then I expect s.empty? to be false
51
+ Then I expect s.dp? to be true
52
+ Then I expect s.what? to throw an error
53
+ And I expect s.what to throw an error
54
+
55
+ # Scenario: Get other information for a sample
56
+ When I evaluate '0/0:6,0:6:3:0,3,33'
57
+ Then I expect r.chrom to be "1"
58
+ And I expect r.alt to be ["G"]
59
+ And I expect r.info.af to be 0.667
60
+
@@ -8,5 +8,5 @@ When /^I execute "(.*?)"$/ do |arg1|
8
8
  end
9
9
 
10
10
  Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1).should be_true
11
+ RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
12
12
  end
@@ -117,3 +117,35 @@ Then(/^I expect rec\.valid\? to be true$/) do
117
117
  expect(@rec1.valid?).to eq true
118
118
  end
119
119
 
120
+ Then(/^I expect r\.original\.gt\? to be true$/) do
121
+ pending # express the regexp above with the code you wish you had
122
+ end
123
+
124
+ Then(/^I expect r\.original\? to be true$/) do
125
+ pending # express the regexp above with the code you wish you had
126
+ end
127
+
128
+ Then(/^I expect r\.original\.gti\? to be true$/) do
129
+ pending # express the regexp above with the code you wish you had
130
+ end
131
+
132
+ Then(/^I expect r\.original\.gti to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
133
+ pending # express the regexp above with the code you wish you had
134
+ end
135
+
136
+ Then(/^I expect r\.original\.gti\[(\d+)\] to be (\d+)$/) do |arg1, arg2|
137
+ pending # express the regexp above with the code you wish you had
138
+ end
139
+
140
+ Then(/^I expect r\.original\.gts\? to be true$/) do
141
+ pending # express the regexp above with the code you wish you had
142
+ end
143
+
144
+ Then(/^I expect r\.original\.gts to be \["(.*?)","(.*?)"\]$/) do |arg1, arg2|
145
+ pending # express the regexp above with the code you wish you had
146
+ end
147
+
148
+ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
149
+ pending # express the regexp above with the code you wish you had
150
+ end
151
+
@@ -0,0 +1,90 @@
1
+ Given(/^the VCF line$/) do |string|
2
+ @header = nil
3
+ @vcfline = string
4
+ end
5
+
6
+ When(/^I evaluate '([^']+)'$/) do |arg1|
7
+ @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
8
+ @rec = VcfRecord.new(@fields,@header)
9
+ p @rec
10
+ end
11
+
12
+ Then(/^I expect s\.empty\? to be false$/) do
13
+ p @rec.sample[0]
14
+ expect(@s.empty?).to be false
15
+ end
16
+
17
+
18
+ Then(/^I expect s\.dp to be (\d+)$/) do |arg1|
19
+ pending # express the regexp above with the code you wish you had
20
+ end
21
+
22
+ Then(/^sfilter 's\.dp>(\d+)' to be true$/) do |arg1|
23
+ pending # express the regexp above with the code you wish you had
24
+ end
25
+
26
+ When(/^I evaluate missing '(\d+)\/(\d+):(\d+),(\d+):\.:(\d+):(\d+),(\d+),(\d+)'$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8|
27
+ pending # express the regexp above with the code you wish you had
28
+ end
29
+
30
+ Then(/^I expect s\.dp to be nil$/) do
31
+ pending # express the regexp above with the code you wish you had
32
+ end
33
+
34
+ Then(/^sfilter 's\.dp>(\d+)' to be false$/) do |arg1|
35
+ pending # express the regexp above with the code you wish you had
36
+ end
37
+
38
+ When(/^I evaluate empty '\.\/\.'$/) do
39
+ pending # express the regexp above with the code you wish you had
40
+ end
41
+
42
+ Then(/^sfilter 's\.dp>(\d+)' to throw an error$/) do |arg1|
43
+ pending # express the regexp above with the code you wish you had
44
+ end
45
+
46
+ When(/^I evaluate missing '(\d+)\/(\d+):(\d+),(\d+):\.:(\d+):(\d+),(\d+),(\d+)' with ignore missing$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8|
47
+ pending # express the regexp above with the code you wish you had
48
+ end
49
+
50
+ Then(/^I expect s\.empty\? to be true$/) do
51
+ pending # express the regexp above with the code you wish you had
52
+ end
53
+
54
+ Then(/^I expect s\.dp to throw an error$/) do
55
+ pending # express the regexp above with the code you wish you had
56
+ end
57
+
58
+ When(/^I evaluate empty '\.\/\.' with ignore missing$/) do
59
+ pending # express the regexp above with the code you wish you had
60
+ end
61
+
62
+ Then(/^I expect s\.dp\? to be true$/) do
63
+ pending # express the regexp above with the code you wish you had
64
+ end
65
+
66
+ Then(/^I expect s\.dp\? to be false$/) do
67
+ pending # express the regexp above with the code you wish you had
68
+ end
69
+
70
+ Then(/^I expect s\.what\? to throw an error$/) do
71
+ pending # express the regexp above with the code you wish you had
72
+ end
73
+
74
+ Then(/^I expect s\.what to throw an error$/) do
75
+ pending # express the regexp above with the code you wish you had
76
+ end
77
+
78
+ Then(/^I expect r\.chrom to be "(.*?)"$/) do |arg1|
79
+ pending # express the regexp above with the code you wish you had
80
+ end
81
+
82
+ Then(/^I expect r\.alt to be \["(.*?)"\]$/) do |arg1|
83
+ pending # express the regexp above with the code you wish you had
84
+ end
85
+
86
+ Then(/^I expect r\.info\.af to be (\d+)\.(\d+)$/) do |arg1, arg2|
87
+ pending # express the regexp above with the code you wish you had
88
+ end
89
+
90
+
@@ -9,14 +9,20 @@ module BioVcf
9
9
  !!Float(str) rescue false
10
10
  end
11
11
 
12
- def self::convert v
13
- if integer?(v) # the common case
14
- v = v.to_i
12
+ def self::convert str
13
+ if str =~ /,/
14
+ str.split(/,/).map { |item| convert(item) }
15
15
  else
16
- # 150.268 or 9.68463e-05
17
- v = v.to_f if float?(v)
16
+ if integer?(str)
17
+ str.to_i
18
+ else
19
+ if float?(str)
20
+ str.to_f
21
+ else
22
+ str
23
+ end
24
+ end
18
25
  end
19
- v
20
26
  end
21
27
  end
22
28
 
@@ -103,6 +103,10 @@ module BioVcf
103
103
  @alt = alt
104
104
  end
105
105
 
106
+ def to_s
107
+ @original_s
108
+ end
109
+
106
110
  def values
107
111
  @cache_values ||= @original_s.split(/:/)
108
112
  end
@@ -164,7 +168,6 @@ module BioVcf
164
168
  v.split(',').map{|i| i.to_i}
165
169
  end
166
170
 
167
-
168
171
  end
169
172
 
170
173
  # Holds all samples
@@ -27,6 +27,16 @@ module BioVcf
27
27
  @lines << line.strip
28
28
  end
29
29
 
30
+ # Add a key value list to the header
31
+ def tag h
32
+ h2 = h.dup
33
+ [:show_help,:skip_header,:verbose,:quiet,:debug].each { |key| h2.delete(key) }
34
+ info = h2.map { |k,v| k.to_s.capitalize+'='+'"'+v.to_s+'"' }.join(',')
35
+ line = '##BioVcf=<'+info+'>'
36
+ @lines.insert(-2,line)
37
+ line
38
+ end
39
+
30
40
  def version
31
41
  @version ||= lines[0].scan(/##fileformat=VCFv(\d+\.\d+)/)[0][0]
32
42
  end
@@ -39,10 +49,24 @@ module BioVcf
39
49
  @column ||= column_names.size
40
50
  end
41
51
 
52
+ def printable_header_line(fields)
53
+ fields.map { | field |
54
+ if field == '#samples'
55
+ samples
56
+ else
57
+ field
58
+ end
59
+ }.join("\t")
60
+ end
61
+
42
62
  def samples
43
63
  @samples ||= column_names[9..-1]
44
64
  end
45
65
 
66
+ def samples_index_array
67
+ @all_samples_index ||= column_names[9..-1].fill{|i| i}
68
+ end
69
+
46
70
  def sample_index
47
71
  return @sample_index if @sample_index
48
72
  index = {}
@@ -11,20 +11,27 @@ module BioVcf
11
11
  @prefix dc: <http://purl.org/dc/elements/1.1/> .
12
12
  @prefix hgnc: <http://identifiers.org/hgnc.symbol/> .
13
13
  @prefix doi: <http://dx.doi.org/> .
14
- @prefix : <http://biobeat.org/rdf/ns#> .
14
+ @prefix db: <http://biobeat.org/rdf/db#> .
15
+ @prefix seq: <http://biobeat.org/rdf/seq#> .
16
+ @prefix : <http://biobeat.org/rdf/vcf#> .
15
17
  EOB
16
18
  end
17
19
 
18
- def VcfRdf::record id,rec,hash = {}
19
- id2 = [id,'ch'+rec.chrom,rec.pos].join('_')
20
+ def VcfRdf::record id,rec,tags = "{}"
21
+ id2 = [id,'ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')
20
22
  print <<OUT
21
- :#{id2} :chr \"#{rec.chrom}\" .
22
- :#{id2} :pos #{rec.pos} .
23
- :#{id2} :vcf true .
23
+ :#{id2} seq:chr \"#{rec.chrom}\" .
24
+ :#{id2} seq:pos #{rec.pos} .
25
+ :#{id2} seq:alt \"#{rec.alt[0]}\" .
26
+ :#{id2} db:vcf true .
24
27
  OUT
25
- hash.each do |k,v|
26
- print ":#{id2} :#{k} #{v} .\n"
28
+ hash = eval(tags)
29
+ if hash
30
+ hash.each do |k,v|
31
+ print ":#{id2} #{k} #{v} .\n"
32
+ end
27
33
  end
34
+ print "\n"
28
35
  end
29
36
  end
30
37
  end
@@ -27,9 +27,10 @@ module BioVcf
27
27
  v = if @h
28
28
  @h[m.to_s.upcase]
29
29
  else
30
- @info =~ /#{m.to_s.upcase}=([^;]+)/
30
+ @info =~ /#{m.to_s}=([^;]+)/i
31
31
  value = $1
32
- # m = @info.match(/#{m.to_s.upcase}=(?<value>[^;]+)/)
32
+ # p [m,value]
33
+ # m = @info.match(/#{m.to_s.upcase}=(?<value>[^;]+)/) slower!
33
34
  # value = m[:value]
34
35
  if value == nil
35
36
  split_fields # no option but to split
@@ -117,6 +118,7 @@ module BioVcf
117
118
  def initialize fields, header
118
119
  @fields = fields
119
120
  @header = header
121
+ @sample_by_index = []
120
122
  end
121
123
 
122
124
  def chrom
@@ -176,14 +178,15 @@ module BioVcf
176
178
  sample[name]
177
179
  end
178
180
 
181
+ def sample_by_index i
182
+ # p [i,@fields[i+9]]
183
+ @sample_by_index[i] ||= VcfGenotypeField.new(@fields[i+9],format,@header,alt)
184
+ end
185
+
186
+ # Walk the samples. list contains an Array of int (the index)
179
187
  def each_sample(list = nil)
180
- samples = @header.column_names[9..-1]
181
- raise "Empty sample list, can not execute query!" if not samples
182
- samples.each_with_index { |name,i|
183
- # p [i,list]
184
- next if list and not list.index(i.to_s)
185
- yield VcfSample::Sample.new(self,sample[name])
186
- }
188
+ list = @header.samples_index_array() if not list
189
+ list.each { |i| yield VcfSample::Sample.new(self,sample_by_index(i)) }
187
190
  end
188
191
 
189
192
  def missing_samples?
@@ -230,6 +233,39 @@ module BioVcf
230
233
  end
231
234
  end
232
235
 
236
+ def filter expr, ignore_missing_data, quiet
237
+ begin
238
+ if not respond_to?(:call_cached_filter)
239
+ code =
240
+ """
241
+ def call_cached_filter(rec,fields)
242
+ r = rec
243
+ #{expr}
244
+ end
245
+ """
246
+ self.class.class_eval(code)
247
+ end
248
+ res = call_cached_filter(self,@fields)
249
+ if res.kind_of?(Array)
250
+ res.join("\t")
251
+ else
252
+ res
253
+ end
254
+ rescue NoMethodError => e
255
+ if not quiet
256
+ $stderr.print "RECORD ERROR!\n"
257
+ $stderr.print [@fields],"\n"
258
+ $stderr.print expr,"\n"
259
+ end
260
+ if ignore_missing_data
261
+ $stderr.print e.message if not quiet
262
+ return false
263
+ else
264
+ raise
265
+ end
266
+ end
267
+ end
268
+
233
269
  # Return the sample
234
270
  def method_missing(m, *args, &block)
235
271
  name = m.to_s