bio-vcf 0.0.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bio-vcf"
8
- s.version = "0.0.3"
8
+ s.version = "0.7.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pjotr Prins"]
12
- s.date = "2014-05-24"
12
+ s.date = "2014-06-24"
13
13
  s.description = "Smart parser for VCF format"
14
14
  s.email = "pjotr.public01@thebird.nl"
15
15
  s.executables = ["bio-vcf"]
@@ -30,11 +30,13 @@ Gem::Specification.new do |s|
30
30
  "features/cli.feature",
31
31
  "features/diff_count.feature",
32
32
  "features/multisample.feature",
33
+ "features/sfilter.feature",
33
34
  "features/somaticsniper.feature",
34
35
  "features/step_definitions/bio-vcf_steps.rb",
35
36
  "features/step_definitions/cli-feature.rb",
36
37
  "features/step_definitions/diff_count.rb",
37
38
  "features/step_definitions/multisample.rb",
39
+ "features/step_definitions/sfilter.rb",
38
40
  "features/step_definitions/somaticsniper.rb",
39
41
  "features/support/env.rb",
40
42
  "lib/bio-vcf.rb",
@@ -55,7 +57,9 @@ Gem::Specification.new do |s|
55
57
  "test/data/regression/rewrite.info.sample.ref",
56
58
  "test/data/regression/s.dp.ref",
57
59
  "test/data/regression/seval_s.dp.ref",
58
- "test/data/regression/sfilter001.ref",
60
+ "test/data/regression/sfilter_seval_s.dp.ref",
61
+ "test/data/regression/thread4.ref",
62
+ "test/data/regression/thread4_4.ref",
59
63
  "test/performance/metrics.md"
60
64
  ]
61
65
  s.homepage = "http://github.com/pjotrp/bioruby-vcf"
@@ -8,6 +8,16 @@ Feature: Command-line interface (CLI)
8
8
  When I execute "./bin/bio-vcf -i --filter 'r.info.dp>100'"
9
9
  Then I expect the named output to match the named output "r.info.dp"
10
10
 
11
+ Scenario: Test the info filter using dp and threads
12
+ Given I have input file(s) named "test/data/input/multisample.vcf"
13
+ When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
14
+ Then I expect the named output to match the named output "thread4"
15
+
16
+ Scenario: Test the info filter using dp and threads with lines
17
+ Given I have input file(s) named "test/data/input/multisample.vcf"
18
+ When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
19
+ Then I expect the named output to match the named output "thread4_4"
20
+
11
21
  Scenario: Test the sample filter using dp
12
22
  Given I have input file(s) named "test/data/input/multisample.vcf"
13
23
  When I execute "./bin/bio-vcf -i --sfilter 's.dp>20'"
@@ -23,6 +33,12 @@ Feature: Command-line interface (CLI)
23
33
  When I execute "./bin/bio-vcf -i --seval 's.dp'"
24
34
  Then I expect the named output to match the named output "seval_s.dp"
25
35
 
36
+ Scenario: Test the sample filter + eval using dp
37
+ Given I have input file(s) named "test/data/input/multisample.vcf"
38
+ When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
39
+ Then I expect the named output to match the named output "sfilter_seval_s.dp"
40
+
41
+
26
42
  Scenario: Rewrite an info field
27
43
  Given I have input file(s) named "test/data/input/multisample.vcf"
28
44
  When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
@@ -36,11 +36,21 @@ Feature: Multi-sample VCF
36
36
  And I expect rec.sample.original.gt to be [0,1]
37
37
  And I expect rec.sample.s3t2.pl to be [20,0,522]
38
38
  # Even better: a sample can be addressed directly on the record, without .sample
39
+ And I expect r.original.gt? to be true
39
40
  And I expect rec.original.gt to be [0,1]
40
41
  And I expect rec.s3t2.pl to be [20,0,522]
41
42
  # Check for missing data
42
43
  And I expect test rec.missing_samples? to be false
43
44
  And I expect test rec.original? to be true
45
+ # Special functions
46
+ And I expect r.original? to be true
47
+ And I expect r.original.gti? to be true
48
+ And I expect r.original.gti to be [0,1]
49
+ And I expect r.original.gti[1] to be 1
50
+ And I expect r.original.gts? to be true
51
+ And I expect r.original.gts to be ["C","T"]
52
+ And I expect r.original.gts[1] to be "T"
53
+
44
54
  Given multisample vcf line
45
55
  """
46
56
  1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL ./. ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
@@ -0,0 +1,60 @@
1
+ @sfilter
2
+ Feature: Sample filters
3
+
4
+ Bio-vcf supports sample filters, where every sample is evaluated
5
+ independently, though they have the rec information (chrom, pos, info)
6
+ available.
7
+
8
+ Scenario: Example of a sample
9
+
10
+ Given the VCF line
11
+ """
12
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL
13
+ """
14
+ When I evaluate '0/0:6,0:6:3:0,3,33'
15
+ Then I expect s.empty? to be false
16
+ Then I expect s.dp? to be true
17
+ Then I expect s.dp to be 6
18
+ And sfilter 's.dp>4' to be true
19
+
20
+ # Scenario: Sample with missing data
21
+ When I evaluate missing '0/0:6,0:.:3:0,3,33'
22
+ Then I expect s.empty? to be false
23
+ Then I expect s.dp? to be false
24
+ Then I expect s.dp to be nil
25
+ And sfilter 's.dp>4' to throw an error
26
+
27
+ # Scenario: Sample with missing data with ignore missing set
28
+ When I evaluate missing '0/0:6,0:.:3:0,3,33' with ignore missing
29
+ Then I expect s.empty? to be false
30
+ Then I expect s.dp? to be false
31
+ Then I expect s.dp to be nil
32
+ And sfilter 's.dp>4' to be false
33
+
34
+ # Scenario: Missing sample
35
+ When I evaluate empty './.'
36
+ Then I expect s.empty? to be true
37
+ Then I expect s.dp? to be false
38
+ Then I expect s.dp to throw an error
39
+ And sfilter 's.dp>4' to throw an error
40
+
41
+ # Scenario: Missing sample with ignore missing set
42
+ When I evaluate empty './.' with ignore missing
43
+ Then I expect s.empty? to be true
44
+ Then I expect s.dp? to be false
45
+ Then I expect s.dp to be nil
46
+ And sfilter 's.dp>4' to be false
47
+
48
+ # Scenario: Wrong field name in sample
49
+ When I evaluate '0/0:6,0:6:3:0,3,33'
50
+ Then I expect s.empty? to be false
51
+ Then I expect s.dp? to be true
52
+ Then I expect s.what? to throw an error
53
+ And I expect s.what to throw an error
54
+
55
+ # Scenario: Get other information for a sample
56
+ When I evaluate '0/0:6,0:6:3:0,3,33'
57
+ Then I expect r.chrom to be "1"
58
+ And I expect r.alt to be ["G"]
59
+ And I expect r.info.af to be 0.667
60
+
@@ -8,5 +8,5 @@ When /^I execute "(.*?)"$/ do |arg1|
8
8
  end
9
9
 
10
10
  Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1).should be_true
11
+ RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
12
12
  end
@@ -117,3 +117,35 @@ Then(/^I expect rec\.valid\? to be true$/) do
117
117
  expect(@rec1.valid?).to eq true
118
118
  end
119
119
 
120
# Step stubs for the genotype helpers on the "original" sample (gt?, gti,
# gts). All of them are still pending; captured values are not used yet.

Then(/^I expect r\.original\.gt\? to be true$/) { pending }

Then(/^I expect r\.original\? to be true$/) { pending }

Then(/^I expect r\.original\.gti\? to be true$/) { pending }

Then(/^I expect r\.original\.gti to be \[(\d+),(\d+)\]$/) { |_first, _second| pending }

Then(/^I expect r\.original\.gti\[(\d+)\] to be (\d+)$/) { |_index, _value| pending }

Then(/^I expect r\.original\.gts\? to be true$/) { pending }

Then(/^I expect r\.original\.gts to be \["(.*?)","(.*?)"\]$/) { |_first, _second| pending }

Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) { |_index, _value| pending }
151
+
@@ -0,0 +1,90 @@
1
# Remember the leading columns of a VCF line (in the feature they run up to
# and including the FORMAT column); the sample column is appended later by
# the "I evaluate" step. No header is used.
Given(/^the VCF line$/) do |string|
  @header = nil
  @vcfline = string
end

# Append the quoted sample column to the stored line, parse it into a record
# and keep the first sample in @s so the s.* expectations have something to
# inspect.
When(/^I evaluate '([^']+)'$/) do |arg1|
  @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
  @rec = VcfRecord.new(@fields,@header)
  # Built the same way each_sample constructs the samples it yields.
  # NOTE(review): assumes Sample works with a nil header - confirm.
  @s = VcfSample::Sample.new(@rec,@rec.sample_by_index(0))
end

# @s is set by the "I evaluate" step above.
Then(/^I expect s\.empty\? to be false$/) do
  expect(@s.empty?).to be false
end
16
+
17
+
18
# Step stubs for the sample filter feature. Every step is still pending and
# ignores the values captured by its pattern.

Then(/^I expect s\.dp to be (\d+)$/) { |_depth| pending }

Then(/^sfilter 's\.dp>(\d+)' to be true$/) { |_threshold| pending }

When(/^I evaluate missing '(\d+)\/(\d+):(\d+),(\d+):\.:(\d+):(\d+),(\d+),(\d+)'$/) do |_gt1, _gt2, _ad1, _ad2, _gq, _pl1, _pl2, _pl3|
  pending
end

Then(/^I expect s\.dp to be nil$/) { pending }

Then(/^sfilter 's\.dp>(\d+)' to be false$/) { |_threshold| pending }

When(/^I evaluate empty '\.\/\.'$/) { pending }

Then(/^sfilter 's\.dp>(\d+)' to throw an error$/) { |_threshold| pending }

When(/^I evaluate missing '(\d+)\/(\d+):(\d+),(\d+):\.:(\d+):(\d+),(\d+),(\d+)' with ignore missing$/) do |_gt1, _gt2, _ad1, _ad2, _gq, _pl1, _pl2, _pl3|
  pending
end

Then(/^I expect s\.empty\? to be true$/) { pending }

Then(/^I expect s\.dp to throw an error$/) { pending }

When(/^I evaluate empty '\.\/\.' with ignore missing$/) { pending }

Then(/^I expect s\.dp\? to be true$/) { pending }

Then(/^I expect s\.dp\? to be false$/) { pending }

Then(/^I expect s\.what\? to throw an error$/) { pending }

Then(/^I expect s\.what to throw an error$/) { pending }

Then(/^I expect r\.chrom to be "(.*?)"$/) { |_chrom| pending }

Then(/^I expect r\.alt to be \["(.*?)"\]$/) { |_allele| pending }

Then(/^I expect r\.info\.af to be (\d+)\.(\d+)$/) { |_whole, _fraction| pending }
89
+
90
+
@@ -9,14 +9,20 @@ module BioVcf
9
9
  !!Float(str) rescue false
10
10
  end
11
11
 
12
# Convert a VCF value string to a matching Ruby value. A string containing
# commas becomes an Array whose items are converted in turn. Otherwise a
# string accepted by integer? becomes an Integer, one accepted by float?
# (such as 150.268 or 9.68463e-05) becomes a Float, and anything else is
# returned unchanged.
def self::convert str
  return str.split(/,/).map { |item| convert(item) } if str =~ /,/
  return str.to_i if integer?(str)
  return str.to_f if float?(str)
  str
end
21
27
  end
22
28
 
@@ -103,6 +103,10 @@ module BioVcf
103
103
  @alt = alt
104
104
  end
105
105
 
106
+ def to_s
107
+ @original_s
108
+ end
109
+
106
110
  def values
107
111
  @cache_values ||= @original_s.split(/:/)
108
112
  end
@@ -164,7 +168,6 @@ module BioVcf
164
168
  v.split(',').map{|i| i.to_i}
165
169
  end
166
170
 
167
-
168
171
  end
169
172
 
170
173
  # Holds all samples
@@ -27,6 +27,16 @@ module BioVcf
27
27
  @lines << line.strip
28
28
  end
29
29
 
30
# Add a key value list to the header as one ##BioVcf=<Key="value",...> line
# and return that line. The keys :show_help, :skip_header, :verbose, :quiet
# and :debug are left out; h itself is not modified.
#
# The line is placed just before the last stored header line. When no
# header lines have been stored yet it becomes the only line.
def tag h
  skipped = [:show_help,:skip_header,:verbose,:quiet,:debug]
  shown = h.reject { |key,_| skipped.include?(key) }
  info = shown.map { |k,v| k.to_s.capitalize+'='+'"'+v.to_s+'"' }.join(',')
  line = '##BioVcf=<'+info+'>'
  if @lines.empty?
    # insert(-2, ...) raises IndexError on an empty Array
    @lines << line
  else
    @lines.insert(-2,line)
  end
  line
end
39
+
30
40
  def version
31
41
  @version ||= lines[0].scan(/##fileformat=VCFv(\d+\.\d+)/)[0][0]
32
42
  end
@@ -39,10 +49,24 @@ module BioVcf
39
49
  @column ||= column_names.size
40
50
  end
41
51
 
52
# Render one header line: the fields joined with tabs, where the special
# field '#samples' is replaced by the value of samples.
def printable_header_line(fields)
  expanded = []
  fields.each do |field|
    expanded << (field == '#samples' ? samples : field)
  end
  expanded.join("\t")
end
61
+
42
62
  def samples
43
63
  @samples ||= column_names[9..-1]
44
64
  end
45
65
 
66
# Index positions [0, 1, 2, ...] of the sample columns, one for every column
# name from index 9 onward (memoized in @all_samples_index).
def samples_index_array
  @all_samples_index ||= column_names[9..-1].each_index.to_a
end
69
+
46
70
  def sample_index
47
71
  return @sample_index if @sample_index
48
72
  index = {}
@@ -11,20 +11,27 @@ module BioVcf
11
11
  @prefix dc: <http://purl.org/dc/elements/1.1/> .
12
12
  @prefix hgnc: <http://identifiers.org/hgnc.symbol/> .
13
13
  @prefix doi: <http://dx.doi.org/> .
14
- @prefix : <http://biobeat.org/rdf/ns#> .
14
+ @prefix db: <http://biobeat.org/rdf/db#> .
15
+ @prefix seq: <http://biobeat.org/rdf/seq#> .
16
+ @prefix : <http://biobeat.org/rdf/vcf#> .
15
17
  EOB
16
18
  end
17
19
 
18
# Print the RDF statements for one VCF record to stdout, followed by a
# blank line.
#
# id   - prefix of the subject identifier
# rec  - the record; chrom, pos and alt (an Array) are read from it
# tags - extra predicate/object pairs, given either as a Hash or as a String
#        of Ruby source that evaluates to a Hash (or to nil for none)
#
# The subject is built from id, 'ch'+chrom, pos and the joined alt alleles,
# separated by underscores. Only the first alt allele is written as seq:alt.
def VcfRdf::record id,rec,tags = "{}"
  id2 = [id,'ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')
  print <<OUT
:#{id2} seq:chr \"#{rec.chrom}\" .
:#{id2} seq:pos #{rec.pos} .
:#{id2} seq:alt \"#{rec.alt[0]}\" .
:#{id2} db:vcf true .
OUT
  # SECURITY: a String is executed as Ruby code, so it must never come from
  # untrusted input. A Hash (the earlier calling convention) is used as is
  # instead of being handed to eval, which would raise a TypeError.
  hash = tags.kind_of?(String) ? eval(tags) : tags
  if hash
    hash.each do |k,v|
      print ":#{id2} #{k} #{v} .\n"
    end
  end
  print "\n"
end
29
36
  end
30
37
  end
@@ -27,9 +27,10 @@ module BioVcf
27
27
  v = if @h
28
28
  @h[m.to_s.upcase]
29
29
  else
30
- @info =~ /#{m.to_s.upcase}=([^;]+)/
30
+ @info =~ /#{m.to_s}=([^;]+)/i
31
31
  value = $1
32
- # m = @info.match(/#{m.to_s.upcase}=(?<value>[^;]+)/)
32
+ # p [m,value]
33
+ # m = @info.match(/#{m.to_s.upcase}=(?<value>[^;]+)/) slower!
33
34
  # value = m[:value]
34
35
  if value == nil
35
36
  split_fields # no option but to split
@@ -117,6 +118,7 @@ module BioVcf
117
118
  def initialize fields, header
118
119
  @fields = fields
119
120
  @header = header
121
+ @sample_by_index = []
120
122
  end
121
123
 
122
124
  def chrom
@@ -176,14 +178,15 @@ module BioVcf
176
178
  sample[name]
177
179
  end
178
180
 
181
# Return the genotype field of sample number i (0-based; sample columns
# start at field 9). The object is built on first access and then served
# from the @sample_by_index cache.
def sample_by_index i
  cached = @sample_by_index[i]
  return cached if cached
  @sample_by_index[i] = VcfGenotypeField.new(@fields[i+9],format,@header,alt)
end
186
# Walk the samples, yielding a VcfSample::Sample for each one. list contains
# an Array of int (the index); without a list every sample index known to
# the header is visited.
def each_sample(list = nil)
  indices = list || @header.samples_index_array()
  indices.each do |i|
    yield VcfSample::Sample.new(self,sample_by_index(i))
  end
end
188
191
 
189
192
  def missing_samples?
@@ -230,6 +233,39 @@ module BioVcf
230
233
  end
231
234
  end
232
235
 
236
# Evaluate the filter expression expr against this record.
#
# expr is Ruby source. It is compiled into the method call_cached_filter on
# the record's class, where it can refer to the record as rec or r and to
# the raw fields as fields. The compiled method is reused while the same
# expression is passed and recompiled when a different one arrives, so a
# changed expression is never answered by a previously compiled filter.
#
# Returns the value of the expression; an Array result is joined with tabs.
# On NoMethodError the fields and the expression are reported on stderr
# unless quiet is set. The method then returns false when
# ignore_missing_data is set and re-raises the error otherwise.
#
# NOTE: expr is executed as code and must come from a trusted source.
def filter expr, ignore_missing_data, quiet
  begin
    klass = self.class
    stale = klass.instance_variable_get(:@cached_filter_expr) != expr
    if stale or not respond_to?(:call_cached_filter)
      code = "def call_cached_filter(rec,fields)\n" \
             "  r = rec\n" \
             "  #{expr}\n" \
             "end\n"
      klass.class_eval(code)
      # Remember which expression the compiled method belongs to
      klass.instance_variable_set(:@cached_filter_expr, expr)
    end
    res = call_cached_filter(self,@fields)
    if res.kind_of?(Array)
      res.join("\t")
    else
      res
    end
  rescue NoMethodError => e
    if not quiet
      $stderr.print "RECORD ERROR!\n"
      $stderr.print [@fields],"\n"
      $stderr.print expr,"\n"
    end
    if ignore_missing_data
      $stderr.print e.message if not quiet
      return false
    else
      raise
    end
  end
end
268
+
233
269
  # Return the sample
234
270
  def method_missing(m, *args, &block)
235
271
  name = m.to_s