bio-vcf 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +12 -0
- data/Gemfile +6 -6
- data/Gemfile.lock +49 -54
- data/README.md +83 -10
- data/Rakefile +5 -5
- data/VERSION +1 -1
- data/bin/bio-vcf +24 -8
- data/bio-vcf.gemspec +73 -0
- data/features/diff_count.feature +30 -0
- data/features/multisample.feature +37 -0
- data/features/somaticsniper.feature +84 -0
- data/features/step_definitions/diff_count.rb +41 -0
- data/features/step_definitions/multisample.rb +73 -0
- data/features/step_definitions/somaticsniper.rb +122 -0
- data/features/support/env.rb +4 -0
- data/lib/bio-vcf/variant.rb +38 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +118 -10
- data/lib/bio-vcf/vcfheader.rb +5 -0
- data/lib/bio-vcf/vcfrdf.rb +30 -0
- data/lib/bio-vcf/vcfrecord.rb +68 -5
- data/lib/bio-vcf.rb +1 -0
- data/test/data/input/multisample.vcf +150 -0
- metadata +28 -76
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 014f3f4adef8a533501e115027fc8a920487f949
|
4
|
+
data.tar.gz: 89e442176c21a0893c267dc324db403fd57e7577
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 624f6cd5251384da85b824cd1d98e49be2efcb765d34b902b225ba8e8dc7ce7b6fa68d9a47b8bea5f7eb60b33e21182c7d941d7b0da70ce58228354021c596a0
|
7
|
+
data.tar.gz: 7ffe437d72368db97987fe53c8aa93bebdcff68002975f86e02cf45ec69b48f76e060e76140e27242122b3c533021eae4073a19c0c0a09089edac62b02394682
|
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.3
|
4
|
+
- 2.1.0
|
5
|
+
- jruby-head
|
6
|
+
# - jruby-19mode # JRuby in 1.9 mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
# script: bundle exec rspec spec
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ source "http://rubygems.org"
|
|
6
6
|
# Add dependencies to develop your gem here.
|
7
7
|
# Include everything needed to run rake, tests, features, etc.
|
8
8
|
group :development do
|
9
|
-
gem "minitest"
|
10
|
-
gem "rspec"
|
11
|
-
gem "cucumber"
|
12
|
-
gem "jeweler", "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
-
gem "bundler", ">= 1.0.21"
|
9
|
+
# gem "minitest"
|
10
|
+
gem "rspec"
|
11
|
+
gem "cucumber"
|
12
|
+
gem "jeweler" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
+
# gem "bundler", ">= 1.0.21"
|
14
14
|
# gem "bio", ">= 1.4.2"
|
15
|
-
gem "rdoc", "~> 3.12"
|
15
|
+
# gem "rdoc", "~> 3.12"
|
16
16
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,78 +1,73 @@
|
|
1
|
-
GIT
|
2
|
-
remote: https://github.com/technicalpickles/jeweler.git
|
3
|
-
revision: f7e0a55a207d83f56637dd8fbabf26a803410faf
|
4
|
-
specs:
|
5
|
-
jeweler (1.8.7)
|
6
|
-
builder
|
7
|
-
bundler (~> 1.0)
|
8
|
-
git (>= 1.2.5)
|
9
|
-
github_api (= 0.10.1)
|
10
|
-
highline (>= 1.6.15)
|
11
|
-
nokogiri (= 1.5.10)
|
12
|
-
rake
|
13
|
-
rdoc
|
14
|
-
|
15
1
|
GEM
|
16
2
|
remote: http://rubygems.org/
|
17
3
|
specs:
|
18
4
|
addressable (2.3.5)
|
19
5
|
builder (3.2.2)
|
20
|
-
cucumber (1.3.
|
6
|
+
cucumber (1.3.11)
|
21
7
|
builder (>= 2.1.2)
|
22
8
|
diff-lcs (>= 1.1.3)
|
23
|
-
gherkin (~> 2.12
|
24
|
-
multi_json (
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
9
|
+
gherkin (~> 2.12)
|
10
|
+
multi_json (>= 1.7.5, < 2.0)
|
11
|
+
multi_test (>= 0.0.2)
|
12
|
+
descendants_tracker (0.0.3)
|
13
|
+
diff-lcs (1.2.5)
|
14
|
+
faraday (0.9.0)
|
15
|
+
multipart-post (>= 1.2, < 3)
|
16
|
+
gherkin (2.12.2)
|
29
17
|
multi_json (~> 1.3)
|
30
18
|
git (1.2.6)
|
31
|
-
github_api (0.
|
32
|
-
addressable
|
33
|
-
|
19
|
+
github_api (0.11.3)
|
20
|
+
addressable (~> 2.3)
|
21
|
+
descendants_tracker (~> 0.0.1)
|
22
|
+
faraday (~> 0.8, < 0.10)
|
34
23
|
hashie (>= 1.2)
|
35
|
-
multi_json (
|
36
|
-
nokogiri (~> 1.
|
24
|
+
multi_json (>= 1.7.5, < 2.0)
|
25
|
+
nokogiri (~> 1.6.0)
|
37
26
|
oauth2
|
38
27
|
hashie (2.0.5)
|
39
|
-
highline (1.6.
|
40
|
-
|
41
|
-
|
42
|
-
|
28
|
+
highline (1.6.21)
|
29
|
+
jeweler (2.0.1)
|
30
|
+
builder
|
31
|
+
bundler (>= 1.0)
|
32
|
+
git (>= 1.2.5)
|
33
|
+
github_api
|
34
|
+
highline (>= 1.6.15)
|
35
|
+
nokogiri (>= 1.5.10)
|
36
|
+
rake
|
37
|
+
rdoc
|
38
|
+
json (1.8.1)
|
39
|
+
jwt (0.1.11)
|
43
40
|
multi_json (>= 1.5)
|
44
|
-
|
45
|
-
multi_json (1.
|
41
|
+
mini_portile (0.5.2)
|
42
|
+
multi_json (1.9.0)
|
43
|
+
multi_test (0.0.3)
|
46
44
|
multi_xml (0.5.5)
|
47
|
-
multipart-post (
|
48
|
-
nokogiri (1.
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
jwt (~> 0.1.
|
53
|
-
multi_json (~> 1.
|
45
|
+
multipart-post (2.0.0)
|
46
|
+
nokogiri (1.6.1)
|
47
|
+
mini_portile (~> 0.5.0)
|
48
|
+
oauth2 (0.9.3)
|
49
|
+
faraday (>= 0.8, < 0.10)
|
50
|
+
jwt (~> 0.1.8)
|
51
|
+
multi_json (~> 1.3)
|
54
52
|
multi_xml (~> 0.5)
|
55
53
|
rack (~> 1.2)
|
56
54
|
rack (1.5.2)
|
57
|
-
rake (10.1.
|
58
|
-
rdoc (
|
55
|
+
rake (10.1.1)
|
56
|
+
rdoc (4.1.1)
|
59
57
|
json (~> 1.4)
|
60
|
-
rspec (2.
|
61
|
-
rspec-core (~> 2.
|
62
|
-
rspec-expectations (~> 2.
|
63
|
-
rspec-mocks (~> 2.
|
64
|
-
rspec-core (2.8
|
65
|
-
rspec-expectations (2.
|
66
|
-
diff-lcs (
|
67
|
-
rspec-mocks (2.
|
58
|
+
rspec (2.14.1)
|
59
|
+
rspec-core (~> 2.14.0)
|
60
|
+
rspec-expectations (~> 2.14.0)
|
61
|
+
rspec-mocks (~> 2.14.0)
|
62
|
+
rspec-core (2.14.8)
|
63
|
+
rspec-expectations (2.14.5)
|
64
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
65
|
+
rspec-mocks (2.14.6)
|
68
66
|
|
69
67
|
PLATFORMS
|
70
68
|
ruby
|
71
69
|
|
72
70
|
DEPENDENCIES
|
73
|
-
bundler (>= 1.0.21)
|
74
71
|
cucumber
|
75
|
-
jeweler
|
76
|
-
|
77
|
-
rdoc (~> 3.12)
|
78
|
-
rspec (~> 2.8.0)
|
72
|
+
jeweler
|
73
|
+
rspec
|
data/README.md
CHANGED
@@ -2,18 +2,24 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://secure.travis-ci.org/pjotrp/bioruby-vcf.png)](http://travis-ci.org/pjotrp/bioruby-vcf)
|
4
4
|
|
5
|
-
Yet another VCF parser. This one may give better performance
|
6
|
-
useful command line
|
5
|
+
Yet another VCF parser. This one may give better performance because
|
6
|
+
of lazy parsing and useful combinations of (fancy) command line
|
7
|
+
filtering. For example, to filter somatic data
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
|
11
|
+
```
|
7
12
|
|
8
13
|
The VCF format is commonly used for variant calling between NGS
|
9
14
|
samples. The fast parser needs to carry some state, recorded for each
|
10
15
|
file in VcfHeader, which contains the VCF file header. Individual
|
11
16
|
lines (variant calls) first go through a raw parser returning an array
|
12
|
-
of fields. Further (lazy) parsing is handled through VcfRecord.
|
17
|
+
of fields. Further (lazy) parsing is handled through VcfRecord.
|
13
18
|
|
14
|
-
|
15
|
-
|
16
|
-
code. It is
|
19
|
+
At this point the filter is pretty generic with multi-sample support.
|
20
|
+
If something is not working, check out the feature descriptions and
|
21
|
+
the source code. It is not hard to add features. Otherwise, send me a short
|
22
|
+
example of a VCF statement you need to work on.
|
17
23
|
|
18
24
|
## Installation
|
19
25
|
|
@@ -35,10 +41,17 @@ Get the version of the VCF file
|
|
35
41
|
Get the column headers
|
36
42
|
|
37
43
|
```ruby
|
38
|
-
bio-vcf -q
|
44
|
+
bio-vcf -q --eval-once 'header.column_names.join(",")' < file.vcf
|
39
45
|
CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR
|
40
46
|
```
|
41
47
|
|
48
|
+
Get the sample names
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
bio-vcf -q --eval-once 'header.samples.join(",")' < file.vcf
|
52
|
+
NORMAL,TUMOR
|
53
|
+
```
|
54
|
+
|
42
55
|
The 'fields' array contains unprocessed data (strings). Print first
|
43
56
|
five raw fields
|
44
57
|
|
@@ -59,6 +72,12 @@ object named 'rec'. Position is a value, so we can filter a range
|
|
59
72
|
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
60
73
|
```
|
61
74
|
|
75
|
+
Info fields are referenced by
|
76
|
+
|
77
|
+
```ruby
|
78
|
+
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
79
|
+
```
|
80
|
+
|
62
81
|
With subfields defined by rec.format
|
63
82
|
|
64
83
|
```ruby
|
@@ -68,19 +87,23 @@ With subfields defined by rec.format
|
|
68
87
|
Output
|
69
88
|
|
70
89
|
```ruby
|
71
|
-
bio-vcf --filter 'rec.tumor.gq>30'
|
90
|
+
bio-vcf --filter 'rec.tumor.gq>30' \
|
91
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq].join("\t")' \
|
92
|
+
< file.vcf
|
72
93
|
```
|
73
94
|
|
74
95
|
Show the count of the bases that were scored as somatic
|
75
96
|
|
76
97
|
```ruby
|
77
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount.split(",")[["A","C","G","T"].index(rec.alt)]+
|
98
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount.split(",")[["A","C","G","T"].index(rec.alt)]+
|
99
|
+
"\t"+rec.tumor.gq.to_s' < file.vcf
|
78
100
|
```
|
79
101
|
|
80
102
|
Actually, we have a convenience implementation for bcount, so this is the same
|
81
103
|
|
82
104
|
```ruby
|
83
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
105
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s' \
|
106
|
+
< file.vcf
|
84
107
|
```
|
85
108
|
|
86
109
|
Filter on the somatic results that were scored at least 4 times
|
@@ -95,6 +118,56 @@ Similar for base quality scores
|
|
95
118
|
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
96
119
|
```
|
97
120
|
|
121
|
+
If your samples have other names you can fetch genotypes for that
|
122
|
+
sample with
|
123
|
+
|
124
|
+
```sh
|
125
|
+
bio-vcf --eval "rec.sample['BIOPSY17513D'].gt" < file.vcf
|
126
|
+
```
|
127
|
+
|
128
|
+
Or read depth for another
|
129
|
+
|
130
|
+
```sh
|
131
|
+
bio-vcf --eval "rec.sample['subclone46'].dp" < file.vcf
|
132
|
+
```
|
133
|
+
|
134
|
+
Even better, you can access samples directly with
|
135
|
+
|
136
|
+
```sh
|
137
|
+
bio-vcf --eval "rec.sample.biopsy17513d.gt" < file.vcf
|
138
|
+
bio-vcf --eval "rec.sample.subclone46.dp" < file.vcf
|
139
|
+
```
|
140
|
+
|
141
|
+
For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
|
142
|
+
|
143
|
+
## API
|
144
|
+
|
145
|
+
BioVcf can also be used as an API. The following code is basically
|
146
|
+
what the command line interface uses (see ./bin/bio-vcf)
|
147
|
+
|
148
|
+
```ruby
|
149
|
+
FILE.each_line do | line |
|
150
|
+
if line =~ /^##fileformat=/
|
151
|
+
# ---- We have a new file header
|
152
|
+
header = VcfHeader.new
|
153
|
+
header.add(line)
|
154
|
+
STDIN.each_line do | headerline |
|
155
|
+
if headerline !~ /^#/
|
156
|
+
line = headerline
|
157
|
+
break # end of header
|
158
|
+
end
|
159
|
+
header.add(headerline)
|
160
|
+
end
|
161
|
+
end
|
162
|
+
# ---- Parse VCF record line
|
163
|
+
# fields = VcfLine.parse(line,header.columns)
|
164
|
+
fields = VcfLine.parse(line)
|
165
|
+
rec = VcfRecord.new(fields,header)
|
166
|
+
#
|
167
|
+
# Do something with rec
|
168
|
+
#
|
169
|
+
end
|
170
|
+
```
|
98
171
|
|
99
172
|
## Project home page
|
100
173
|
|
data/Rakefile
CHANGED
@@ -36,16 +36,16 @@ Jeweler::RubygemsDotOrgTasks.new
|
|
36
36
|
# spec.rcov = true
|
37
37
|
# end
|
38
38
|
|
39
|
-
require 'rake/testtask'
|
39
|
+
# require 'rake/testtask'
|
40
40
|
|
41
|
-
Rake::TestTask.new do |t|
|
42
|
-
|
43
|
-
end
|
41
|
+
# Rake::TestTask.new do |t|
|
42
|
+
# t.pattern = "spec/*_spec.rb"
|
43
|
+
# end
|
44
44
|
|
45
45
|
require 'cucumber/rake/task'
|
46
46
|
Cucumber::Rake::Task.new(:features)
|
47
47
|
|
48
|
-
task :default => :
|
48
|
+
task :default => :features
|
49
49
|
|
50
50
|
require 'rdoc/task'
|
51
51
|
Rake::RDocTask.new do |rdoc|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.2
|
data/bin/bio-vcf
CHANGED
@@ -25,21 +25,28 @@ require 'optparse'
|
|
25
25
|
|
26
26
|
options = { show_help: false}
|
27
27
|
opts = OptionParser.new do |o|
|
28
|
-
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)}
|
28
|
+
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
29
29
|
|
30
|
-
o.
|
30
|
+
o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
|
31
31
|
options[:filter] = cmd
|
32
32
|
end
|
33
|
-
o.
|
33
|
+
o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd|
|
34
34
|
options[:eval] = cmd
|
35
35
|
end
|
36
|
-
o.
|
36
|
+
o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
|
37
37
|
options[:eval_once] = true
|
38
38
|
options[:eval] = cmd
|
39
39
|
end
|
40
|
-
o.on("--rdf", "Generate RDF") do |b|
|
40
|
+
o.on("--rdf", "Generate Turtle RDF") do |b|
|
41
|
+
require 'bio-vcf/vcfrdf'
|
41
42
|
options[:rdf] = true
|
42
43
|
end
|
44
|
+
o.on_tail("--id name", String, "Identifier") do |s|
|
45
|
+
options[:id] = s
|
46
|
+
end
|
47
|
+
o.on_tail("--tags list", String, "Add tags") do |s|
|
48
|
+
options[:tags] = eval(s)
|
49
|
+
end
|
43
50
|
|
44
51
|
# Uncomment the following when using the bio-logger
|
45
52
|
# o.separator ""
|
@@ -77,7 +84,6 @@ begin
|
|
77
84
|
|
78
85
|
$stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
79
86
|
|
80
|
-
|
81
87
|
if options[:show_help]
|
82
88
|
print opts
|
83
89
|
print USAGE
|
@@ -87,6 +93,7 @@ begin
|
|
87
93
|
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
88
94
|
|
89
95
|
header = VcfHeader.new
|
96
|
+
header_out = false
|
90
97
|
|
91
98
|
STDIN.each_line do | line |
|
92
99
|
if line =~ /^##fileformat=/
|
@@ -110,8 +117,17 @@ begin
|
|
110
117
|
print eval(options[:eval])
|
111
118
|
exit(1) if options[:eval_once]
|
112
119
|
else
|
113
|
-
|
114
|
-
|
120
|
+
if options[:rdf]
|
121
|
+
# Output Turtle RDF
|
122
|
+
if not header_out
|
123
|
+
VcfRdf::header
|
124
|
+
header_out = true
|
125
|
+
end
|
126
|
+
VcfRdf::record(options[:id],rec,options[:tags])
|
127
|
+
else
|
128
|
+
# Default behaviour prints VCF line
|
129
|
+
print fields.join("\t")
|
130
|
+
end
|
115
131
|
end
|
116
132
|
print "\n"
|
117
133
|
end
|
data/bio-vcf.gemspec
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "bio-vcf"
|
8
|
+
s.version = "0.0.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Pjotr Prins"]
|
12
|
+
s.date = "2014-03-05"
|
13
|
+
s.description = "Smart parser for VCF format"
|
14
|
+
s.email = "pjotr.public01@thebird.nl"
|
15
|
+
s.executables = ["bio-vcf"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".travis.yml",
|
22
|
+
"Gemfile",
|
23
|
+
"Gemfile.lock",
|
24
|
+
"LICENSE.txt",
|
25
|
+
"README.md",
|
26
|
+
"Rakefile",
|
27
|
+
"VERSION",
|
28
|
+
"bin/bio-vcf",
|
29
|
+
"bio-vcf.gemspec",
|
30
|
+
"features/diff_count.feature",
|
31
|
+
"features/multisample.feature",
|
32
|
+
"features/somaticsniper.feature",
|
33
|
+
"features/step_definitions/bio-vcf_steps.rb",
|
34
|
+
"features/step_definitions/diff_count.rb",
|
35
|
+
"features/step_definitions/multisample.rb",
|
36
|
+
"features/step_definitions/somaticsniper.rb",
|
37
|
+
"features/support/env.rb",
|
38
|
+
"lib/bio-vcf.rb",
|
39
|
+
"lib/bio-vcf/variant.rb",
|
40
|
+
"lib/bio-vcf/vcf.rb",
|
41
|
+
"lib/bio-vcf/vcfgenotypefield.rb",
|
42
|
+
"lib/bio-vcf/vcfheader.rb",
|
43
|
+
"lib/bio-vcf/vcfline.rb",
|
44
|
+
"lib/bio-vcf/vcfrdf.rb",
|
45
|
+
"lib/bio-vcf/vcfrecord.rb",
|
46
|
+
"test/data/input/multisample.vcf",
|
47
|
+
"test/data/input/somaticsniper.vcf"
|
48
|
+
]
|
49
|
+
s.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
50
|
+
s.licenses = ["MIT"]
|
51
|
+
s.require_paths = ["lib"]
|
52
|
+
s.rubygems_version = "2.0.3"
|
53
|
+
s.summary = "VCF parser"
|
54
|
+
|
55
|
+
if s.respond_to? :specification_version then
|
56
|
+
s.specification_version = 4
|
57
|
+
|
58
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
59
|
+
s.add_development_dependency(%q<rspec>, [">= 0"])
|
60
|
+
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
61
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
62
|
+
else
|
63
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
64
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
65
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
66
|
+
end
|
67
|
+
else
|
68
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
69
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
70
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
@diff
|
2
|
+
|
3
|
+
Feature: Variant calling (filters) - diffing nucleotide counts
|
4
|
+
|
5
|
+
Basic filtering happens on the command line with the --filter switch. To
|
6
|
+
support somewhat more advanced features the following features are
|
7
|
+
included.
|
8
|
+
|
9
|
+
When diffing nucleotide counts we want to find out which nucleotide defines
|
10
|
+
the tumor. The difference has to be larger than 0, and the defining nucleotide
|
11
|
+
is the one with the maximum relative difference. When a threshold is set only those nucleotides are included which
|
12
|
+
pass the threshold (i.e., no more than x supporting nucleotides in the
|
13
|
+
reference).
|
14
|
+
|
15
|
+
The advantage is that filtering is possible without actually looking at
|
16
|
+
the rec.alt and rec.ref values, i.e., no assumptions are being made
|
17
|
+
about the underlying nucleotides.
|
18
|
+
|
19
|
+
Scenario: Diffing nucleotide counts
|
20
|
+
|
21
|
+
Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
|
22
|
+
When I look for the difference
|
23
|
+
Then I expect the diff to be [0,15,0,11]
|
24
|
+
And the relative diff to be [0,0.23,0,0.85]
|
25
|
+
And I expect the defining tumor nucleotide to be "T"
|
26
|
+
And I expect the tumor count to be 12
|
27
|
+
When I set an inclusion threshold for the reference
|
28
|
+
Then I expect the diff for threshold 2 to be [0,0,0,11]
|
29
|
+
And the relative diff to be [0,0,0,0.85]
|
30
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
@multi
|
2
|
+
Feature: Multi-sample VCF
|
3
|
+
|
4
|
+
Here we take a VCF line and parse the information for multiple named
|
5
|
+
samples
|
6
|
+
|
7
|
+
Scenario: When parsing a record
|
8
|
+
|
9
|
+
Given the multi sample header line
|
10
|
+
"""
|
11
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BIOPSY17513D clone10 clone3 clone4 subclone105 subclone33 subclone46
|
12
|
+
"""
|
13
|
+
When I parse the header
|
14
|
+
Given multisample vcf line
|
15
|
+
"""
|
16
|
+
1 10321 . C T 106.30 . AC=5;AF=0.357;AN=14;BaseQRankSum=3.045;DP=1537;Dels=0.01;FS=5.835;HaplotypeScore=220.1531;MLEAC=5;MLEAF=0.357;MQ=26.69;MQ0=258;MQRankSum=-4.870;QD=0.10;ReadPosRankSum=0.815 GT:AD:DP:GQ:PL 0/1:189,25:218:30:30,0,810 0/0:219,22:246:24:0,24,593 0/1:218,27:248:34:34,0,1134 0/0:220,22:248:56:0,56,1207 0/1:168,23:193:19:19,0,493 0/1:139,22:164:46:46,0,689 0/1:167,26:196:20:20,0,522
|
17
|
+
"""
|
18
|
+
When I parse the record
|
19
|
+
Then I expect rec.chrom to contain "1"
|
20
|
+
Then I expect rec.pos to contain 10321
|
21
|
+
Then I expect rec.ref to contain "C"
|
22
|
+
And I expect multisample rec.alt to contain ["T"]
|
23
|
+
And I expect rec.qual to be 106.30
|
24
|
+
And I expect rec.info.ac to be 5
|
25
|
+
And I expect rec.info.af to be 0.357
|
26
|
+
And I expect rec.info.dp to be 1537
|
27
|
+
And I expect rec.info.readposranksum to be 0.815
|
28
|
+
And I expect rec.sample['BIOPSY17513D'].gt to be "0/1"
|
29
|
+
And I expect rec.sample['BIOPSY17513D'].ad to be [189,25]
|
30
|
+
And I expect rec.sample['subclone46'].ad to be [167,26]
|
31
|
+
And I expect rec.sample['subclone46'].dp to be 196
|
32
|
+
And I expect rec.sample['subclone46'].gq to be 20
|
33
|
+
And I expect rec.sample['subclone46'].pl to be [20,0,522]
|
34
|
+
# And the nicer, self-resolving accessors (sample names as methods)
|
35
|
+
And I expect rec.sample.biopsy17513d.gt to be [0,1]
|
36
|
+
And I expect rec.sample.subclone46.pl to be [20,0,522]
|
37
|
+
|
@@ -0,0 +1,84 @@
|
|
1
|
+
@sniper
|
2
|
+
Feature: VCF for Somatic Sniper
|
3
|
+
|
4
|
+
Here we take a VCF line and parse the information given by Somatic Sniper.
|
5
|
+
|
6
|
+
At this position the reference contains: AAAGAAAAGAAAAA (12A,2G)
|
7
|
+
At this position the tumor contains: AAAAACACAA (8A,2C)
|
8
|
+
|
9
|
+
rec.alt contains variants C,G. rec.tumor.bcount reflects the contents of the
|
10
|
+
tumor (8A,2C) so rec.tumor.bcount[rec.alt] reflects the actual number of
|
11
|
+
variants in the tumor.
|
12
|
+
|
13
|
+
The mapping quality in the BAM file is 37/37 and base quality is 55/60 in normal
|
14
|
+
and tumor respectively.
|
15
|
+
|
16
|
+
For the second scenario:
|
17
|
+
|
18
|
+
At this position the reference contains: (15A)
|
19
|
+
At this position the tumor contains: AAAAAAAAAAATATTA (13A, 3T)
|
20
|
+
|
21
|
+
Scenario: When parsing a record
|
22
|
+
|
23
|
+
Given the somatic sniper vcf line
|
24
|
+
"""
|
25
|
+
1 27691244 . A C,G . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/2:0/2:14:0,12,0,2:12,0,2,0:14:35:14:14,35:37:37,37:1:. 0/1:0/1:10:0,8,0,2:8,2,0,0:18:35:18:20,51:37:37,37:2:33
|
26
|
+
"""
|
27
|
+
When I parse the record
|
28
|
+
Then I expect rec.chrom to contain "1"
|
29
|
+
Then I expect rec.pos to contain 27691244
|
30
|
+
Then I expect rec.ref to contain "A"
|
31
|
+
And I expect rec.alt to contain ["C","G"]
|
32
|
+
And I expect rec.tumor.dp to be 10
|
33
|
+
And I expect rec.tumor.dp4 to be [0,8,0,2]
|
34
|
+
And I expect rec.tumor.bcount.to_ary to be [8,2,0,0]
|
35
|
+
And I expect rec.tumor.bcount[rec.alt] to be [2,0]
|
36
|
+
And I expect rec.tumor.bcount["G"] to be 0
|
37
|
+
And I expect rec.tumor.bcount[1] to be 2
|
38
|
+
And I expect rec.tumor.bcount[3] to be 0
|
39
|
+
And I expect rec.tumor.bcount.sum to be 2
|
40
|
+
And I expect rec.tumor.bcount.max to be 2
|
41
|
+
And I expect rec.tumor.bq.to_ary to be [20,51]
|
42
|
+
And I expect rec.tumor.bq["G"] to be 51
|
43
|
+
And I expect rec.tumor.bq[1] to be 51
|
44
|
+
And I expect rec.tumor.bq.min to be 20
|
45
|
+
And I expect rec.tumor.bq.max to be 51
|
46
|
+
And I expect rec.tumor.amq.to_ary to be [37,37]
|
47
|
+
And I expect rec.tumor.mq to be 37
|
48
|
+
And I expect rec.tumor.ss to be 2
|
49
|
+
# The following are additional functions
|
50
|
+
And I expect rec.call_diff to be [-4,2,-2,0]
|
51
|
+
And I expect rec.call_nuc to be "C"
|
52
|
+
And I expect rec.call_tumor_count to be 2
|
53
|
+
And I expect rec.call_normal_count to be 0
|
54
|
+
And I expect rec.call_tumor_relative_count to be 1.0
|
55
|
+
|
56
|
+
Given the somatic sniper vcf line
|
57
|
+
"""
|
58
|
+
1 27686841 . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:15:3,12,0,0:15,0,0,0:66:37:0:25:37:37:0:. 0/1:0/1:16:2,11,0,3:13,0,0,3:30:37:30:34,55:37:37,37:2:37
|
59
|
+
"""
|
60
|
+
When I parse the record
|
61
|
+
Then I expect rec.chrom to contain "1"
|
62
|
+
Then I expect rec.pos to contain 27686841
|
63
|
+
Then I expect rec.ref to contain "A"
|
64
|
+
And I expect rec.alt to contain one ["T"]
|
65
|
+
And I expect rec.tumor.dp to be 16
|
66
|
+
And I expect rec.tumor.dp4 to be [2,11,0,3]
|
67
|
+
And I expect rec.tumor.bcount.to_ary to be [13,0,0,3]
|
68
|
+
And I expect rec.tumor.bcount[rec.alt] to be one [3]
|
69
|
+
And I expect rec.tumor.bcount["G"] to be 0
|
70
|
+
And I expect rec.tumor.bcount["T"] to be 3
|
71
|
+
And I expect rec.tumor.bcount[1] to be 0
|
72
|
+
And I expect rec.tumor.bcount[3] to be 3
|
73
|
+
And I expect rec.tumor.bcount.sum to be 3
|
74
|
+
And I expect rec.tumor.bcount.max to be 3
|
75
|
+
And I expect rec.tumor.bq.to_ary to be [34,55]
|
76
|
+
And I expect rec.tumor.bq["T"] to be 34
|
77
|
+
And I expect rec.tumor.bq[1] to be 55
|
78
|
+
And I expect rec.tumor.bq.min to be 34
|
79
|
+
And I expect rec.tumor.bq.max to be 55
|
80
|
+
And I expect rec.tumor.amq.to_ary to be [37,37]
|
81
|
+
And I expect rec.tumor.mq to be 37
|
82
|
+
And I expect rec.tumor.ss to be 2
|
83
|
+
|
84
|
+
|