bio-vcf 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +12 -0
- data/Gemfile +6 -6
- data/Gemfile.lock +49 -54
- data/README.md +83 -10
- data/Rakefile +5 -5
- data/VERSION +1 -1
- data/bin/bio-vcf +24 -8
- data/bio-vcf.gemspec +73 -0
- data/features/diff_count.feature +30 -0
- data/features/multisample.feature +37 -0
- data/features/somaticsniper.feature +84 -0
- data/features/step_definitions/diff_count.rb +41 -0
- data/features/step_definitions/multisample.rb +73 -0
- data/features/step_definitions/somaticsniper.rb +122 -0
- data/features/support/env.rb +4 -0
- data/lib/bio-vcf/variant.rb +38 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +118 -10
- data/lib/bio-vcf/vcfheader.rb +5 -0
- data/lib/bio-vcf/vcfrdf.rb +30 -0
- data/lib/bio-vcf/vcfrecord.rb +68 -5
- data/lib/bio-vcf.rb +1 -0
- data/test/data/input/multisample.vcf +150 -0
- metadata +28 -76
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 014f3f4adef8a533501e115027fc8a920487f949
|
4
|
+
data.tar.gz: 89e442176c21a0893c267dc324db403fd57e7577
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 624f6cd5251384da85b824cd1d98e49be2efcb765d34b902b225ba8e8dc7ce7b6fa68d9a47b8bea5f7eb60b33e21182c7d941d7b0da70ce58228354021c596a0
|
7
|
+
data.tar.gz: 7ffe437d72368db97987fe53c8aa93bebdcff68002975f86e02cf45ec69b48f76e060e76140e27242122b3c533021eae4073a19c0c0a09089edac62b02394682
|
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.3
|
4
|
+
- 2.1.0
|
5
|
+
- jruby-head
|
6
|
+
# - jruby-19mode # JRuby in 1.9 mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
# script: bundle exec rspec spec
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ source "http://rubygems.org"
|
|
6
6
|
# Add dependencies to develop your gem here.
|
7
7
|
# Include everything needed to run rake, tests, features, etc.
|
8
8
|
group :development do
|
9
|
-
gem "minitest"
|
10
|
-
gem "rspec"
|
11
|
-
gem "cucumber"
|
12
|
-
gem "jeweler", "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
-
gem "bundler", ">= 1.0.21"
|
9
|
+
# gem "minitest"
|
10
|
+
gem "rspec"
|
11
|
+
gem "cucumber"
|
12
|
+
gem "jeweler" # , "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
|
13
|
+
# gem "bundler", ">= 1.0.21"
|
14
14
|
# gem "bio", ">= 1.4.2"
|
15
|
-
gem "rdoc", "~> 3.12"
|
15
|
+
# gem "rdoc", "~> 3.12"
|
16
16
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,78 +1,73 @@
|
|
1
|
-
GIT
|
2
|
-
remote: https://github.com/technicalpickles/jeweler.git
|
3
|
-
revision: f7e0a55a207d83f56637dd8fbabf26a803410faf
|
4
|
-
specs:
|
5
|
-
jeweler (1.8.7)
|
6
|
-
builder
|
7
|
-
bundler (~> 1.0)
|
8
|
-
git (>= 1.2.5)
|
9
|
-
github_api (= 0.10.1)
|
10
|
-
highline (>= 1.6.15)
|
11
|
-
nokogiri (= 1.5.10)
|
12
|
-
rake
|
13
|
-
rdoc
|
14
|
-
|
15
1
|
GEM
|
16
2
|
remote: http://rubygems.org/
|
17
3
|
specs:
|
18
4
|
addressable (2.3.5)
|
19
5
|
builder (3.2.2)
|
20
|
-
cucumber (1.3.
|
6
|
+
cucumber (1.3.11)
|
21
7
|
builder (>= 2.1.2)
|
22
8
|
diff-lcs (>= 1.1.3)
|
23
|
-
gherkin (~> 2.12
|
24
|
-
multi_json (
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
9
|
+
gherkin (~> 2.12)
|
10
|
+
multi_json (>= 1.7.5, < 2.0)
|
11
|
+
multi_test (>= 0.0.2)
|
12
|
+
descendants_tracker (0.0.3)
|
13
|
+
diff-lcs (1.2.5)
|
14
|
+
faraday (0.9.0)
|
15
|
+
multipart-post (>= 1.2, < 3)
|
16
|
+
gherkin (2.12.2)
|
29
17
|
multi_json (~> 1.3)
|
30
18
|
git (1.2.6)
|
31
|
-
github_api (0.
|
32
|
-
addressable
|
33
|
-
|
19
|
+
github_api (0.11.3)
|
20
|
+
addressable (~> 2.3)
|
21
|
+
descendants_tracker (~> 0.0.1)
|
22
|
+
faraday (~> 0.8, < 0.10)
|
34
23
|
hashie (>= 1.2)
|
35
|
-
multi_json (
|
36
|
-
nokogiri (~> 1.
|
24
|
+
multi_json (>= 1.7.5, < 2.0)
|
25
|
+
nokogiri (~> 1.6.0)
|
37
26
|
oauth2
|
38
27
|
hashie (2.0.5)
|
39
|
-
highline (1.6.
|
40
|
-
|
41
|
-
|
42
|
-
|
28
|
+
highline (1.6.21)
|
29
|
+
jeweler (2.0.1)
|
30
|
+
builder
|
31
|
+
bundler (>= 1.0)
|
32
|
+
git (>= 1.2.5)
|
33
|
+
github_api
|
34
|
+
highline (>= 1.6.15)
|
35
|
+
nokogiri (>= 1.5.10)
|
36
|
+
rake
|
37
|
+
rdoc
|
38
|
+
json (1.8.1)
|
39
|
+
jwt (0.1.11)
|
43
40
|
multi_json (>= 1.5)
|
44
|
-
|
45
|
-
multi_json (1.
|
41
|
+
mini_portile (0.5.2)
|
42
|
+
multi_json (1.9.0)
|
43
|
+
multi_test (0.0.3)
|
46
44
|
multi_xml (0.5.5)
|
47
|
-
multipart-post (
|
48
|
-
nokogiri (1.
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
jwt (~> 0.1.
|
53
|
-
multi_json (~> 1.
|
45
|
+
multipart-post (2.0.0)
|
46
|
+
nokogiri (1.6.1)
|
47
|
+
mini_portile (~> 0.5.0)
|
48
|
+
oauth2 (0.9.3)
|
49
|
+
faraday (>= 0.8, < 0.10)
|
50
|
+
jwt (~> 0.1.8)
|
51
|
+
multi_json (~> 1.3)
|
54
52
|
multi_xml (~> 0.5)
|
55
53
|
rack (~> 1.2)
|
56
54
|
rack (1.5.2)
|
57
|
-
rake (10.1.
|
58
|
-
rdoc (
|
55
|
+
rake (10.1.1)
|
56
|
+
rdoc (4.1.1)
|
59
57
|
json (~> 1.4)
|
60
|
-
rspec (2.
|
61
|
-
rspec-core (~> 2.
|
62
|
-
rspec-expectations (~> 2.
|
63
|
-
rspec-mocks (~> 2.
|
64
|
-
rspec-core (2.8
|
65
|
-
rspec-expectations (2.
|
66
|
-
diff-lcs (
|
67
|
-
rspec-mocks (2.
|
58
|
+
rspec (2.14.1)
|
59
|
+
rspec-core (~> 2.14.0)
|
60
|
+
rspec-expectations (~> 2.14.0)
|
61
|
+
rspec-mocks (~> 2.14.0)
|
62
|
+
rspec-core (2.14.8)
|
63
|
+
rspec-expectations (2.14.5)
|
64
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
65
|
+
rspec-mocks (2.14.6)
|
68
66
|
|
69
67
|
PLATFORMS
|
70
68
|
ruby
|
71
69
|
|
72
70
|
DEPENDENCIES
|
73
|
-
bundler (>= 1.0.21)
|
74
71
|
cucumber
|
75
|
-
jeweler
|
76
|
-
|
77
|
-
rdoc (~> 3.12)
|
78
|
-
rspec (~> 2.8.0)
|
72
|
+
jeweler
|
73
|
+
rspec
|
data/README.md
CHANGED
@@ -2,18 +2,24 @@
|
|
2
2
|
|
3
3
|
[](http://travis-ci.org/pjotrp/bioruby-vcf)
|
4
4
|
|
5
|
-
Yet another VCF parser. This one may give better performance
|
6
|
-
useful command line
|
5
|
+
Yet another VCF parser. This one may give better performance because
|
6
|
+
of lazy parsing and useful combinations of (fancy) command line
|
7
|
+
filtering. For example, to filter somatic data
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
|
11
|
+
```
|
7
12
|
|
8
13
|
The VCF format is commonly used for variant calling between NGS
|
9
14
|
samples. The fast parser needs to carry some state, recorded for each
|
10
15
|
file in VcfHeader, which contains the VCF file header. Individual
|
11
16
|
lines (variant calls) first go through a raw parser returning an array
|
12
|
-
of fields. Further (lazy) parsing is handled through VcfRecord.
|
17
|
+
of fields. Further (lazy) parsing is handled through VcfRecord.
|
13
18
|
|
14
|
-
|
15
|
-
|
16
|
-
code. It is
|
19
|
+
At this point the filter is pretty generic with multi-sample support.
|
20
|
+
If something is not working, check out the feature descriptions and
|
21
|
+
the source code. It is not hard to add features. Otherwise, send me a short
|
22
|
+
example of a VCF statement you need to work on.
|
17
23
|
|
18
24
|
## Installation
|
19
25
|
|
@@ -35,10 +41,17 @@ Get the version of the VCF file
|
|
35
41
|
Get the column headers
|
36
42
|
|
37
43
|
```ruby
|
38
|
-
bio-vcf -q
|
44
|
+
bio-vcf -q --eval-once 'header.column_names.join(",")' < file.vcf
|
39
45
|
CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR
|
40
46
|
```
|
41
47
|
|
48
|
+
Get the sample names
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
bio-vcf -q --eval-once 'header.samples.join(",")' < file.vcf
|
52
|
+
NORMAL,TUMOR
|
53
|
+
```
|
54
|
+
|
42
55
|
The 'fields' array contains unprocessed data (strings). Print first
|
43
56
|
five raw fields
|
44
57
|
|
@@ -59,6 +72,12 @@ object named 'rec'. Position is a value, so we can filter a range
|
|
59
72
|
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
60
73
|
```
|
61
74
|
|
75
|
+
Info fields are referenced by
|
76
|
+
|
77
|
+
```ruby
|
78
|
+
bio-vcf --filter 'rec.info.dp>100 and rec.info.readposranksum<=0.815' < file.vcf
|
79
|
+
```
|
80
|
+
|
62
81
|
With subfields defined by rec.format
|
63
82
|
|
64
83
|
```ruby
|
@@ -68,19 +87,23 @@ With subfields defined by rec.format
|
|
68
87
|
Output
|
69
88
|
|
70
89
|
```ruby
|
71
|
-
bio-vcf --filter 'rec.tumor.gq>30'
|
90
|
+
bio-vcf --filter 'rec.tumor.gq>30'
|
91
|
+
--eval '[rec.ref,rec.alt,rec.tumor.bcount,rec.tumor.gq,rec.normal.gq].join("\t")'
|
92
|
+
< file.vcf
|
72
93
|
```
|
73
94
|
|
74
95
|
Show the count of the bases that were scored as somatic
|
75
96
|
|
76
97
|
```ruby
|
77
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount.split(",")[["A","C","G","T"].index(rec.alt)]+
|
98
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount.split(",")[["A","C","G","T"].index(rec.alt)]+
|
99
|
+
"\t"+rec.tumor.gq.to_s' < file.vcf
|
78
100
|
```
|
79
101
|
|
80
102
|
Actually, we have a convenience implementation for bcount, so this is the same
|
81
103
|
|
82
104
|
```ruby
|
83
|
-
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
105
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.tumor.bcount[rec.alt].to_s+"\t"+rec.tumor.gq.to_s'
|
106
|
+
< file.vcf
|
84
107
|
```
|
85
108
|
|
86
109
|
Filter on the somatic results that were scored at least 4 times
|
@@ -95,6 +118,56 @@ Similar for base quality scores
|
|
95
118
|
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
96
119
|
```
|
97
120
|
|
121
|
+
If your samples have other names you can fetch genotypes for that
|
122
|
+
sample with
|
123
|
+
|
124
|
+
```sh
|
125
|
+
bio-vcf --eval "rec.sample['BIOPSY17513D'].gt" < file.vcf
|
126
|
+
```
|
127
|
+
|
128
|
+
Or read depth for another
|
129
|
+
|
130
|
+
```sh
|
131
|
+
bio-vcf --eval "rec.sample['subclone46'].dp" < file.vcf
|
132
|
+
```
|
133
|
+
|
134
|
+
Even better, you can access samples directly with
|
135
|
+
|
136
|
+
```sh
|
137
|
+
bio-vcf --eval "rec.sample.biopsy17513d.gt" < file.vcf
|
138
|
+
bio-vcf --eval "rec.sample.subclone46.dp" < file.vcf
|
139
|
+
```
|
140
|
+
|
141
|
+
For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
|
142
|
+
|
143
|
+
## API
|
144
|
+
|
145
|
+
BioVcf can also be used as an API. The following code is basically
|
146
|
+
what the command line interface uses (see ./bin/bio-vcf)
|
147
|
+
|
148
|
+
```ruby
|
149
|
+
FILE.each_line do | line |
|
150
|
+
if line =~ /^##fileformat=/
|
151
|
+
# ---- We have a new file header
|
152
|
+
header = VcfHeader.new
|
153
|
+
header.add(line)
|
154
|
+
STDIN.each_line do | headerline |
|
155
|
+
if headerline !~ /^#/
|
156
|
+
line = headerline
|
157
|
+
break # end of header
|
158
|
+
end
|
159
|
+
header.add(headerline)
|
160
|
+
end
|
161
|
+
end
|
162
|
+
# ---- Parse VCF record line
|
163
|
+
# fields = VcfLine.parse(line,header.columns)
|
164
|
+
fields = VcfLine.parse(line)
|
165
|
+
rec = VcfRecord.new(fields,header)
|
166
|
+
#
|
167
|
+
# Do something with rec
|
168
|
+
#
|
169
|
+
end
|
170
|
+
```
|
98
171
|
|
99
172
|
## Project home page
|
100
173
|
|
data/Rakefile
CHANGED
@@ -36,16 +36,16 @@ Jeweler::RubygemsDotOrgTasks.new
|
|
36
36
|
# spec.rcov = true
|
37
37
|
# end
|
38
38
|
|
39
|
-
require 'rake/testtask'
|
39
|
+
# require 'rake/testtask'
|
40
40
|
|
41
|
-
Rake::TestTask.new do |t|
|
42
|
-
|
43
|
-
end
|
41
|
+
# Rake::TestTask.new do |t|
|
42
|
+
# t.pattern = "spec/*_spec.rb"
|
43
|
+
# end
|
44
44
|
|
45
45
|
require 'cucumber/rake/task'
|
46
46
|
Cucumber::Rake::Task.new(:features)
|
47
47
|
|
48
|
-
task :default => :
|
48
|
+
task :default => :features
|
49
49
|
|
50
50
|
require 'rdoc/task'
|
51
51
|
Rake::RDocTask.new do |rdoc|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.2
|
data/bin/bio-vcf
CHANGED
@@ -25,21 +25,28 @@ require 'optparse'
|
|
25
25
|
|
26
26
|
options = { show_help: false}
|
27
27
|
opts = OptionParser.new do |o|
|
28
|
-
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)}
|
28
|
+
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
29
29
|
|
30
|
-
o.
|
30
|
+
o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
|
31
31
|
options[:filter] = cmd
|
32
32
|
end
|
33
|
-
o.
|
33
|
+
o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd|
|
34
34
|
options[:eval] = cmd
|
35
35
|
end
|
36
|
-
o.
|
36
|
+
o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
|
37
37
|
options[:eval_once] = true
|
38
38
|
options[:eval] = cmd
|
39
39
|
end
|
40
|
-
o.on("--rdf", "Generate RDF") do |b|
|
40
|
+
o.on("--rdf", "Generate Turtle RDF") do |b|
|
41
|
+
require 'bio-vcf/vcfrdf'
|
41
42
|
options[:rdf] = true
|
42
43
|
end
|
44
|
+
o.on_tail("--id name", String, "Identifier") do |s|
|
45
|
+
options[:id] = s
|
46
|
+
end
|
47
|
+
o.on_tail("--tags list", String, "Add tags") do |s|
|
48
|
+
options[:tags] = eval(s)
|
49
|
+
end
|
43
50
|
|
44
51
|
# Uncomment the following when using the bio-logger
|
45
52
|
# o.separator ""
|
@@ -77,7 +84,6 @@ begin
|
|
77
84
|
|
78
85
|
$stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
79
86
|
|
80
|
-
|
81
87
|
if options[:show_help]
|
82
88
|
print opts
|
83
89
|
print USAGE
|
@@ -87,6 +93,7 @@ begin
|
|
87
93
|
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
88
94
|
|
89
95
|
header = VcfHeader.new
|
96
|
+
header_out = false
|
90
97
|
|
91
98
|
STDIN.each_line do | line |
|
92
99
|
if line =~ /^##fileformat=/
|
@@ -110,8 +117,17 @@ begin
|
|
110
117
|
print eval(options[:eval])
|
111
118
|
exit(1) if options[:eval_once]
|
112
119
|
else
|
113
|
-
|
114
|
-
|
120
|
+
if options[:rdf]
|
121
|
+
# Output Turtle RDF
|
122
|
+
if not header_out
|
123
|
+
VcfRdf::header
|
124
|
+
header_out = true
|
125
|
+
end
|
126
|
+
VcfRdf::record(options[:id],rec,options[:tags])
|
127
|
+
else
|
128
|
+
# Default behaviour prints VCF line
|
129
|
+
print fields.join("\t")
|
130
|
+
end
|
115
131
|
end
|
116
132
|
print "\n"
|
117
133
|
end
|
data/bio-vcf.gemspec
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "bio-vcf"
|
8
|
+
s.version = "0.0.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Pjotr Prins"]
|
12
|
+
s.date = "2014-03-05"
|
13
|
+
s.description = "Smart parser for VCF format"
|
14
|
+
s.email = "pjotr.public01@thebird.nl"
|
15
|
+
s.executables = ["bio-vcf"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".travis.yml",
|
22
|
+
"Gemfile",
|
23
|
+
"Gemfile.lock",
|
24
|
+
"LICENSE.txt",
|
25
|
+
"README.md",
|
26
|
+
"Rakefile",
|
27
|
+
"VERSION",
|
28
|
+
"bin/bio-vcf",
|
29
|
+
"bio-vcf.gemspec",
|
30
|
+
"features/diff_count.feature",
|
31
|
+
"features/multisample.feature",
|
32
|
+
"features/somaticsniper.feature",
|
33
|
+
"features/step_definitions/bio-vcf_steps.rb",
|
34
|
+
"features/step_definitions/diff_count.rb",
|
35
|
+
"features/step_definitions/multisample.rb",
|
36
|
+
"features/step_definitions/somaticsniper.rb",
|
37
|
+
"features/support/env.rb",
|
38
|
+
"lib/bio-vcf.rb",
|
39
|
+
"lib/bio-vcf/variant.rb",
|
40
|
+
"lib/bio-vcf/vcf.rb",
|
41
|
+
"lib/bio-vcf/vcfgenotypefield.rb",
|
42
|
+
"lib/bio-vcf/vcfheader.rb",
|
43
|
+
"lib/bio-vcf/vcfline.rb",
|
44
|
+
"lib/bio-vcf/vcfrdf.rb",
|
45
|
+
"lib/bio-vcf/vcfrecord.rb",
|
46
|
+
"test/data/input/multisample.vcf",
|
47
|
+
"test/data/input/somaticsniper.vcf"
|
48
|
+
]
|
49
|
+
s.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
50
|
+
s.licenses = ["MIT"]
|
51
|
+
s.require_paths = ["lib"]
|
52
|
+
s.rubygems_version = "2.0.3"
|
53
|
+
s.summary = "VCF parser"
|
54
|
+
|
55
|
+
if s.respond_to? :specification_version then
|
56
|
+
s.specification_version = 4
|
57
|
+
|
58
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
59
|
+
s.add_development_dependency(%q<rspec>, [">= 0"])
|
60
|
+
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
61
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
62
|
+
else
|
63
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
64
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
65
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
66
|
+
end
|
67
|
+
else
|
68
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
69
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
70
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
@diff
|
2
|
+
|
3
|
+
Feature: Variant calling (filters) - diffing nucleotide counts
|
4
|
+
|
5
|
+
Basic filtering happens on the command line with the --filter switch. To
|
6
|
+
support somewhat more advanced features the following features are
|
7
|
+
included.
|
8
|
+
|
9
|
+
When diffing nucleotide counts we want to find out which nucleotide defines
|
10
|
+
the tumor. The difference has to be larger than 0 and the relative difference
|
11
|
+
is the max. When a threshold is set only those nucleotides are included which
|
12
|
+
pass the threshold (i.e., no more than x supporting nucleotides in the
|
13
|
+
reference).
|
14
|
+
|
15
|
+
The advantage is that filtering is possible without actually looking at
|
16
|
+
the rec.alt and rec.ref values, i.e., no assumptions are being made
|
17
|
+
about the underlying nucleotides.
|
18
|
+
|
19
|
+
Scenario: Diffing nucleotide counts
|
20
|
+
|
21
|
+
Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
|
22
|
+
When I look for the difference
|
23
|
+
Then I expect the diff to be [0,15,0,11]
|
24
|
+
And the relative diff to be [0,0.23,0,0.85]
|
25
|
+
And I expect the defining tumor nucleotide to be "T"
|
26
|
+
And I expect the tumor count to be 12
|
27
|
+
When I set an inclusion threshold for the reference
|
28
|
+
Then I expect the diff for threshold 2 to be [0,0,0,11]
|
29
|
+
And the relative diff to be [0,0,0,0.85]
|
30
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
@multi
|
2
|
+
Feature: Multi-sample VCF
|
3
|
+
|
4
|
+
Here we take a VCF line and parse the information for multiple named
|
5
|
+
samples
|
6
|
+
|
7
|
+
Scenario: When parsing a record
|
8
|
+
|
9
|
+
Given the multi sample header line
|
10
|
+
"""
|
11
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BIOPSY17513D clone10 clone3 clone4 subclone105 subclone33 subclone46
|
12
|
+
"""
|
13
|
+
When I parse the header
|
14
|
+
Given multisample vcf line
|
15
|
+
"""
|
16
|
+
1 10321 . C T 106.30 . AC=5;AF=0.357;AN=14;BaseQRankSum=3.045;DP=1537;Dels=0.01;FS=5.835;HaplotypeScore=220.1531;MLEAC=5;MLEAF=0.357;MQ=26.69;MQ0=258;MQRankSum=-4.870;QD=0.10;ReadPosRankSum=0.815 GT:AD:DP:GQ:PL 0/1:189,25:218:30:30,0,810 0/0:219,22:246:24:0,24,593 0/1:218,27:248:34:34,0,1134 0/0:220,22:248:56:0,56,1207 0/1:168,23:193:19:19,0,493 0/1:139,22:164:46:46,0,689 0/1:167,26:196:20:20,0,522
|
17
|
+
"""
|
18
|
+
When I parse the record
|
19
|
+
Then I expect rec.chrom to contain "1"
|
20
|
+
Then I expect rec.pos to contain 10321
|
21
|
+
Then I expect rec.ref to contain "C"
|
22
|
+
And I expect multisample rec.alt to contain ["T"]
|
23
|
+
And I expect rec.qual to be 106.30
|
24
|
+
And I expect rec.info.ac to be 5
|
25
|
+
And I expect rec.info.af to be 0.357
|
26
|
+
And I expect rec.info.dp to be 1537
|
27
|
+
And I expect rec.info.readposranksum to be 0.815
|
28
|
+
And I expect rec.sample['BIOPSY17513D'].gt to be "0/1"
|
29
|
+
And I expect rec.sample['BIOPSY17513D'].ad to be [189,25]
|
30
|
+
And I expect rec.sample['subclone46'].ad to be [167,26]
|
31
|
+
And I expect rec.sample['subclone46'].dp to be 196
|
32
|
+
And I expect rec.sample['subclone46'].gq to be 20
|
33
|
+
And I expect rec.sample['subclone46'].pl to be [20,0,522]
|
34
|
+
# And the nicer self resolving
|
35
|
+
And I expect rec.sample.biopsy17513d.gt to be [0,1]
|
36
|
+
And I expect rec.sample.subclone46.pl to be [20,0,522]
|
37
|
+
|
@@ -0,0 +1,84 @@
|
|
1
|
+
@sniper
|
2
|
+
Feature: VCF for Somatic Sniper
|
3
|
+
|
4
|
+
Here we take a VCF line and parse the information given by Somatic Sniper.
|
5
|
+
|
6
|
+
At this position the reference contains: AAAGAAAAGAAAAA (12A,2G)
|
7
|
+
At this position the tumor contains: AAAAACACAA (8A,2C)
|
8
|
+
|
9
|
+
rec.alt contains variants C,G. rec.tumor.bcount reflects the contents of the
|
10
|
+
tumor (8A,2C) so rec.tumor.bcount[rec.alt] reflects the actual number of
|
11
|
+
variants in the tumor.
|
12
|
+
|
13
|
+
The mapping quality in the BAM file is 37/37 and base quality is 55/60 in normal
|
14
|
+
and tumor respectively.
|
15
|
+
|
16
|
+
For the second scenario:
|
17
|
+
|
18
|
+
At this position the reference contains: (15A)
|
19
|
+
At this position the tumor contains: AAAAAAAAAAATATTA (13A, 3T)
|
20
|
+
|
21
|
+
Scenario: When parsing a record
|
22
|
+
|
23
|
+
Given the somatic sniper vcf line
|
24
|
+
"""
|
25
|
+
1 27691244 . A C,G . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/2:0/2:14:0,12,0,2:12,0,2,0:14:35:14:14,35:37:37,37:1:. 0/1:0/1:10:0,8,0,2:8,2,0,0:18:35:18:20,51:37:37,37:2:33
|
26
|
+
"""
|
27
|
+
When I parse the record
|
28
|
+
Then I expect rec.chrom to contain "1"
|
29
|
+
Then I expect rec.pos to contain 27691244
|
30
|
+
Then I expect rec.ref to contain "A"
|
31
|
+
And I expect rec.alt to contain ["C","G"]
|
32
|
+
And I expect rec.tumor.dp to be 10
|
33
|
+
And I expect rec.tumor.dp4 to be [0,8,0,2]
|
34
|
+
And I expect rec.tumor.bcount.to_ary to be [8,2,0,0]
|
35
|
+
And I expect rec.tumor.bcount[rec.alt] to be [2,0]
|
36
|
+
And I expect rec.tumor.bcount["G"] to be 0
|
37
|
+
And I expect rec.tumor.bcount[1] to be 2
|
38
|
+
And I expect rec.tumor.bcount[3] to be 0
|
39
|
+
And I expect rec.tumor.bcount.sum to be 2
|
40
|
+
And I expect rec.tumor.bcount.max to be 2
|
41
|
+
And I expect rec.tumor.bq.to_ary to be [20,51]
|
42
|
+
And I expect rec.tumor.bq["G"] to be 51
|
43
|
+
And I expect rec.tumor.bq[1] to be 51
|
44
|
+
And I expect rec.tumor.bq.min to be 20
|
45
|
+
And I expect rec.tumor.bq.max to be 51
|
46
|
+
And I expect rec.tumor.amq.to_ary to be [37,37]
|
47
|
+
And I expect rec.tumor.mq to be 37
|
48
|
+
And I expect rec.tumor.ss to be 2
|
49
|
+
# The following are additional functions
|
50
|
+
And I expect rec.call_diff to be [-4,2,-2,0]
|
51
|
+
And I expect rec.call_nuc to be "C"
|
52
|
+
And I expect rec.call_tumor_count to be 2
|
53
|
+
And I expect rec.call_normal_count to be 0
|
54
|
+
And I expect rec.call_tumor_relative_count to be 1.0
|
55
|
+
|
56
|
+
Given the somatic sniper vcf line
|
57
|
+
"""
|
58
|
+
1 27686841 . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:15:3,12,0,0:15,0,0,0:66:37:0:25:37:37:0:. 0/1:0/1:16:2,11,0,3:13,0,0,3:30:37:30:34,55:37:37,37:2:37
|
59
|
+
"""
|
60
|
+
When I parse the record
|
61
|
+
Then I expect rec.chrom to contain "1"
|
62
|
+
Then I expect rec.pos to contain 27686841
|
63
|
+
Then I expect rec.ref to contain "A"
|
64
|
+
And I expect rec.alt to contain one ["T"]
|
65
|
+
And I expect rec.tumor.dp to be 16
|
66
|
+
And I expect rec.tumor.dp4 to be [2,11,0,3]
|
67
|
+
And I expect rec.tumor.bcount.to_ary to be [13,0,0,3]
|
68
|
+
And I expect rec.tumor.bcount[rec.alt] to be one [3]
|
69
|
+
And I expect rec.tumor.bcount["G"] to be 0
|
70
|
+
And I expect rec.tumor.bcount["T"] to be 3
|
71
|
+
And I expect rec.tumor.bcount[1] to be 0
|
72
|
+
And I expect rec.tumor.bcount[3] to be 3
|
73
|
+
And I expect rec.tumor.bcount.sum to be 3
|
74
|
+
And I expect rec.tumor.bcount.max to be 3
|
75
|
+
And I expect rec.tumor.bq.to_ary to be [34,55]
|
76
|
+
And I expect rec.tumor.bq["T"] to be 34
|
77
|
+
And I expect rec.tumor.bq[1] to be 55
|
78
|
+
And I expect rec.tumor.bq.min to be 34
|
79
|
+
And I expect rec.tumor.bq.max to be 55
|
80
|
+
And I expect rec.tumor.amq.to_ary to be [37,37]
|
81
|
+
And I expect rec.tumor.mq to be 37
|
82
|
+
And I expect rec.tumor.ss to be 2
|
83
|
+
|
84
|
+
|