bio-vcf 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +12 -0
- data/Gemfile +6 -6
- data/Gemfile.lock +49 -54
- data/README.md +83 -10
- data/Rakefile +5 -5
- data/VERSION +1 -1
- data/bin/bio-vcf +24 -8
- data/bio-vcf.gemspec +73 -0
- data/features/diff_count.feature +30 -0
- data/features/multisample.feature +37 -0
- data/features/somaticsniper.feature +84 -0
- data/features/step_definitions/diff_count.rb +41 -0
- data/features/step_definitions/multisample.rb +73 -0
- data/features/step_definitions/somaticsniper.rb +122 -0
- data/features/support/env.rb +4 -0
- data/lib/bio-vcf/variant.rb +38 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +118 -10
- data/lib/bio-vcf/vcfheader.rb +5 -0
- data/lib/bio-vcf/vcfrdf.rb +30 -0
- data/lib/bio-vcf/vcfrecord.rb +68 -5
- data/lib/bio-vcf.rb +1 -0
- data/test/data/input/multisample.vcf +150 -0
- metadata +28 -76
@@ -0,0 +1,41 @@
|
|
1
|
+
|
2
|
+
# Step definitions for the diff-count feature. Each count list has four
# entries which are reported in A, C, G, T order (see the nucleotide step).

Given(/^normal and tumor counts \[(\d+),(\d+),(\d+),(\d+)\] and \[(\d+),(\d+),(\d+),(\d+)\]$/) do |n1, n2, n3, n4, t1, t2, t3, t4|
  @normal = [n1, n2, n3, n4].map(&:to_i)
  @tumor  = [t1, t2, t3, t4].map(&:to_i)
end

# Narrative-only step: nothing to set up.
When(/^I look for the difference$/) do
end

Then(/^I expect the diff to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |d1, d2, d3, d4|
  expected = [d1, d2, d3, d4].map(&:to_i)
  expect(Variant.diff(@normal, @tumor)).to eq expected
end

Then(/^the relative diff to be \[(\d+),(\d+)\.(\d+),(\d+),(\d+)\.(\d+)\]$/) do |first, second_int, second_frac, third, fourth_int, fourth_frac|
  # The decimal values arrive as separate integer/fraction captures.
  second = "#{second_int}.#{second_frac}".to_f
  fourth = "#{fourth_int}.#{fourth_frac}".to_f
  res = [first.to_f, second, third.to_i, fourth]
  expect(Variant.relative_diff(@normal, @tumor)).to eq res
end

Then(/^I expect the defining tumor nucleotide to be "(.*?)"$/) do |nucleotide|
  position = Variant.index(@normal, @tumor)
  expect(['A','C','G','T'][position]).to eq nucleotide
end

Then(/^I expect the tumor count to be (\d+)$/) do |count|
  position = Variant.index(@normal, @tumor)
  expect(@tumor[position]).to eq count.to_i
end

# Narrative-only step: the threshold value is supplied by the next step.
When(/^I set an inclusion threshold for the reference$/) do
end

Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |threshold, d1, d2, d3, d4|
  # @t is kept for the relative threshold step that follows.
  @t = threshold.to_i
  @t_diff = Variant.threshold_diff(@t, @normal, @tumor)
  expect(@t_diff).to eq [d1, d2, d3, d4].map(&:to_i)
end

Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |first, second, third, fourth_int, fourth_frac|
  fourth = "#{fourth_int}.#{fourth_frac}".to_f
  res = [first.to_f, second.to_i, third.to_i, fourth]
  expect(Variant.relative_threshold_diff(@t, @normal, @tumor)).to eq res
end
|
40
|
+
|
41
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Step definitions for the multi-sample feature.
#
# Expected values are taken from the captures in the step text instead of
# being hard-coded, so that changing a value in the .feature file actually
# changes what is asserted.

Given(/^the multi sample header line$/) do |string|
  @header = VcfHeader.new
  @header.add(string)
end

# The header expectations have no captures; they describe the fixture header.
When(/^I parse the header$/) do
  expect(@header.column_names.size).to eq 16
  expect(@header.samples.size).to eq 7
  expect(@header.samples).to eq ["BIOPSY17513D", "clone10", "clone3", "clone4", "subclone105", "subclone33", "subclone46"]
end

Given(/^multisample vcf line$/) do |string|
  # Normalise any run of whitespace to a single tab before parsing.
  @fields = VcfLine.parse(string.split(/\s+/).join("\t"))
  @rec1 = VcfRecord.new(@fields,@header)
end

Then(/^I expect multisample rec\.alt to contain \["(.*?)"\]$/) do |arg1|
  expect(@rec1.alt).to eq [arg1]
end

Then(/^I expect rec\.qual to be (\d+)\.(\d+)$/) do |arg1, arg2|
  expect(@rec1.qual).to eq "#{arg1}.#{arg2}".to_f
end

Then(/^I expect rec\.info\.ac to be (\d+)$/) do |arg1|
  expect(@rec1.info.ac).to eq arg1.to_i
end

Then(/^I expect rec\.info\.af to be (\d+)\.(\d+)$/) do |arg1, arg2|
  expect(@rec1.info.af).to eq "#{arg1}.#{arg2}".to_f
end

Then(/^I expect rec\.info\.dp to be (\d+)$/) do |arg1|
  expect(@rec1.info.dp).to eq arg1.to_i
end

Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
  expect(@rec1.info.readposranksum).to eq "#{arg1}.#{arg2}".to_f
end

Then(/^I expect rec\.sample\['BIOPSY(\d+)D'\]\.gt to be "(.*?)"$/) do |arg1, arg2|
  expect(@rec1.sample["BIOPSY#{arg1}D"].gt).to eq arg2
end

Then(/^I expect rec\.sample\['BIOPSY(\d+)D'\]\.ad to be \[(\d+),(\d+)\]$/) do |arg1, arg2, arg3|
  expect(@rec1.sample["BIOPSY#{arg1}D"].ad).to eq [arg2.to_i,arg3.to_i]
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.ad to be \[(\d+),(\d+)\]$/) do |arg1, arg2, arg3|
  expect(@rec1.sample["subclone#{arg1}"].ad).to eq [arg2.to_i,arg3.to_i]
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.dp to be (\d+)$/) do |arg1, arg2|
  expect(@rec1.sample["subclone#{arg1}"].dp).to eq arg2.to_i
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.gq to be (\d+)$/) do |arg1, arg2|
  expect(@rec1.sample["subclone#{arg1}"].gq).to eq arg2.to_i
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
  expect(@rec1.sample["subclone#{arg1}"].pl).to eq [arg2.to_i,arg3.to_i,arg4.to_i]
end

Then(/^I expect rec\.sample\.biopsy(\d+)d\.gt to be \[(\d+),(\d+)\]$/) do |arg1, arg2, arg3|
  # Samples are reached through the lower-case dynamic accessor here.
  # NOTE(review): assumes the bracketed pair in the step text lists the two
  # alleles of an unphased genotype (joined with "/") - confirm against the
  # feature file.
  expect(@rec1.sample.public_send("biopsy#{arg1}d").gt).to eq "#{arg2}/#{arg3}"
end

Then(/^I expect rec\.sample\.subclone(\d+)\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
  expect(@rec1.sample.public_send("subclone#{arg1}").pl).to eq [arg2.to_i,arg3.to_i,arg4.to_i]
end
|
72
|
+
|
73
|
+
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# Step definitions for the somatic sniper feature.
#
# Every expectation is built from the captures in the step text, and literal
# dots in the step patterns are escaped so they only match a real ".".

Given(/^the somatic sniper vcf line$/) do |string|
  # Normalise any run of whitespace to a single tab before parsing.
  @fields = VcfLine.parse(string.split(/\s+/).join("\t"))
end

When(/^I parse the record$/) do
  header = VcfHeader.new
  @rec = VcfRecord.new(@fields,header)
end

Then(/^I expect rec\.chrom to contain "(.*?)"$/) do |arg1|
  expect(@rec.chrom).to eq arg1
end

Then(/^I expect rec\.pos to contain (\d+)$/) do |arg1|
  expect(@rec.pos).to eq arg1.to_i
end

Then(/^I expect rec\.ref to contain "(.*?)"$/) do |arg1|
  expect(@rec.ref).to eq arg1
end

Then(/^I expect rec\.alt to contain \["(.*?)","(.*?)"\]$/) do |arg1, arg2|
  expect(@rec.alt).to eq [arg1,arg2]
end

Then(/^I expect rec\.alt to contain one \["(.*?)"\]$/) do |arg1|
  expect(@rec.alt).to eq [arg1]
end

Then(/^I expect rec\.tumor\.dp to be (\d+)$/) do |arg1|
  expect(@rec.tumor.dp).to eq arg1.to_i
end

Then(/^I expect rec\.tumor\.dp(\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
  # arg1 is the digit suffix of the accessor named in the step (e.g. dp4).
  expect(@rec.tumor.public_send("dp#{arg1}")).to eq [arg2.to_i,arg3.to_i,arg4.to_i,arg5.to_i]
end

Then(/^I expect rec\.tumor\.bcount\.to_ary to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
  expect(@rec.tumor.bcount.to_ary).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
end

Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
  expect(@rec.tumor.bcount[@rec.alt]).to eq [arg1.to_i,arg2.to_i]
end

Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be one \[(\d+)\]$/) do |arg1|
  expect(@rec.tumor.bcount[@rec.alt]).to eq [arg1.to_i]
end

Then(/^I expect rec\.tumor\.bcount\["(.*?)"\] to be (\d+)$/) do |arg1, arg2|
  expect(@rec.tumor.bcount[arg1]).to eq arg2.to_i
end

Then(/^I expect rec\.tumor\.bcount\[(\d+)\] to be (\d+)$/) do |arg1, arg2|
  expect(@rec.tumor.bcount[arg1.to_i]).to eq arg2.to_i
end

Then(/^I expect rec\.tumor\.bcount\.sum to be (\d+)$/) do |arg1|
  expect(@rec.tumor.bcount.sum).to eq arg1.to_i
end

Then(/^I expect rec\.tumor\.bcount\.max to be (\d+)$/) do |arg1|
  expect(@rec.tumor.bcount.max).to eq arg1.to_i
end

Then(/^I expect rec\.tumor\.bq\.to_ary to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
  expect(@rec.tumor.bq.to_ary).to eq [arg1.to_i,arg2.to_i]
end

Then(/^I expect rec\.tumor\.bq\["(.*?)"\] to be (\d+)$/) do |arg1, arg2|
  expect(@rec.tumor.bq[arg1]).to eq arg2.to_i
end

Then(/^I expect rec\.tumor\.bq\[(\d+)\] to be (\d+)$/) do |arg1, arg2|
  expect(@rec.tumor.bq[arg1.to_i]).to eq arg2.to_i
end

Then(/^I expect rec\.tumor\.bq\.min to be (\d+)$/) do |arg1|
  expect(@rec.tumor.bq.min).to eq arg1.to_i
end

Then(/^I expect rec\.tumor\.bq\.max to be (\d+)$/) do |arg1|
  expect(@rec.tumor.bq.max).to eq arg1.to_i
end

Then(/^I expect rec\.tumor\.amq\.to_ary to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
  expect(@rec.tumor.amq.to_ary).to eq [arg1.to_i,arg2.to_i]
end

Then(/^I expect rec\.tumor\.mq to be (\d+)$/) do |arg1|
  expect(@rec.tumor.mq).to eq arg1.to_i
end

Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
  expect(@rec.tumor.ss).to eq arg1.to_i
end

Then(/^I expect rec\.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
  expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
end

Then(/^I expect rec\.call_nuc to be "(.*?)"$/) do |arg1|
  expect(@rec.call_nuc).to eq arg1
end

Then(/^I expect rec\.call_tumor_count to be (\d+)$/) do |arg1|
  expect(@rec.call_tumor_count).to eq arg1.to_i
end

Then(/^I expect rec\.call_normal_count to be (\d+)$/) do |arg1|
  expect(@rec.call_normal_count).to eq arg1.to_i
end

Then(/^I expect rec\.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
  expect(@rec.call_tumor_relative_count).to eq("#{arg1}.#{arg2}".to_f)
end
|
121
|
+
|
122
|
+
|
data/features/support/env.rb
CHANGED
@@ -0,0 +1,38 @@
|
|
1
|
+
module BioVcf

  # Helpers for comparing two equally sized lists of counts, one for the
  # normal sample and one for the tumor sample. Positions in both lists
  # correspond to each other (callers in this project map them to A,C,G,T).
  module Variant

    # Element-wise difference tumor[i] - normal[i].
    def Variant.diff normal,tumor
      tumor.each_with_index.map {|t,i| t-normal[i]}
    end

    # Same as diff, after apply_threshold has been applied with threshold t.
    def Variant.threshold_diff t,normal,tumor
      normal2,tumor2 = apply_threshold(t,normal,tumor)
      diff(normal2,tumor2)
    end

    # Difference per position divided by the combined count (tumor+normal)
    # at that position, rounded to two decimals. Positions with a combined
    # count of zero yield the Integer 0.
    def Variant.relative_diff normal,tumor
      d = diff(normal,tumor)
      total = tumor.each_with_index.map {|t,i| t+normal[i]}
      total.each_with_index.map {|t,i| (t==0 ? 0 : ((d[i].to_f/t)*100.0).round/100.0)}
    end

    # Same as relative_diff, after apply_threshold has been applied.
    def Variant.relative_threshold_diff t,normal,tumor
      normal2,tumor2 = apply_threshold(t,normal,tumor)
      relative_diff(normal2,tumor2)
    end

    # Position of the largest relative diff (first one on ties). Returns nil
    # only for empty input.
    #
    # The maximum is taken over the actual values. Seeding the search with 0
    # would produce a "maximum" that is not in the list whenever every
    # relative diff is negative, and the lookup would then return nil.
    def Variant.index normal,tumor
      rd = relative_diff(normal,tumor)
      rd.index(rd.max)
    end

    # Zero every normal count above threshold t, then zero the tumor count
    # at every position where the (thresholded) normal count is zero.
    # Returns the pair [normal2, tumor2]; the inputs are not modified.
    def Variant.apply_threshold t,normal,tumor
      normal2 = normal.map{|v| (v>t ? 0 : v) }
      tumor2 = tumor.each_with_index.map{|v,i| (normal2[i]==0 ? 0 : v) }
      return normal2,tumor2
    end
  end

end
|
@@ -1,37 +1,122 @@
|
|
1
1
|
module BioVcf
|
2
2
|
|
3
|
+
MAXINT=100_000
|
4
|
+
|
5
|
+
# Helper class for a list of (variant) values, such as A,G.
|
6
|
+
# The [] function does the hard work (see ./features for examples)
|
3
7
|
# Wraps a list of four per-nucleotide values (positions are A, C, G, T) and
# allows lookup by position, by nucleotide letter, or by a list of letters.
class VcfNucleotides
  # Position of each nucleotide in the wrapped list.
  NUCLEOTIDES = ["A","C","G","T"].freeze

  # alt::  default nucleotide list used by max/min/sum
  # list:: values (Strings or Integers); stored as Integers
  def initialize alt,list
    @alt = alt
    @list = list.map{|i| i.to_i}
  end

  # Integer -> value at that position (0 when out of range)
  # String  -> value for that nucleotide letter
  # other   -> treated as a list of nucleotide letters; returns an Array
  def [] idx
    case idx
    when Integer
      @list[idx].to_i
    when String
      @list[NUCLEOTIDES.index(idx)].to_i
    else
      idx.map { |nuc| @list[NUCLEOTIDES.index(nuc)].to_i }
    end
  end

  def to_ary
    @list
  end

  # Largest value among the given nucleotides (default: the alt list).
  # Never lower than 0, which is also the result for an empty list.
  def max list = @alt
    ([0] + values_for(list)).max
  end

  # Smallest value among the given nucleotides (default: the alt list).
  # MAXINT is only returned for an empty list; real values above MAXINT
  # are reported as they are instead of being capped.
  def min list = @alt
    values_for(list).min || MAXINT
  end

  # Sum of the values of the given nucleotides (default: the alt list).
  def sum list = @alt
    values_for(list).inject(0) { |memo,v| memo+v }
  end

  private

  # Look up list and always return an Array, so that a single nucleotide
  # letter or position is accepted as well as a list of letters.
  def values_for list
    Array(self[list])
  end
end
|
49
|
+
|
50
|
+
# Wraps a list of values that run parallel to the alt list: the value at
# position i belongs to alt[i]. Lookup works by position, by alt entry, or by
# a list of alt entries.
class VcfAltInfo
  # alt::  the alt entries the values belong to
  # list:: values (Strings or Integers); stored as Integers
  def initialize alt,list
    @alt = alt
    @list = list.map{|i| i.to_i}
  end

  # Integer -> value at that position (0 when out of range)
  # String  -> value belonging to that alt entry
  # other   -> treated as a list of alt entries; returns an Array
  def [] idx
    case idx
    when Integer
      @list[idx].to_i
    when String
      @list[@alt.index(idx)].to_i
    else
      idx.map { |nuc| @list[@alt.index(nuc)].to_i }
    end
  end

  def to_ary
    @list
  end

  # Largest value in the list. Never lower than 0, which is also the result
  # for an empty list.
  def max
    ([0] + @list).max
  end

  # Smallest value in the list. MAXINT is only returned for an empty list;
  # real values above MAXINT are reported as they are instead of being
  # capped.
  def min
    @list.min || MAXINT
  end

  # Sum of all values in the list.
  def sum
    @list.inject(0) { |memo,v| memo+v }
  end
end
|
14
87
|
|
15
88
|
class VcfGenotypeField
|
16
|
-
# s::      one sample column, values separated by ':'
# format:: Hash used to look up the position of a key in the split values
# header:: kept as given
# alt::    alt list, handed on to the per-nucleotide/alt helper objects
def initialize(s, format, header, alt)
  @values = s.split(':')
  @format, @header, @alt = format, header, alt
end
|
95
|
+
|
96
|
+
# DP4 value of this sample as a list of Integers.
def dp4
  raw = @values[@format['DP4']]
  raw.split(',').map(&:to_i)
end
|
99
|
+
|
100
|
+
# AD value of this sample as a list of Integers.
def ad
  raw = @values[@format['AD']]
  raw.split(',').map(&:to_i)
end
|
103
|
+
|
104
|
+
# PL value of this sample as a list of Integers.
def pl
  raw = @values[@format['PL']]
  raw.split(',').map(&:to_i)
end
|
21
107
|
|
22
108
|
# BCOUNT value of this sample wrapped in a VcfNucleotides helper.
def bcount
  counts = @values[@format['BCOUNT']].split(',')
  VcfNucleotides.new(@alt, counts)
end
|
25
111
|
|
26
112
|
# BQ value of this sample wrapped in a VcfAltInfo helper.
def bq
  qualities = @values[@format['BQ']].split(',')
  VcfAltInfo.new(@alt, qualities)
end
|
29
115
|
|
30
116
|
# AMQ value of this sample wrapped in a VcfAltInfo helper.
def amq
  qualities = @values[@format['AMQ']].split(',')
  VcfAltInfo.new(@alt, qualities)
end
|
33
119
|
|
34
|
-
|
35
120
|
def method_missing(m, *args, &block)
|
36
121
|
v = @values[@format[m.to_s.upcase]]
|
37
122
|
v = v.to_i if v =~ /^\d+$/
|
@@ -40,4 +125,27 @@ module BioVcf
|
|
40
125
|
end
|
41
126
|
|
42
127
|
end
|
128
|
+
|
129
|
+
# Holds all samples
|
130
|
+
# Gives access to the sample columns of one record by sample name, either
# with [] (exact or lower-case name) or as a method call (rec.sample.name).
# A VcfGenotypeField is only built when a sample is first asked for.
class VcfGenotypeFields
  # fields:: all columns of the record
  # format:: FORMAT lookup Hash, passed on to each VcfGenotypeField
  # header:: must respond to samples with the list of sample names
  # alt::    alt list, passed on to each VcfGenotypeField
  def initialize fields, format, header, alt
    @fields = fields
    @format = format
    @header = header
    @alt = alt
    @samples = {} # lazy cache
    @index = {}
    # Sample i lives in column i+9; register the name as given and in
    # lower case so both spellings resolve to the same column.
    @header.samples.each_with_index { |k,i| @index[k] = i+9 ; @index[k.downcase] = i+9 }
  end

  # Return the (cached) VcfGenotypeField for the named sample.
  def [] name
    @samples[name] ||= VcfGenotypeField.new(@fields[@index[name]],@format,@header,@alt)
  end

  # Resolve rec.sample.<name> for known sample names only. Anything else is
  # passed to super, so a typo raises NoMethodError naming the method
  # instead of failing later on a nil column index.
  def method_missing(m, *args, &block)
    name = m.to_s
    return super unless @index.key?(name)
    self[name]
  end

  # Keep respond_to? in line with method_missing.
  def respond_to_missing?(m, include_private = false)
    @index.key?(m.to_s) || super
  end

end
|
43
151
|
end
|
data/lib/bio-vcf/vcfheader.rb
CHANGED
@@ -5,6 +5,7 @@ module BioVcf
|
|
5
5
|
def VcfHeaderParser.get_column_names(lines)
|
6
6
|
lines.each do | line |
|
7
7
|
if line =~ /^#[^#]/
|
8
|
+
# the first line that starts with a single hash
|
8
9
|
names = line.split
|
9
10
|
names[0].sub!(/^#/,'')
|
10
11
|
return names
|
@@ -37,6 +38,10 @@ module BioVcf
|
|
37
38
|
# Number of column names; computed once and then reused.
def columns
  return @column if @column
  @column = column_names.size
end
|
41
|
+
|
42
|
+
# Sample names: every column name from the tenth column onwards.
# Returns an empty Array (not nil) when there are fewer than ten columns, so
# callers can always iterate over the result.
def samples
  @samples ||= column_names[9..-1] || []
end
|
40
45
|
end
|
41
46
|
|
42
47
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module BioVcf

  # Basic RDF output support - may be moved to another gem

  module VcfRdf

    # Print the @prefix declarations to standard output.
    def self.header
      print <<EOB
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix hgnc: <http://identifiers.org/hgnc.symbol/> .
@prefix doi: <http://dx.doi.org/> .
@prefix : <http://biobeat.org/rdf/ns#> .
EOB
    end

    # Print the statements for one record to standard output.
    #
    # id::   prefix of the subject name
    # rec::  must respond to chrom (String) and pos
    # hash:: extra predicate => object pairs, printed as given
    def self.record id,rec,hash = {}
      chromosome = 'ch'+rec.chrom
      subject = [id,chromosome,rec.pos].join('_')
      print <<OUT
:#{subject} :chr \"#{rec.chrom}\" .
:#{subject} :pos #{rec.pos} .
:#{subject} :vcf true .
OUT
      hash.each { |predicate,object| print ":#{subject} :#{predicate} #{object} .\n" }
    end
  end
end
|
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -1,18 +1,67 @@
|
|
1
1
|
module BioVcf
|
2
2
|
|
3
|
+
# Access to the entries of an INFO string ("KEY=value;KEY2=value2;...")
# through method calls, e.g. info.dp. Keys are matched case-insensitively.
class VcfRecordInfo
  # Whole-string patterns for values that are converted to numbers. An
  # optional leading minus sign is accepted, as is an exponent on floats.
  INTEGER_VALUE = /\A-?\d+\z/
  FLOAT_VALUE   = /\A-?\d+\.\d+(?:[eE][-+]?\d+)?\z/

  # s:: the raw INFO string
  def initialize s
    h = {}
    s.split(/;/).each do |f|
      next if f.empty?          # tolerate empty entries such as ";;"
      k,v = f.split(/=/,2)      # limit 2: keep any '=' inside the value
      h[k.upcase] = v
    end
    @h = h
  end

  # Return the value stored under the upper-cased method name: an Integer or
  # Float when the value looks like a number, otherwise the raw String.
  # Unknown keys and entries without a value return nil.
  def method_missing(m, *args, &block)
    v = @h[m.to_s.upcase]
    case v
    when INTEGER_VALUE then v.to_i
    when FLOAT_VALUE   then v.to_f
    else v
    end
  end

  # Report the keys present in the INFO string through respond_to?.
  def respond_to_missing?(m, include_private = false)
    @h.key?(m.to_s.upcase) || super
  end

end
|
17
|
+
|
3
18
|
module VcfRecordParser
  # Turn a ':'-separated format string into a Hash that maps each key to
  # its position in the string.
  def self.get_format s
    positions = {}
    s.split(':').each_with_index do |key, position|
      positions[key] = position
    end
    positions
  end

  # Wrap the info string in a VcfRecordInfo.
  def self.get_info s
    VcfRecordInfo.new(s)
  end
end
|
29
|
+
|
30
|
+
# Calling helpers for a record that provides normal and tumor samples with
# a bcount. All calculations are delegated to Variant.
module VcfRecordCall
  # Variant.diff of the normal and tumor base counts.
  def call_diff
    normal_counts = normal.bcount.to_ary
    tumor_counts = tumor.bcount.to_ary
    Variant.diff(normal_counts, tumor_counts)
  end

  # Nucleotide letter at the position picked by index.
  def call_nuc
    nucleotides = ['A','C','G','T']
    nucleotides[index]
  end

  # Tumor base count at the position picked by index.
  def call_tumor_count
    tumor_counts = tumor.bcount.to_ary
    tumor_counts[index]
  end

  # Variant.relative_diff value at the position picked by index.
  def call_tumor_relative_count
    ratios = Variant.relative_diff(normal.bcount.to_ary, tumor.bcount.to_ary)
    ratios[index]
  end

  # Normal base count at the position picked by index.
  def call_normal_count
    normal_counts = normal.bcount.to_ary
    normal_counts[index]
  end

  # Position chosen by Variant.index for the normal and tumor base counts.
  def index
    Variant.index(normal.bcount.to_ary, tumor.bcount.to_ary)
  end
end
|
10
55
|
|
11
56
|
class VcfRecord
|
12
57
|
|
58
|
+
include VcfRecordCall
|
59
|
+
|
60
|
+
attr_reader :header
|
61
|
+
|
13
62
|
# fields:: the columns of one record
# header:: the header object the record belongs to
def initialize(fields, header)
  @fields, @header = fields, header
end
|
17
66
|
|
18
67
|
def chrom
|
@@ -36,20 +85,34 @@ module BioVcf
|
|
36
85
|
end
|
37
86
|
|
38
87
|
# Column 4 split on commas; computed once and then reused.
def alt
  return @alt if @alt
  @alt = @fields[4].split(',')
end
|
90
|
+
|
91
|
+
# Column 5 converted with to_f; computed once and then reused.
def qual
  return @qual if @qual
  @qual = @fields[5].to_f
end
|
94
|
+
|
95
|
+
# Column 7 parsed by VcfRecordParser.get_info; built once and then reused.
def info
  return @info if @info
  @info = VcfRecordParser.get_info(@fields[7])
end
|
41
98
|
|
42
99
|
# Column 8 parsed by VcfRecordParser.get_format; built once and then reused.
def format
  return @format if @format
  @format = VcfRecordParser.get_format(@fields[8])
end
|
45
102
|
|
103
|
+
# Return the normal sample (used in two sample VCF)
|
46
104
|
def normal
  # Column 9 holds this sample; the wrapper is built once and then reused.
  return @normal if @normal
  column = @fields[9]
  @normal = VcfGenotypeField.new(column, format, @header, alt)
end
|
49
107
|
|
108
|
+
# Return the tumor sample (used in two sample VCF)
|
50
109
|
def tumor
  # Column 10 holds this sample; the wrapper is built once and then reused.
  return @tumor if @tumor
  column = @fields[10]
  @tumor = VcfGenotypeField.new(column, format, @header, alt)
end
|
112
|
+
|
113
|
+
# Return the sample as a named hash
|
114
|
+
def sample
  # One VcfGenotypeFields wrapper over all columns, built once and reused.
  return @sample if @sample
  @sample = VcfGenotypeFields.new(@fields, format, @header, alt)
end
|
53
|
-
|
54
117
|
end
|
55
118
|
end
|
data/lib/bio-vcf.rb
CHANGED