bio-vcf 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +12 -0
- data/Gemfile +6 -6
- data/Gemfile.lock +49 -54
- data/README.md +83 -10
- data/Rakefile +5 -5
- data/VERSION +1 -1
- data/bin/bio-vcf +24 -8
- data/bio-vcf.gemspec +73 -0
- data/features/diff_count.feature +30 -0
- data/features/multisample.feature +37 -0
- data/features/somaticsniper.feature +84 -0
- data/features/step_definitions/diff_count.rb +41 -0
- data/features/step_definitions/multisample.rb +73 -0
- data/features/step_definitions/somaticsniper.rb +122 -0
- data/features/support/env.rb +4 -0
- data/lib/bio-vcf/variant.rb +38 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +118 -10
- data/lib/bio-vcf/vcfheader.rb +5 -0
- data/lib/bio-vcf/vcfrdf.rb +30 -0
- data/lib/bio-vcf/vcfrecord.rb +68 -5
- data/lib/bio-vcf.rb +1 -0
- data/test/data/input/multisample.vcf +150 -0
- metadata +28 -76
@@ -0,0 +1,41 @@
|
|
1
|
+
|
2
|
+
# Step definitions that exercise the Variant helpers on explicit normal and
# tumor count arrays taken from the feature text.

# Remember the two four-element count arrays for the following steps.
Given(/^normal and tumor counts \[(\d+),(\d+),(\d+),(\d+)\] and \[(\d+),(\d+),(\d+),(\d+)\]$/) do |n1, n2, n3, n4, t1, t2, t3, t4|
  @normal = [n1, n2, n3, n4].map(&:to_i)
  @tumor = [t1, t2, t3, t4].map(&:to_i)
end

# Intentionally empty: the difference is computed in the Then steps.
When(/^I look for the difference$/) do
end

Then(/^I expect the diff to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |d1, d2, d3, d4|
  expected = [d1, d2, d3, d4].map(&:to_i)
  expect(Variant.diff(@normal, @tumor)).to eq expected
end

# The second and fourth values are written with a decimal point in the
# feature text and are therefore captured as two integers each.
Then(/^the relative diff to be \[(\d+),(\d+)\.(\d+),(\d+),(\d+)\.(\d+)\]$/) do |v1, v2_int, v2_frac, v3, v4_int, v4_frac|
  expected = [v1.to_f, "#{v2_int}.#{v2_frac}".to_f, v3.to_i, "#{v4_int}.#{v4_frac}".to_f]
  expect(Variant.relative_diff(@normal, @tumor)).to eq expected
end

Then(/^I expect the defining tumor nucleotide to be "(.*?)"$/) do |nucleotide|
  called = %w[A C G T][Variant.index(@normal, @tumor)]
  expect(called).to eq nucleotide
end

Then(/^I expect the tumor count to be (\d+)$/) do |count|
  position = Variant.index(@normal, @tumor)
  expect(@tumor[position]).to eq count.to_i
end

# Intentionally empty: the threshold value is supplied by the next step.
When(/^I set an inclusion threshold for the reference$/) do
end

# Also stores the threshold in @t for the relative-diff step below.
Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |threshold, d1, d2, d3, d4|
  @t = threshold.to_i
  @t_diff = Variant.threshold_diff(@t, @normal, @tumor)
  expect(@t_diff).to eq [d1, d2, d3, d4].map(&:to_i)
end

# Relies on @t having been set by the threshold step above.
Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |v1, v2, v3, v4_int, v4_frac|
  expected = [v1.to_f, v2.to_i, v3.to_i, "#{v4_int}.#{v4_frac}".to_f]
  expect(Variant.relative_threshold_diff(@t, @normal, @tumor)).to eq expected
end
|
40
|
+
|
41
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Step definitions for parsing a multi-sample VCF header and record.
#
# Every Then step compares the parsed record against the values captured
# from the feature text, so the feature file is the single source of truth
# for the expected values (previously most steps ignored their captures and
# compared against hard-coded literals).

Given(/^the multi sample header line$/) do |string|
  @header = VcfHeader.new
  @header.add(string)
end

# The expected layout below is fixed because the step text carries no values.
When(/^I parse the header$/) do
  expect(@header.column_names.size).to eq 16
  expect(@header.samples.size).to eq 7
  expect(@header.samples).to eq ["BIOPSY17513D", "clone10", "clone3", "clone4", "subclone105", "subclone33", "subclone46"]
end

# The doc string is whitespace separated; VcfLine.parse is given tabs.
Given(/^multisample vcf line$/) do |string|
  @fields = VcfLine.parse(string.split(/\s+/).join("\t"))
  @rec1 = VcfRecord.new(@fields,@header)
end

Then(/^I expect multisample rec\.alt to contain \["(.*?)"\]$/) do |nucleotide|
  expect(@rec1.alt).to eq [nucleotide]
end

Then(/^I expect rec\.qual to be (\d+)\.(\d+)$/) do |int, frac|
  expect(@rec1.qual).to eq "#{int}.#{frac}".to_f
end

Then(/^I expect rec\.info\.ac to be (\d+)$/) do |count|
  expect(@rec1.info.ac).to eq count.to_i
end

Then(/^I expect rec\.info\.af to be (\d+)\.(\d+)$/) do |int, frac|
  expect(@rec1.info.af).to eq "#{int}.#{frac}".to_f
end

Then(/^I expect rec\.info\.dp to be (\d+)$/) do |depth|
  expect(@rec1.info.dp).to eq depth.to_i
end

Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |int, frac|
  expect(@rec1.info.readposranksum).to eq "#{int}.#{frac}".to_f
end

# The sample name is rebuilt from the digits captured in the step text.
Then(/^I expect rec\.sample\['BIOPSY(\d+)D'\]\.gt to be "(.*?)"$/) do |id, genotype|
  expect(@rec1.sample["BIOPSY#{id}D"].gt).to eq genotype
end

Then(/^I expect rec\.sample\['BIOPSY(\d+)D'\]\.ad to be \[(\d+),(\d+)\]$/) do |id, v1, v2|
  expect(@rec1.sample["BIOPSY#{id}D"].ad).to eq [v1.to_i, v2.to_i]
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.ad to be \[(\d+),(\d+)\]$/) do |id, v1, v2|
  expect(@rec1.sample["subclone#{id}"].ad).to eq [v1.to_i, v2.to_i]
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.dp to be (\d+)$/) do |id, depth|
  expect(@rec1.sample["subclone#{id}"].dp).to eq depth.to_i
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.gq to be (\d+)$/) do |id, quality|
  expect(@rec1.sample["subclone#{id}"].gq).to eq quality.to_i
end

Then(/^I expect rec\.sample\['subclone(\d+)'\]\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |id, v1, v2, v3|
  expect(@rec1.sample["subclone#{id}"].pl).to eq [v1.to_i, v2.to_i, v3.to_i]
end

# Accessor-style lookup (rec.sample.<name>). The feature writes the genotype
# as [a,b] while the record exposes it as the string "a/b".
Then(/^I expect rec\.sample\.biopsy(\d+)d\.gt to be \[(\d+),(\d+)\]$/) do |id, first, second|
  expect(@rec1.sample.public_send("biopsy#{id}d").gt).to eq "#{first}/#{second}"
end

Then(/^I expect rec\.sample\.subclone(\d+)\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |id, v1, v2, v3|
  expect(@rec1.sample.public_send("subclone#{id}").pl).to eq [v1.to_i, v2.to_i, v3.to_i]
end
|
72
|
+
|
73
|
+
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# Step definitions for parsing a two-sample (normal/tumor) somatic sniper
# VCF record. Expected values always come from the feature text.

# The doc string is whitespace separated; VcfLine.parse is given tabs.
Given(/^the somatic sniper vcf line$/) do |string|
  @fields = VcfLine.parse(string.split(/\s+/).join("\t"))
end

When(/^I parse the record$/) do
  header = VcfHeader.new
  @rec = VcfRecord.new(@fields,header)
end

# Compare against the captured chromosome (was hard-coded to "1").
Then(/^I expect rec\.chrom to contain "(.*?)"$/) do |chrom|
  expect(@rec.chrom).to eq chrom
end

Then(/^I expect rec\.pos to contain (\d+)$/) do |pos|
  expect(@rec.pos).to eq pos.to_i
end

Then(/^I expect rec\.ref to contain "(.*?)"$/) do |ref|
  expect(@rec.ref).to eq ref
end

Then(/^I expect rec\.alt to contain \["(.*?)","(.*?)"\]$/) do |first, second|
  expect(@rec.alt).to eq [first, second]
end

Then(/^I expect rec\.alt to contain one \["(.*?)"\]$/) do |only|
  expect(@rec.alt).to eq [only]
end

Then(/^I expect rec\.tumor\.dp to be (\d+)$/) do |depth|
  expect(@rec.tumor.dp).to eq depth.to_i
end

# The digits after "dp" select the accessor (e.g. dp4) instead of being
# captured and then ignored.
Then(/^I expect rec\.tumor\.dp(\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |suffix, v1, v2, v3, v4|
  expect(@rec.tumor.public_send("dp#{suffix}")).to eq [v1, v2, v3, v4].map(&:to_i)
end

Then(/^I expect rec\.tumor\.bcount\.to_ary to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |v1, v2, v3, v4|
  expect(@rec.tumor.bcount.to_ary).to eq [v1, v2, v3, v4].map(&:to_i)
end

Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be \[(\d+),(\d+)\]$/) do |v1, v2|
  expect(@rec.tumor.bcount[@rec.alt]).to eq [v1.to_i, v2.to_i]
end

Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be one \[(\d+)\]$/) do |only|
  expect(@rec.tumor.bcount[@rec.alt]).to eq [only.to_i]
end

Then(/^I expect rec\.tumor\.bcount\["(.*?)"\] to be (\d+)$/) do |nucleotide, count|
  expect(@rec.tumor.bcount[nucleotide]).to eq count.to_i
end

Then(/^I expect rec\.tumor\.bcount\[(\d+)\] to be (\d+)$/) do |position, count|
  expect(@rec.tumor.bcount[position.to_i]).to eq count.to_i
end

Then(/^I expect rec\.tumor\.bcount\.sum to be (\d+)$/) do |total|
  expect(@rec.tumor.bcount.sum).to eq total.to_i
end

Then(/^I expect rec\.tumor\.bcount\.max to be (\d+)$/) do |highest|
  expect(@rec.tumor.bcount.max).to eq highest.to_i
end

Then(/^I expect rec\.tumor\.bq\.to_ary to be \[(\d+),(\d+)\]$/) do |v1, v2|
  expect(@rec.tumor.bq.to_ary).to eq [v1.to_i, v2.to_i]
end

Then(/^I expect rec\.tumor\.bq\["(.*?)"\] to be (\d+)$/) do |nucleotide, value|
  expect(@rec.tumor.bq[nucleotide]).to eq value.to_i
end

Then(/^I expect rec\.tumor\.bq\[(\d+)\] to be (\d+)$/) do |position, value|
  expect(@rec.tumor.bq[position.to_i]).to eq value.to_i
end

Then(/^I expect rec\.tumor\.bq\.min to be (\d+)$/) do |lowest|
  expect(@rec.tumor.bq.min).to eq lowest.to_i
end

Then(/^I expect rec\.tumor\.bq\.max to be (\d+)$/) do |highest|
  expect(@rec.tumor.bq.max).to eq highest.to_i
end

Then(/^I expect rec\.tumor\.amq\.to_ary to be \[(\d+),(\d+)\]$/) do |v1, v2|
  expect(@rec.tumor.amq.to_ary).to eq [v1.to_i, v2.to_i]
end

Then(/^I expect rec\.tumor\.mq to be (\d+)$/) do |value|
  expect(@rec.tumor.mq).to eq value.to_i
end

Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |value|
  expect(@rec.tumor.ss).to eq value.to_i
end

# The first and third values are written with a leading minus sign.
Then(/^I expect rec\.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |v1, v2, v3, v4|
  expect(@rec.call_diff).to eq [v1, v2, v3, v4].map(&:to_i)
end

Then(/^I expect rec\.call_nuc to be "(.*?)"$/) do |nucleotide|
  expect(@rec.call_nuc).to eq nucleotide
end

Then(/^I expect rec\.call_tumor_count to be (\d+)$/) do |count|
  expect(@rec.call_tumor_count).to eq count.to_i
end

Then(/^I expect rec\.call_normal_count to be (\d+)$/) do |count|
  expect(@rec.call_normal_count).to eq count.to_i
end

Then(/^I expect rec\.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |int, frac|
  expect(@rec.call_tumor_relative_count).to eq "#{int}.#{frac}".to_f
end
|
121
|
+
|
122
|
+
|
data/features/support/env.rb
CHANGED
@@ -0,0 +1,38 @@
|
|
1
|
+
module BioVcf
|
2
|
+
|
3
|
+
# Arithmetic on the per-position counts of a normal and a tumor sample.
# Both arguments are arrays of numbers of equal length; the callers in this
# package pass four counts per sample.
module Variant

  # Element-wise difference: tumor[i] - normal[i].
  def Variant.diff normal,tumor
    tumor.each_with_index.map { |count,i| count - normal[i] }
  end

  # Same as diff, computed after apply_threshold has filtered both arrays.
  def Variant.threshold_diff t,normal,tumor
    normal2,tumor2 = apply_threshold(t,normal,tumor)
    diff(normal2,tumor2)
  end

  # Difference relative to the combined count at each position,
  # (tumor-normal)/(tumor+normal), rounded to two decimals. A position whose
  # combined count is zero yields the Integer 0.
  def Variant.relative_diff normal,tumor
    d = diff(normal,tumor)
    total = tumor.each_with_index.map { |count,i| count + normal[i] }
    total.each_with_index.map do |sum,i|
      sum == 0 ? 0 : ((d[i].to_f/sum)*100.0).round/100.0
    end
  end

  # Same as relative_diff, computed after apply_threshold.
  def Variant.relative_threshold_diff t,normal,tumor
    normal2,tumor2 = apply_threshold(t,normal,tumor)
    relative_diff(normal2,tumor2)
  end

  # Index of the position with the largest relative diff (first one wins on
  # a tie). Taking the real maximum rather than folding from 0 means an
  # index is still returned when every relative diff is negative; before,
  # that case returned nil and callers that index an array with the result
  # failed. Returns nil only for empty input.
  def Variant.index normal,tumor
    rd = relative_diff(normal,tumor)
    rd.index(rd.max)
  end

  # Returns [normal2, tumor2]: normal counts above t are set to 0, and a
  # tumor count is set to 0 wherever the resulting normal count is 0 (which
  # includes positions where the normal count was 0 to begin with).
  def Variant.apply_threshold t,normal,tumor
    normal2 = normal.map { |v| v > t ? 0 : v }
    tumor2 = tumor.each_with_index.map { |v,i| normal2[i] == 0 ? 0 : v }
    return normal2,tumor2
  end
end
|
37
|
+
|
38
|
+
end
|
@@ -1,37 +1,122 @@
|
|
1
1
|
module BioVcf
|
2
2
|
|
3
|
+
MAXINT=100_000
|
4
|
+
|
5
|
+
# Helper class for a list of per-nucleotide values, stored in the order
# A,C,G,T. The [] function does the hard work (see ./features for examples).
class VcfNucleotides
  # Position of each nucleotide letter inside the stored list.
  NUCLEOTIDES = ["A","C","G","T"].freeze

  # alt::  default list of nucleotide letters used by max, min and sum
  # list:: the values; every element is converted with to_i
  def initialize alt,list
    @alt = alt
    @list = list.map{|i| i.to_i}
  end

  # Look up values by
  # * Integer - position in the list
  # * String  - a single nucleotide letter
  # * anything else - treated as a list of letters, returns a list of values
  #
  # (The last branch used to read `else idx.kind_of?(Array)`, where the
  # test was evaluated and thrown away; the behaviour is unchanged.)
  def [] idx
    case idx
    when Integer
      @list[idx].to_i
    when String
      @list[NUCLEOTIDES.index(idx)].to_i
    else
      idx.map { |nuc| @list[NUCLEOTIDES.index(nuc)].to_i }
    end
  end

  # The underlying list of integers.
  def to_ary
    @list
  end

  # Return the max value on the nucleotides in the list (typically rec.alt).
  # Never lower than 0.
  def max list = @alt
    values = self[list]
    values.reduce(0){ |memo,v| (v>memo ? v : memo) }
  end

  # Return the min value on the nucleotides in the list. The result is no
  # longer capped at MAXINT, so values above it are reported correctly;
  # MAXINT is still returned for an empty list.
  def min list = @alt
    values = self[list]
    values.empty? ? MAXINT : values.min
  end

  # Return the sum of the values on the nucleotides in the list.
  def sum list = @alt
    values = self[list]
    values.reduce(0){ |memo,v| v+memo }
  end

end
|
49
|
+
|
50
|
+
# Helper class for a list of values that runs parallel to a list of
# alternative alleles: the value for allele alt[i] is list[i].
class VcfAltInfo
  # alt::  the allele names, in the same order as list
  # list:: the values; every element is converted with to_i
  def initialize alt,list
    @alt = alt
    @list = list.map{|i| i.to_i}
  end

  # Look up values by
  # * Integer - position in the list
  # * String  - an allele name found in alt
  # * anything else - treated as a list of names, returns a list of values
  #
  # (The last branch used to read `else idx.kind_of?(Array)`, where the
  # test was evaluated and thrown away; the behaviour is unchanged.)
  def [] idx
    case idx
    when Integer
      @list[idx].to_i
    when String
      @list[@alt.index(idx)].to_i
    else
      idx.map { |nuc| @list[@alt.index(nuc)].to_i }
    end
  end

  # The underlying list of integers.
  def to_ary
    @list
  end

  # Return the max of all values. Never lower than 0.
  def max
    @list.reduce(0){ |memo,v| (v>memo ? v : memo) }
  end

  # Return the min of all values. The result is no longer capped at MAXINT;
  # MAXINT is still returned for an empty list.
  def min
    @list.empty? ? MAXINT : @list.min
  end

  # Return the sum of all values.
  def sum
    @list.reduce(0){ |memo,v| v+memo }
  end
end
|
14
87
|
|
15
88
|
class VcfGenotypeField
|
16
|
-
# s::      the colon separated values of one sample column
# format:: Hash mapping a field name to its position within the values
# header:: the header object, stored as is
# alt::    the alternative alleles, handed to the value helpers
def initialize s, format, header, alt
  @format = format
  @header = header
  @alt = alt
  @values = s.split(/:/)
end
|
95
|
+
|
96
|
+
# The comma separated DP4 value of this sample as a list of integers.
def dp4
  raw = @values[@format['DP4']]
  raw.split(',').map(&:to_i)
end
|
99
|
+
|
100
|
+
# The comma separated AD value of this sample as a list of integers.
def ad
  raw = @values[@format['AD']]
  raw.split(',').map(&:to_i)
end
|
103
|
+
|
104
|
+
# The comma separated PL value of this sample as a list of integers.
def pl
  raw = @values[@format['PL']]
  raw.split(',').map(&:to_i)
end
|
21
107
|
|
22
108
|
# The BCOUNT value of this sample wrapped in a VcfNucleotides helper.
def bcount
  counts = @values[@format['BCOUNT']].split(',')
  VcfNucleotides.new(@alt, counts)
end
|
25
111
|
|
26
112
|
# The BQ value of this sample wrapped in a VcfAltInfo helper.
def bq
  qualities = @values[@format['BQ']].split(',')
  VcfAltInfo.new(@alt, qualities)
end
|
29
115
|
|
30
116
|
# The AMQ value of this sample wrapped in a VcfAltInfo helper.
def amq
  qualities = @values[@format['AMQ']].split(',')
  VcfAltInfo.new(@alt, qualities)
end
|
33
119
|
|
34
|
-
|
35
120
|
def method_missing(m, *args, &block)
|
36
121
|
v = @values[@format[m.to_s.upcase]]
|
37
122
|
v = v.to_i if v =~ /^\d+$/
|
@@ -40,4 +125,27 @@ module BioVcf
|
|
40
125
|
end
|
41
126
|
|
42
127
|
end
|
128
|
+
|
129
|
+
# Holds all samples of one record. A sample is fetched by name, either with
# [] or as a method call (fields['Tumor'], fields.tumor); the
# VcfGenotypeField objects are created on first use and cached.
class VcfGenotypeFields
  # fields:: all columns of the record; sample columns start at position 9
  # format:: Hash mapping a field name to its position within a sample
  # header:: header object; header.samples lists the sample names in order
  # alt::    the alternative alleles, handed to every VcfGenotypeField
  def initialize fields, format, header, alt
    @fields = fields
    @format = format
    @header = header
    @alt = alt
    @samples = {} # lazy cache
    @index = {}
    # Every sample is reachable by its exact name and by its lower case name
    @header.samples.each_with_index { |k,i| @index[k] = i+9 ; @index[k.downcase] = i+9 }
  end

  # Return the genotype field of the named sample.
  def [] name
    @samples[name] ||= VcfGenotypeField.new(@fields[@index[name]],@format,@header,@alt)
  end

  # Sample names double as methods. Messages that do not name a sample are
  # passed on to super and raise NoMethodError, rather than failing inside
  # the field lookup with a TypeError.
  def method_missing(m, *args, &block)
    name = m.to_s
    return super unless @index.key?(name)
    self[name]
  end

  # Keep respond_to? in line with method_missing.
  def respond_to_missing?(m, include_private = false)
    @index.key?(m.to_s) || super
  end

end
|
43
151
|
end
|
data/lib/bio-vcf/vcfheader.rb
CHANGED
@@ -5,6 +5,7 @@ module BioVcf
|
|
5
5
|
def VcfHeaderParser.get_column_names(lines)
|
6
6
|
lines.each do | line |
|
7
7
|
if line =~ /^#[^#]/
|
8
|
+
# the first line that starts with a single hash
|
8
9
|
names = line.split
|
9
10
|
names[0].sub!(/^#/,'')
|
10
11
|
return names
|
@@ -37,6 +38,10 @@ module BioVcf
|
|
37
38
|
# Number of column names, computed on first use and then remembered.
def columns
  return @column if @column
  @column = column_names.size
end
|
41
|
+
|
42
|
+
# Names of the sample columns, i.e. every column name from position 9 on.
# Returns an empty list (not nil) when the header has fewer than nine
# columns, so callers can always iterate over the result.
def samples
  @samples ||= (column_names[9..-1] || [])
end
|
40
45
|
end
|
41
46
|
|
42
47
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module BioVcf
|
2
|
+
|
3
|
+
# Preliminary RDF support - this may be moved to another gem. Both methods
# print to standard output.

module VcfRdf

  # Print the @prefix declarations used by the record output.
  def self.header
    print <<EOB
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix hgnc: <http://identifiers.org/hgnc.symbol/> .
@prefix doi: <http://dx.doi.org/> .
@prefix : <http://biobeat.org/rdf/ns#> .
EOB
  end

  # Print the statements for one record. The subject is built from id, the
  # chromosome and the position; every key/value pair in hash adds one more
  # statement, with the value printed as is.
  def self.record id,rec,hash = {}
    id2 = [id,'ch'+rec.chrom,rec.pos].join('_')
    print <<OUT
:#{id2} :chr \"#{rec.chrom}\" .
:#{id2} :pos #{rec.pos} .
:#{id2} :vcf true .
OUT
    hash.each_pair { |k,v| print ":#{id2} :#{k} #{v} .\n" }
  end
end
|
30
|
+
end
|
data/lib/bio-vcf/vcfrecord.rb
CHANGED
@@ -1,18 +1,67 @@
|
|
1
1
|
module BioVcf
|
2
2
|
|
3
|
+
# Access to the key=value pairs of a semicolon separated INFO string. Every
# key is available as a case-insensitive method, e.g. info.dp or info.AF.
class VcfRecordInfo
  # s:: the raw INFO string, e.g. "AC=5;AF=0.357;DB"
  def initialize s
    @h = {}
    s.split(/;/).each do |f|
      next if f.empty?        # tolerate stray separators such as "A=1;;B=2"
      k,v = f.split(/=/,2)    # split on the first '=' only, keep the rest
      @h[k.upcase] = v
    end
  end

  # Return the value stored under the key named by the method. Plain
  # integers and decimals, with an optional leading minus sign, are
  # converted to Integer and Float; anything else is returned as the
  # original String. Unknown keys and keys without a value return nil.
  #
  # A single case expression is used so the patterns are only ever matched
  # against the original value (never against an already converted Integer).
  def method_missing(m, *args, &block)
    v = @h[m.to_s.upcase]
    case v
    when /\A-?\d+\z/ then v.to_i
    when /\A-?\d+\.\d+\z/ then v.to_f
    else v
    end
  end

  # respond_to? is true for the keys present in the INFO string.
  def respond_to_missing?(m, include_private = false)
    @h.key?(m.to_s.upcase) || super
  end

end
|
17
|
+
|
3
18
|
module VcfRecordParser
  # Turn the colon separated format field into a Hash that maps every name
  # to its position. When a name occurs twice the last position wins.
  def VcfRecordParser.get_format s
    names = s.split(/:/)
    Hash[names.each_with_index.to_a]
  end

  # Wrap the info field in a VcfRecordInfo object.
  def VcfRecordParser.get_info s
    VcfRecordInfo.new(s)
  end
end
|
29
|
+
|
30
|
+
# Mixin that compares the bcount values of the normal and the tumor sample.
# The including class has to provide normal and tumor.
module VcfRecordCall
  # Variant.diff of the normal and tumor counts.
  def call_diff
    normal_counts = normal.bcount.to_ary
    tumor_counts = tumor.bcount.to_ary
    Variant.diff(normal_counts, tumor_counts)
  end

  # Letter (A, C, G or T) at the position returned by index.
  def call_nuc
    %w[A C G T][index]
  end

  # Tumor count at the position returned by index.
  def call_tumor_count
    tumor_counts = tumor.bcount.to_ary
    tumor_counts[index]
  end

  # Variant.relative_diff value at the position returned by index.
  def call_tumor_relative_count
    normal_counts = normal.bcount.to_ary
    tumor_counts = tumor.bcount.to_ary
    Variant.relative_diff(normal_counts, tumor_counts)[index]
  end

  # Normal count at the position returned by index.
  def call_normal_count
    normal_counts = normal.bcount.to_ary
    normal_counts[index]
  end

  # Position selected by Variant.index for the normal and tumor counts.
  def index
    normal_counts = self.normal.bcount.to_ary
    tumor_counts = self.tumor.bcount.to_ary
    Variant.index(normal_counts, tumor_counts)
  end
end
|
10
55
|
|
11
56
|
class VcfRecord
|
12
57
|
|
58
|
+
include VcfRecordCall
|
59
|
+
|
60
|
+
attr_reader :header
|
61
|
+
|
13
62
|
# fields:: the columns of one record
# header:: the header object the record belongs to
def initialize fields, header
  @fields, @header = fields, header
end
|
17
66
|
|
18
67
|
def chrom
|
@@ -36,20 +85,34 @@ module BioVcf
|
|
36
85
|
end
|
37
86
|
|
38
87
|
# The comma separated value of column 4 as a list, computed once.
def alt
  return @alt if @alt
  @alt = @fields[4].split(/,/)
end
|
90
|
+
|
91
|
+
# The value of column 5 as a Float, computed once.
def qual
  return @qual if @qual
  @qual = @fields[5].to_f
end
|
94
|
+
|
95
|
+
# The parsed value of column 7, computed once.
def info
  return @info if @info
  @info = VcfRecordParser.get_info(@fields[7])
end
|
41
98
|
|
42
99
|
# The parsed value of column 8, computed once.
def format
  return @format if @format
  @format = VcfRecordParser.get_format(@fields[8])
end
|
45
102
|
|
103
|
+
# Return the normal sample, i.e. column 9 (used in two sample VCF)
def normal
  return @normal if @normal
  @normal = VcfGenotypeField.new(@fields[9],format,@header,alt)
end
|
49
107
|
|
108
|
+
# Return the tumor sample, i.e. column 10 (used in two sample VCF)
def tumor
  return @tumor if @tumor
  @tumor = VcfGenotypeField.new(@fields[10],format,@header,alt)
end
|
112
|
+
|
113
|
+
# Return all samples of this record, addressable by sample name
def sample
  return @sample if @sample
  @sample = VcfGenotypeFields.new(@fields,format,@header,alt)
end
|
53
|
-
|
54
117
|
end
|
55
118
|
end
|
data/lib/bio-vcf.rb
CHANGED