bio-sam-mutation 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +21 -0
- data/LICENSE.txt +20 -0
- data/README.md +88 -0
- data/README.rdoc +48 -0
- data/Rakefile +54 -0
- data/bin/mutations +108 -0
- data/bin/sam-mutation +20 -0
- data/lib/bio-sam-mutation.rb +26 -0
- data/lib/bio-sam-mutation/bio/alignment/cigar.rb +239 -0
- data/lib/bio-sam-mutation/bio/alignment/iterate_pairs.rb +68 -0
- data/lib/bio-sam-mutation/bio/db/alignment.rb +176 -0
- data/lib/bio-sam-mutation/bio/db/tag.rb +5 -0
- data/lib/bio-sam-mutation/bio/db/tag/md.rb +126 -0
- data/lib/bio-sam-mutation/bio/mutantallele.rb +24 -0
- data/lib/bio-sam-mutation/bio/mutation.rb +63 -0
- data/lib/bio-sam-mutation/bio/mutation_array.rb +15 -0
- data/lib/bio-sam-mutation/bio/vephgvs.rb +21 -0
- data/lib/bio-sam-mutation/mutationscli.rb +83 -0
- data/test/helper.rb +34 -0
- data/test/test_cigar.rb +145 -0
- data/test/test_mdtag.rb +46 -0
- data/test/test_mutant_allele.rb +21 -0
- data/test/test_mutation.rb +84 -0
- data/test/test_mutation_array.rb +13 -0
- data/test/test_sam.rb +160 -0
- data/test/test_vep_hgvs.rb +9 -0
- metadata +247 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
# Mixin to iterate through ordered paired [operation, value] data and take
# subsets - e.g. broken down CIGAR and MD:Z tags.
module Bio::Alignment::IteratePairs
  private

  # Walks +pairs+ in order, keeping a running total of the lengths of the
  # operations that match +regexp+, and returns the pairs that fall inside the
  # window described by +offset+ and +length+. The pairs at either edge of the
  # window are trimmed rather than returned whole.
  #
  # pairs  - ordered Array of [operation, value]. A value is either an Integer
  #          length or a String whose length is used.
  # offset - start of the window on the counted coordinate (coerced with to_i).
  # length - size of the window (coerced with to_i).
  # regexp - only operations matching this contribute to the running total;
  #          the default // matches everything.
  #
  # Returns a new Array of duplicated pairs; the pairs passed in are not modified.
  # Raises RuntimeError if a value is neither a String nor an Integer.
  def iterate_pairs(pairs, offset, length, regexp = //)
    offset = offset.to_i
    # Previously written as `length - length.to_i`, which discarded the result
    # and left a non-Integer length to fail in the arithmetic below.
    length = length.to_i
    total = 0
    new_array = []
    first = true

    pairs.each do |pair|
      new_pair = pair.dup
      pairlength =
        case pair[1]
        when String then pair[1].length
        when Integer then pair[1]
        else raise "Value for operation must be a string or integer"
        end

      # Only count pairs where first element matches a regexp.
      # e.g. for CIGAR:
      #   ref M + ref D = ref length
      #   query M + query I = query length
      total += pairlength if pair[0].match(regexp)

      # Just keep going until we get to the start of the subalignment
      next if total < offset

      # If the offset is partway through a pair, need to split it up.
      if first
        # adjust the number in this pair; it will be added below
        if pair[1].is_a? String
          new_pair[1] = new_pair[1][total - offset..pairlength]
        else
          new_pair[1] = total - offset + 1 # Add one for bases
        end
      end

      # Once we are at/beyond the end of the desired region:
      if total >= offset + length
        if first
          # Special case where the whole subalignment is contained within one element
          if pair[1].is_a? String
            new_pair[1] = new_pair[1][total - offset - pairlength, length]
          else
            new_pair[1] = length
          end
        else
          # Adding the last part of the alignment
          previous_total = total - pairlength
          if pair[1].is_a? Integer
            new_pair[1] = offset + length - previous_total - 1 # -1 extra for base arithmetic
          else
            new_pair[1] = new_pair[1][0..offset + length - previous_total]
          end
        end
        new_array << new_pair
        break
      end

      first = false
      new_array << new_pair
    end

    new_array
  end
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
# Extending Bio::DB::Alignment with mutation calling method
|
2
|
+
Bio::DB::Alignment.class_eval do
|
3
|
+
# Aliases included for more intuitive naming when using for genomic alignments:
|
4
|
+
alias_method :chr, :rname
|
5
|
+
alias_method :opt, :tags # vice versa as opt is the "proper" sam name for the tag fields
|
6
|
+
attr_accessor :cigar_obj
|
7
|
+
|
8
|
+
# Adds a tag to this alignment in place, replacing any existing entry stored
# under the same tag name, then rebuilds the SAM line.
#
# new_tag - a String (wrapped with Bio::DB::Tag.new) or a Bio::DB::Tag.
#
# Returns whatever regenerate_string returns.
# Raises RuntimeError for any other kind of argument.
def add_tag!(new_tag)
  tag_object =
    case new_tag
    when String then Bio::DB::Tag.new(new_tag)
    when Bio::DB::Tag then new_tag
    else raise "Tag not recognised - pass a string or Bio::DB::Tag object"
    end

  @tags[tag_object.tag] = tag_object
  regenerate_string
end
|
20
|
+
|
21
|
+
# Non-destructive counterpart of add_tag!: applies the tag to a copy and
# leaves the receiver untouched.
#
# Object#dup is shallow, so the copy would otherwise share the receiver's
# @tags hash and the new tag would leak into the original (whose SAM string
# would then be out of date). The copy is therefore given its own hash first.
#
# tag - a String or Bio::DB::Tag, as accepted by add_tag!.
#
# Returns whatever add_tag! returns for the copy.
def add_tag(tag)
  copy = dup
  copy.instance_variable_set(:@tags, @tags.dup)
  copy.add_tag!(tag)
end
|
24
|
+
|
25
|
+
# Output a representation of the query sequence
#
# Builds a String from the read (@seq) in which every mutation reported by
# #mutations for the requested window is marked inline:
# * deletion     - one "-" per base of the deleted reference sequence
# * insertion    - the inserted bases, lower-cased and wrapped in ins_chr
# * substitution - the substituted bases, lower-cased
# Stretches of the read between events are copied through unchanged.
#
# offset        - window start, passed straight to #mutations (default 1)
# length        - window length, passed straight to #mutations (default @seq.length)
# reference_pos - subtracted from each mutation position to obtain an index
#                 into @seq (default @pos-1)
# ins_chr       - String placed either side of inserted bases (default "_")
#
# Returns the assembled String.
# NOTE(review): the position of each mutation object returned by #mutations is
# overwritten below while the output is built.
def query offset=1, length=@seq.length, reference_pos=@pos-1, ins_chr="_"
  mutations = self.mutations(offset,length)
  cigar = Bio::Alignment::CIGAR.new(@cigar,seq,source="sam")
  # Part of the alignment lying before the window.
  preceding = cigar.subalignment(0,offset-1)
  # Difference between the query length of that part and offset-1; applied to
  # every mutation position and to the end of the final slice.
  preceding_diff = preceding.query_length-(offset-1)
  # Index into @seq of the next base still to be written out.
  pointer = preceding.query_length
  output = []
  # Running counts of deleted / inserted bases seen so far, used to shift the
  # positions of later events.
  deletions = 0
  insertions = 0
  # #mutations returns nil when nothing was called.
  if mutations
    mutations.each do |mut|
      mut.position = mut.position + insertions - deletions + preceding_diff
      case mut.type
      when :deletion
        # position for deletion is the first deleted base
        fillin = mut.position-1-reference_pos-1
        # Copy the unchanged read bases up to the event.
        output << @seq[pointer..fillin] if fillin > pointer
        mut.reference.length.times{ output << "-" }
        # Equivalent to pointer = fillin + 1.
        pointer += fillin - pointer + 1
        deletions += mut.reference.length
      when :insertion
        # position for insertion is the base we want
        fillin = mut.position-reference_pos-1
        output << @seq[pointer..fillin] if fillin > pointer
        output << ins_chr + mut.mutant.downcase + ins_chr
        # Move past the copied stretch and the inserted bases themselves.
        pointer += fillin - pointer + 1 + mut.mutant.length
        insertions += mut.mutant.length
      when :substitution
        # position for substitution is the first subbed base
        fillin = mut.position-1-reference_pos-1
        output << @seq[pointer..fillin] if fillin > pointer
        output << mut.mutant.downcase
        # Move past the copied stretch and the substituted bases.
        pointer += fillin - pointer + 1 + mut.mutant.length
      end
    end
  end
  # Remaining sequence
  if offset + length > pointer
    output << @seq[pointer..offset-1+length-1-deletions+insertions+preceding_diff]
  end
  output.join
end
|
68
|
+
|
69
|
+
# Call mutations
# Want to be able to give a length and offset - use this to generate
# appropriate sub CIGARs, subMDs & call.
#
# Insertions are read from the CIGAR string (bases taken from the read);
# substitutions and deletions are read from the MD:Z tag, which avoids the
# need to supply a reference sequence.
#
# offset            - start of the region of interest (default 1)
# length            - length of the region; defaults to the CIGAR reference
#                     length minus offset
# translation_start - subtracted from every reported position (default 1)
#
# Returns a Bio::MutationArray sorted with Bio::Mutation#<=>, or nil when the
# read is unmapped, the region runs past the end of the alignment, or no
# mutation is found. Sets @cigar_obj once the region has been validated.
def mutations(offset = 1, length = nil, translation_start = 1)
  return nil if @query_unmapped

  cigar = Bio::Alignment::CIGAR.new(@cigar, seq, "sam")
  length ||= cigar.reference_length - offset
  return nil if offset + length > cigar.reference_length

  read = Bio::Sequence::NA.new(@seq)
  @cigar_obj = cigar

  # Generate subalignments from the CIGAR and MD:Z
  subcigar = cigar.subalignment(offset, length)
  mdz = Bio::DB::Tag::MD.new(@tags["MD"].value).slice(offset, length)

  # Get inserted bases from the read sequence, only within the region of interest
  inserted_bases = []
  insertion_positions = subcigar.positions(/I/)
  unless insertion_positions.empty?
    insertion_positions["I"].each do |ins|
      # Need a -1 as ruby counts characters.
      # ins[2] is the position on the query and locates the bases; ins[1] is
      # passed as the slice length.
      inserted_bases << read.seq[(offset + ins[2] - 1), ins[1]]
    end
  end

  called = []
  reference_pos = @pos - 1
  subcigar.pairs.each do |pair|
    case pair[0]
    when "M", "D"
      # Deletions are called from the MD:Z tag below but still advance the
      # reference position here.
      reference_pos += pair[1]
    when "I"
      bases = inserted_bases.shift
      mut = Bio::Mutation.new
      mut.type = :insertion
      mut.reference = nil
      mut.position = reference_pos + offset - translation_start
      mut.mutant = bases ? bases.upcase : "N"
      mut.seqname = @rname.to_s
      called << mut
    end
  end

  # Now substitutions & deletions - these need the MD tag
  events = mdz.report(/[sd]/)
  unless events.empty?
    # Event offsets are defined on the subalignment, so the part of the read
    # before the region has to be added back on. Inserted bases are included
    # through query_length; masked length is not included in the MD:Z string
    # so it is added as well. Neither value depends on the event, so both are
    # computed once.
    preceding = cigar.subalignment(0, offset - 1)
    read_start = preceding.query_length + preceding.masked_length
    # This is the adjustment needed to get the correct annotation:
    substart = @pos + offset - translation_start - 1

    events.each do |event|
      # event[1] is the reference base(s) from the MD:Z tag, event[2] the
      # preceding length on the reference, event[3] on the read.
      case event[0]
      when "s"
        mut = Bio::Mutation.new
        mut.type = :substitution
        mut.position = substart + event[2] + 1
        mut.reference = event[1].upcase
        # The actual base has to come from the read.
        mut.mutant = read[read_start + event[3], event[1].length].upcase
        mut.seqname = @rname.to_s
        called << mut
      when "d"
        mut = Bio::Mutation.new
        mut.type = :deletion
        mut.reference = event[1].upcase
        mut.position = substart + event[2] + 1
        mut.mutant = nil
        mut.seqname = @rname.to_s
        called << mut
      end
    end
  end

  called.empty? ? nil : Bio::MutationArray.new(called.sort)
end
|
160
|
+
|
161
|
+
# Rebuilds the tab-separated SAM line from the alignment's fields and its
# tags, and stores it through the sam_string= writer.
#
# Each tag is rendered as "tag:type:value". The list of rendered tags is
# appended as a single nested element and flattened by the final join.
#
# Returns the rebuilt String.
def regenerate_string
  rendered_tags = @tags.map { |_name, entry| [entry.tag, entry.type, entry.value].join(":") }

  columns = [@qname, @flag, @rname, @pos, @mapq, @cigar, @mrnm, @mpos, @isize, @seq, @qual]
  columns << rendered_tags

  self.sam_string = columns.join("\t")
end
|
176
|
+
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Parsed representation of a SAM MD:Z tag.
#
# The tag text is broken into ordered [operation, value] pairs:
#   "m" - a run of matching bases (Integer length)
#   "s" - substituted reference bases (String)
#   "d" - deleted reference bases (String, written with a leading ^ in the tag)
class Bio::DB::Tag::MD
  include Bio::Alignment::IteratePairs
  attr_accessor :tag, :pairs, :cumulative

  # Full tag including the MD:Z: leader; group 1 is the value.
  @@regexp = /MD:Z:([\w^]+)/
  # Bare tag value given without the leader.
  @@format = /[\w^]+/
  @@splitter = /(?<match>\d+)|(?<substitution>[GATCN]+)|\^(?<deletion>[GATCN]+)/
  # Operations that consume reference sequence:
  @@reference = /[msd]/

  # data - a String ("MD:Z:60^G13" or just "60^G13") or a Bio::DB::Tag whose
  #        value is used.
  #
  # Raises RuntimeError if data is neither, or is a String matching neither
  # pattern. Warns (without raising) when a Bio::DB::Tag is not named "MD".
  def initialize(data)
    case data
    when String
      with_leader = data.match(@@regexp)
      if with_leader
        @tag = with_leader[1]
      elsif data.match(@@format)
        # Assume tag given without MD:Z: leader
        @tag = data
      else
        raise "Tag not of expected format."
      end
    when Bio::DB::Tag
      @tag = data.value
      # The condition used to be inverted, warning only for genuine MD tags.
      warn "Not an MD tag" unless data.tag == "MD"
    else
      raise "Tag not of expected format."
    end

    # scan returns one [match, substitution, deletion] array per operation;
    # the alternation guarantees exactly one of the three is non-nil.
    @pairs = @tag.scan(@@splitter).map do |match, substitution, deletion|
      if match
        ["m", match.to_i]
      elsif substitution
        ["s", substitution]
      else
        ["d", deletion]
      end
    end

    # Each cumulative entry is the pair followed by two extra elements:
    # the total preceding length on the reference (i.e. the position of the
    # operation) and the same for the read.
    reference_total = 0
    read_total = 0
    @cumulative = @pairs.map do |operation, value|
      span = operation == "m" ? value : value.length
      entry = [operation, value, reference_total, read_total]
      reference_total += span
      # Deleted bases don't appear in the read, so don't count to its length
      read_total += span unless operation == "d"
      entry
    end
  end

  # Cumulative entries for deletions only.
  def deletions
    report(/d/)
  end

  # Cumulative entries for substitutions only.
  def substitutions
    report(/s/)
  end

  # Report the positions of given events.
  #
  # regexp - matched against the operation letter (default: substitutions
  #          and deletions).
  #
  # Returns an Array of [operation, value, reference_offset, read_offset].
  def report(regexp = /[sd]/)
    @cumulative.select { |entry| entry[0] =~ regexp }
  end

  # Reconstruct a MD:Z tag value from a pairs array (default: this tag's own).
  def reconstruct_tag(array = @pairs)
    array.map do |operation, value|
      case operation
      when "m" then value.to_s
      when "s" then value
      when "d" then "^" + value
      end
    end.join("")
  end

  # Sums the total length of the reference sequence represented by the MD:Z
  # tag (or part of): all match runs plus one per substituted or deleted base.
  def ref_length
    return @tag.to_i if @tag =~ /^\d+$/

    # Deletions need to be counted - drop the caret and count the remaining
    # base characters along with the substitutions.
    stripped = @tag.gsub(/\^/, "")
    # reduce is seeded with 0 so that a tag holding only bases (possible after
    # slicing) sums to its base count instead of failing on nil + Integer.
    matched = stripped.split(/[GATCN]+/).map(&:to_i).reduce(0, :+)
    bases = stripped.split(/\d+/).map(&:length).reduce(0, :+)
    matched + bases
  end

  # Given an offset in reference sequence and length, return a new MD object
  # corresponding to that subregion of the alignment.
  def slice(offset, length)
    window = iterate_pairs(@pairs, offset, length, @@reference)
    Bio::DB::Tag::MD.new(reconstruct_tag(window))
  end
end
|
124
|
+
|
125
|
+
|
126
|
+
# DKNQZ:00025:00303 0 5 112767204 37 60M1D7M2I6M * 0 0 GCAGTAATTTCCCTGGAGTAAAACTGCGGTCAAAAATGTCCCTCCGTTCTTATGGAAGCCGGAAGGAAGTCTGTA CCCCCC@CE>CC<CC@CB;;;;.;;;;;AC;::::+:92A:=CCAEE=?>;=:@<B?:<6<*/*/*/*/911112 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:60^G13
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Used for tracking mutations appearing more than once and caching VEP lookups.
#
# MutantAllele.previous_lookups is a class-level Hash keyed by the HGVS string
# of an allele. #lookup reads from it but does not write to it.
class MutantAllele
  attr_accessor :mutations, :count, :example, :seq

  class << self
    attr_accessor :previous_lookups
  end
  self.previous_lookups = {}

  # mutations - object responding to to_hgvs and vep (used by #lookup)
  # count     - number of times the allele has been seen (default 0)
  # example   - an example for this allele (default nil)
  # seq       - sequence for this allele (default nil); previously accepted
  #             but never stored
  def initialize(mutations: nil, count: 0, example: nil, seq: nil)
    @mutations = mutations
    @count = count
    @example = example
    @seq = seq
  end

  # Returns the cached entry for this allele's HGVS key when there is one,
  # otherwise the result of mutations.vep(species, ref_type).
  #
  # species  - passed through to vep (default "human")
  # ref_type - passed to to_hgvs and vep (default nil)
  def lookup(species = "human", ref_type = nil)
    key = mutations.to_hgvs(ref_type)
    cache = MutantAllele.previous_lookups
    return cache[key] if key && cache.key?(key)

    mutations.vep(species, ref_type)
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# A single called mutation: a substitution, insertion or deletion at a
# position on a named sequence.
class Bio::Mutation
  include VepHgvs
  attr_accessor :position, :type, :reference, :mutant, :seqname

  # Values used for any key the caller does not supply.
  DEFAULT_PARAMS = {
    position: 1,
    type: :uninitialized,
    reference: nil,
    mutant: nil,
    seqname: nil
  }.freeze

  # params - Hash with any of :position, :type, :reference, :mutant, :seqname.
  #          Missing keys fall back to DEFAULT_PARAMS. Previously the defaults
  #          only applied when no argument at all was given, so a partial hash
  #          left every other attribute nil.
  def initialize(params = {})
    settings = DEFAULT_PARAMS.merge(params.to_h)
    @position = settings[:position]
    @type = settings[:type]
    @reference = settings[:reference]
    @mutant = settings[:mutant]
    @seqname = settings[:seqname]
  end

  # Orders mutations by position. Returns 0, 1 or -1.
  def <=>(other)
    return 0 if position == other.position

    position > other.position ? 1 : -1
  end

  # http://www.hgvs.org/mutnomen/recs.html
  # This gives just the annotation. To convert to a full allele description it
  # needs to be combined with e.g. "g." for genomic - supply "g", "c" etc. as
  # reference_type to annotate a single mutation directly, giving
  # "seqname:type.annotation". For compound mutants an array of annotations
  # has to be joined, e.g. 1:g.[213456A>C;213460_213461delTG]
  #
  # Returns the annotation String, or nil when the type is not :deletion,
  # :substitution or :insertion.
  def to_hgvs(reference_type = nil)
    start = reference_type ? "#{@seqname}:#{reference_type}.#{@position}" : @position.to_s

    case @type
    when :deletion
      if @reference.length == 1
        start + "del" + @reference
      else
        "#{start}_#{@position.to_i + @reference.length - 1}del#{@reference}"
      end
    when :substitution
      # Multi-base substitutions are annotated with a start_end range.
      start += "_#{@position.to_i + @reference.length - 1}" if @reference.length > 1
      start + @reference + ">" + @mutant
    when :insertion
      # An insertion is placed between the position and the base after it.
      # TODO - distinguish duplications from insertions? Needs further input from ref.
      "#{start}_#{@position.to_i + 1}" + "ins" + @mutant
    end
  end

  # Serialises the object with Oj.
  def to_json
    Oj.dump self
  end

  # Serialises the object with YAML.
  def to_yaml
    YAML.dump self
  end
end
|