bio-sam-mutation 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +21 -0
- data/LICENSE.txt +20 -0
- data/README.md +88 -0
- data/README.rdoc +48 -0
- data/Rakefile +54 -0
- data/bin/mutations +108 -0
- data/bin/sam-mutation +20 -0
- data/lib/bio-sam-mutation.rb +26 -0
- data/lib/bio-sam-mutation/bio/alignment/cigar.rb +239 -0
- data/lib/bio-sam-mutation/bio/alignment/iterate_pairs.rb +68 -0
- data/lib/bio-sam-mutation/bio/db/alignment.rb +176 -0
- data/lib/bio-sam-mutation/bio/db/tag.rb +5 -0
- data/lib/bio-sam-mutation/bio/db/tag/md.rb +126 -0
- data/lib/bio-sam-mutation/bio/mutantallele.rb +24 -0
- data/lib/bio-sam-mutation/bio/mutation.rb +63 -0
- data/lib/bio-sam-mutation/bio/mutation_array.rb +15 -0
- data/lib/bio-sam-mutation/bio/vephgvs.rb +21 -0
- data/lib/bio-sam-mutation/mutationscli.rb +83 -0
- data/test/helper.rb +34 -0
- data/test/test_cigar.rb +145 -0
- data/test/test_mdtag.rb +46 -0
- data/test/test_mutant_allele.rb +21 -0
- data/test/test_mutation.rb +84 -0
- data/test/test_mutation_array.rb +13 -0
- data/test/test_sam.rb +160 -0
- data/test/test_vep_hgvs.rb +9 -0
- metadata +247 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
module Bio::Alignment::IteratePairs
  private

  # Mixin to iterate through ordered paired [operation, value] data and take
  # subsets - e.g. broken down CIGAR and MD:Z tags.
  #
  # pairs  - Array of two-element arrays [operation, value]. The value must be
  #          a String (its length is counted) or an Integer (counted directly).
  # offset - start of the wanted region, in units of counted operations;
  #          coerced with to_i.
  # length - size of the wanted region; coerced with to_i.
  # regexp - only operations whose name matches contribute to the running
  #          total; the default matches everything.
  #
  # Returns a new Array of (duplicated) pairs covering the region, with the
  # first and last pair trimmed where the region starts or ends partway
  # through them. The input pairs are not modified.
  #
  # Raises RuntimeError when a value is neither a String nor an Integer.
  def iterate_pairs(pairs, offset, length, regexp = //)
    offset = offset.to_i
    # Coerce the length too, so numeric strings can be used in the arithmetic below.
    length = length.to_i
    total = 0
    new_array = []
    first = true
    pairs.each do |pair|
      new_pair = pair.dup
      pairlength =
        case pair[1]
        when String then pair[1].length
        when Integer then pair[1]
        else raise "Value for operation must be a string or integer"
        end
      # Only count pairs where first element matches a regexp.
      # e.g. for CIGAR:
      # ref M + ref D = ref length
      # query M + query I = query length
      total += pairlength if pair[0].match(regexp)
      # Just keep going until we get to the start of the subalignment
      next if total < offset
      # If the offset is partway through a pair, need to split it up.
      if first
        # adjust the number in this pair; it will be added below
        if pair[1].is_a? String
          new_pair[1] = new_pair[1][total - offset..pairlength]
        else
          new_pair[1] = total - offset + 1 # Add one for bases
        end
      end

      # Once we are at/beyond the end of the desired region:
      if total >= offset + length
        # Special case where the whole subalignment is contained within one cigar element:
        if first
          if pair[1].is_a? String
            # Note: this slices the value already trimmed by the block above.
            new_pair[1] = new_pair[1][total - offset - pairlength, length]
          else
            new_pair[1] = length
          end
          new_array << new_pair
          break
        end
        # Adding the last part of the alignment
        previous_total = total - pairlength
        if pair[1].is_a? Integer
          new_pair[1] = offset + length - previous_total - 1 #-1 extra for base arithmetic
        else
          new_pair[1] = new_pair[1][0..offset + length - previous_total]
        end
        new_array << new_pair
        break
      end
      first = false
      new_array << new_pair
    end
    new_array
  end
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
# Extending Bio::DB::Alignment with mutation calling method
Bio::DB::Alignment.class_eval do
  # Aliases included for more intuitive naming when using for genomic alignments:
  alias_method :chr, :rname
  alias_method :opt, :tags # vice versa as opt is the "proper" sam name for the tag fields
  # The CIGAR object built by the most recent call to #mutations.
  attr_accessor :cigar_obj

  # Adds (or replaces) a tag on this alignment in place and rebuilds the SAM string.
  #
  # new_tag - a String (handed to Bio::DB::Tag.new) or a Bio::DB::Tag.
  #
  # Returns the regenerated SAM string.
  # Raises RuntimeError when new_tag is neither a String nor a Bio::DB::Tag.
  def add_tag!(new_tag)
    if new_tag.is_a? String
      new_tag = Bio::DB::Tag.new(new_tag)
    else
      raise "Tag not recognised - pass a string or Bio::DB::Tag object" unless new_tag.is_a? Bio::DB::Tag
    end
    @tags[new_tag.tag] = new_tag

    regenerate_string
  end

  # Non-destructive variant of #add_tag!: applies the tag to a copy.
  #
  # Object#dup is shallow, so the copy would otherwise share this alignment's
  # @tags hash and the "copy" would silently gain the tag on the receiver too
  # (leaving the receiver's sam_string stale). The hash is duplicated first.
  #
  # Returns whatever #add_tag! returns for the copy (the regenerated SAM string).
  def add_tag(tag)
    copy = dup
    copy.instance_variable_set(:@tags, @tags.dup)
    copy.add_tag!(tag)
  end

  # Output a representation of the query sequence.
  #
  # Deleted reference bases are written as "-", inserted bases are lower-cased
  # and wrapped in ins_chr, substituted bases are lower-cased; everything else
  # is copied from the read sequence.
  #
  # offset        - start of the region passed on to #mutations (default 1).
  # length        - length of the region (default: the read length).
  # reference_pos - reference coordinate subtracted from mutation positions
  #                 to obtain read indices (default @pos - 1).
  # ins_chr       - String placed either side of inserted bases.
  #
  # Returns a String.
  def query(offset = 1, length = @seq.length, reference_pos = @pos - 1, ins_chr = "_")
    called = self.mutations(offset, length)
    cigar = Bio::Alignment::CIGAR.new(@cigar, seq, "sam")
    preceding = cigar.subalignment(0, offset - 1)
    preceding_diff = preceding.query_length - (offset - 1)
    # Index into @seq of the next base that has not been written out yet.
    pointer = preceding.query_length
    output = []
    deletions = 0
    insertions = 0
    if called
      called.each do |mut|
        # Shift the position by the indels already emitted so it indexes the read.
        mut.position = mut.position + insertions - deletions + preceding_diff
        case mut.type
        when :deletion
          # position for deletion is the first deleted base
          fillin = mut.position - 1 - reference_pos - 1
          output << @seq[pointer..fillin] if fillin > pointer
          mut.reference.length.times { output << "-" }
          pointer += fillin - pointer + 1
          deletions += mut.reference.length
        when :insertion
          # position for insertion is the base we want
          fillin = mut.position - reference_pos - 1
          output << @seq[pointer..fillin] if fillin > pointer
          output << ins_chr + mut.mutant.downcase + ins_chr
          pointer += fillin - pointer + 1 + mut.mutant.length
          insertions += mut.mutant.length
        when :substitution
          # position for substitution is the first subbed base
          fillin = mut.position - 1 - reference_pos - 1
          output << @seq[pointer..fillin] if fillin > pointer
          output << mut.mutant.downcase
          pointer += fillin - pointer + 1 + mut.mutant.length
        end
      end
    end
    # Remaining sequence
    if offset + length > pointer
      output << @seq[pointer..offset - 1 + length - 1 - deletions + insertions + preceding_diff]
    end
    output.join
  end

  # Call mutations
  # Want to be able to give a length and offset - use this to generate appropriate sub CIGARs, subMDs & call
  #
  # offset            - start of the region of interest (default 1).
  # length            - length of the region; defaults to the CIGAR reference
  #                     length minus offset.
  # translation_start - subtracted from the reported positions (default 1).
  #
  # Returns a Bio::MutationArray of sorted Bio::Mutation objects, or nil when
  # the read is unmapped, the region runs past the CIGAR reference length, or
  # nothing was called. Also stores the full CIGAR object in @cigar_obj.
  #
  # NOTE(review): a record with no "MD" tag makes @tags["MD"].value raise
  # NoMethodError; this is left as is so callers see the same exception.
  def mutations(offset = 1, length = nil, translation_start = 1)
    return nil if @query_unmapped
    cigar = Bio::Alignment::CIGAR.new(@cigar, seq, "sam")
    length ||= cigar.reference_length - offset
    return nil if offset + length > cigar.reference_length
    read_seq = Bio::Sequence::NA.new(@seq)
    @cigar_obj = cigar
    # Generate subalignments from the CIGAR and MD:Z
    subcigar = cigar.subalignment(offset, length)
    mdz = Bio::DB::Tag::MD.new(@tags["MD"].value)
    mdz = mdz.slice(offset, length)
    # Get inserted bases from the read sequence, only within the region of interest
    insertions = []
    insertion_positions = subcigar.positions(/I/)
    unless insertion_positions.empty?
      insertion_positions["I"].each do |ins|
        # Sam.seq returns a Sequence::NA object
        # Need a -1 as ruby counts characters
        # ins[2] locates the bases on the query.
        # NOTE(review): ins[1] is passed as the slice length here - confirm
        # against what CIGAR#positions returns.
        insertions << read_seq.seq[(offset + ins[2] - 1), ins[1]]
      end
    end

    calls = []
    reference_pos = @pos - 1
    subcigar.pairs.each do |pair|
      case pair[0]
      when "M", "D"
        # Matches and deletions both advance along the reference.
        # Deletions themselves are called below from the MD:Z tag - avoids the
        # need to supply a reference sequence.
        reference_pos += pair[1]
      when "I"
        mut = Bio::Mutation.new
        mut.type = :insertion
        mut.reference = nil
        mut.position = reference_pos + offset - translation_start
        bases = insertions.shift
        mut.mutant = bases ? bases.upcase : "N"
        mut.seqname = @rname.to_s
        calls << mut
      end
    end

    # Now substitutions & deletions - these need the MD tag
    sub_pos = mdz.report(/[sd]/)
    unless sub_pos.empty?
      # Both values are the same for every event, so compute them once.
      # Need to add in any inserted bases from the CIGAR string using query_length
      preceding = cigar.subalignment(0, offset - 1)
      # This is the adjustment needed to get the correct annotation:
      substart = @pos + offset - translation_start - 1
      sub_pos.each do |p|
        # Reference base is in the MD:Z tag (p[1] here), for the actual base need to go to the read
        # p[3] is the length of operations preceding the substitution on the read, p[2] on the reference.
        # p[2] and p[3] are defined on the subalignment, so should add them onto the preceding.
        # Masked length is not included in the MD:Z string so need to add it
        read_position = preceding.query_length + preceding.masked_length + p[3]
        case p[0]
        when "s"
          mut = Bio::Mutation.new
          mut.type = :substitution
          mut.position = substart + p[2] + 1
          mut.reference = p[1].upcase
          mut.mutant = read_seq[read_position, p[1].length].upcase
          mut.seqname = @rname.to_s
          calls << mut
        when "d"
          mut = Bio::Mutation.new
          mut.type = :deletion
          mut.reference = p[1].upcase
          mut.position = substart + p[2] + 1
          mut.mutant = nil
          mut.seqname = @rname.to_s
          calls << mut
        end
      end
    end
    calls.empty? ? nil : Bio::MutationArray.new(calls.sort)
  end

  # Rebuilds sam_string from the individual fields and the current tags,
  # joined with tabs. Returns the new string.
  def regenerate_string
    tags_string = @tags.map { |_k, v| [v.tag, v.type, v.value].join(":") }
    # Array#join flattens the nested tag array, so each tag becomes its own column.
    self.sam_string = [@qname,
                       @flag,
                       @rname,
                       @pos,
                       @mapq,
                       @cigar,
                       @mrnm,
                       @mpos,
                       @isize,
                       @seq,
                       @qual,
                       tags_string].join("\t")
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Parsed representation of a SAM MD:Z tag as ordered [operation, value] pairs:
# "m" (match, Integer run length), "s" (substitution, reference bases) and
# "d" (deletion, deleted reference bases).
class Bio::DB::Tag::MD
  include Bio::Alignment::IteratePairs
  # tag        - the raw tag value (without the MD:Z: leader)
  # pairs      - Array of [operation, value]
  # cumulative - pairs extended to [operation, value, preceding reference
  #              length, preceding read length]
  attr_accessor :tag, :pairs, :cumulative
  @@regexp = /MD:Z:([\w^]+)/
  @@format = /[\w^]+/
  @@splitter = /(?<match>\d+)|(?<substitution>[GATCN]+)|\^(?<deletion>[GATCN]+)/
  # Operations that consume reference sequence:
  @@reference = /[msd]/

  # data - a String (with or without the "MD:Z:" leader) or a Bio::DB::Tag.
  #
  # Raises RuntimeError ("Tag not of expected format.") for anything else, or
  # for a String matching neither pattern. A Bio::DB::Tag whose tag name is
  # not "MD" is accepted with a warning.
  def initialize(data)
    if data.is_a? String
      if (with_leader = data.match(@@regexp))
        @tag = with_leader[1]
      elsif data.match(@@format)
        # Assume tag given without MD:Z: leader
        @tag = data
      else
        raise "Tag not of expected format."
      end
    elsif data.is_a? Bio::DB::Tag
      @tag = data.value
      # Warn only when the tag is something other than MD.
      warn "Not an MD tag" unless data.tag == "MD"
    else
      raise "Tag not of expected format."
    end

    # Split the string into operations using the splitter regexp class variable.
    # scan yields [match, substitution, deletion] per hit; the captures are
    # named, but scan returns plain arrays rather than MatchData objects.
    # Only one of the three is non-nil for any hit.
    @pairs = @tag.scan(@@splitter).map do |match, substitution, deletion|
      if match
        ["m", match.to_i]
      elsif substitution
        ["s", substitution]
      else
        ["d", deletion]
      end
    end

    @cumulative = []
    ref_consumed = 0
    read_consumed = 0
    @pairs.each do |operation, value|
      ref_len = operation == "m" ? value : value.length
      # Deleted bases don't appear in the read, so don't count to the length
      read_len = operation == "d" ? 0 : ref_len
      # third element in each array will be the total preceding length on the reference, i.e. the position of the operation.
      # fourth element is similar for the read.
      @cumulative << [operation, value, ref_consumed, read_consumed]
      ref_consumed += ref_len
      read_consumed += read_len
    end
  end

  # Cumulative entries for deletions only.
  def deletions
    report(/d/)
  end

  # Cumulative entries for substitutions only.
  def substitutions
    report(/s/)
  end

  # Report the positions of given events: the entries of @cumulative whose
  # operation matches regexp.
  def report(regexp=/[sd]/)
    @cumulative.select { |entry| entry[0] =~ regexp }
  end

  # Reconstruct a MD:Z tag from the pairs array
  def reconstruct_tag(array=@pairs)
    parts = array.map do |p|
      case p[0]
      when "m"
        p[1].to_s
      when "s"
        p[1]
      when "d"
        "^"+p[1]
      end
    end
    parts.join("")
  end

  # Sums the total length of the reference sequence represented by the MD:Z tag (or part of)
  def ref_length
    # Need the sum of all "movement" operations (i.e. numbers) as well as any substituted bases (count 1 each)
    if @tag =~ /^\d+$/
      @tag.to_i
    else
      temp_tag = @tag.dup
      temp_tag.gsub!(/\^/,"") # Deletions need to be counted - sub the caret character out and count the remaining base characters
      # Start both sums at 0: a tag made only of bases (e.g. a sliced "A" or
      # "^G") splits into an empty array, for which reduce(:+) would be nil.
      movements = temp_tag.split(/[GATCN]+/).map(&:to_i).reduce(0, :+) # Sum numbers
      base_count = temp_tag.split(/\d+/).map(&:length).reduce(0, :+) # Sum number of base chars
      movements + base_count
    end
  end

  # Given an offset in reference sequence and length, return an object corresponding to that subregion of the alignment
  def slice(offset,length)
    new_array = iterate_pairs(@pairs,offset,length,@@reference)
    # Return a MDZ instance with just the new alignment
    new_tag = reconstruct_tag(new_array)
    Bio::DB::Tag::MD.new(new_tag)
  end
end
|
124
|
+
|
125
|
+
|
126
|
+
# DKNQZ:00025:00303 0 5 112767204 37 60M1D7M2I6M * 0 0 GCAGTAATTTCCCTGGAGTAAAACTGCGGTCAAAAATGTCCCTCCGTTCTTATGGAAGCCGGAAGGAAGTCTGTA CCCCCC@CE>CC<CC@CB;;;;.;;;;;AC;::::+:92A:=CCAEE=?>;=:@<B?:<6<*/*/*/*/911112 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:60^G13
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Used for tracking mutations appearing more than once and caching VEP lookups
class MutantAllele
  attr_accessor :mutations, :count, :example, :seq

  class << self
    # Hash of lookup key (HGVS string) => previously obtained VEP result.
    # This class only reads it; presumably it is filled in by whatever
    # performs the lookups - verify against the callers.
    attr_accessor :previous_lookups
  end
  self.previous_lookups = {}

  # mutations - object responding to to_hgvs and vep (used by #lookup)
  # count     - number of times the allele has been seen (default 0)
  # example   - an example record for this allele
  # seq       - sequence associated with the allele
  def initialize(mutations: nil, count: 0, example: nil, seq: nil)
    @mutations = mutations
    @count = count
    @example = example
    # Store the seq: keyword so the seq accessor returns what was passed in.
    @seq = seq
  end

  # Returns JSON from Ensembl VEP
  #
  # Uses the entry in MutantAllele.previous_lookups when one exists for this
  # allele's HGVS key; otherwise asks the mutations object for a fresh result.
  def lookup(species = "human", ref_type = nil)
    key = mutations.to_hgvs(ref_type)
    if key && MutantAllele.previous_lookups.key?(key)
      MutantAllele.previous_lookups[key]
    else
      mutations.vep(species, ref_type)
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# A single called mutation (substitution, deletion or insertion) at a position
# on a named reference sequence.
class Bio::Mutation
  include VepHgvs
  attr_accessor :position, :type, :reference, :mutant, :seqname

  # Values used for any key the caller does not supply.
  DEFAULT_PARAMS = { position: 1, type: :uninitialized, reference: nil, mutant: nil, seqname: nil }.freeze

  # params - Hash with any of :position, :type, :reference, :mutant, :seqname.
  #          Missing keys take their value from DEFAULT_PARAMS, so a partial
  #          hash no longer wipes out the remaining defaults.
  def initialize(params = {})
    params = DEFAULT_PARAMS.merge(params)
    @position = params[:position]
    @type = params[:type]
    @reference = params[:reference]
    @mutant = params[:mutant]
    @seqname = params[:seqname]
  end

  # Orders mutations by position only; type and bases are not compared.
  def <=>(other)
    position <=> other.position
  end

  # http://www.hgvs.org/mutnomen/recs.html
  # This gives just the annotation. To convert to a full allele description, needs to be combined
  # with e.g. g. for genomic: g. - can supply this "g", "c" as type to annotate a single mutation directly
  # for compound mutants, need to join an array of annotations e.g. 1:g.[213456A>C;213460_213461delTG]
  #
  # Returns the annotation String, or nil when @type is not one of
  # :deletion, :substitution or :insertion.
  def to_hgvs(reference_type=nil)
    hgvs_arr =
      if reference_type
        [@seqname, ":", reference_type, ".", @position.to_s]
      else
        [@position.to_s]
      end

    case @type
    when :deletion
      if @reference.length == 1
        hgvs_arr << "del"+@reference
      else
        # Multi-base deletion: report the start_end range.
        hgvs_arr = hgvs_arr + ["_",
                               (@position.to_i+@reference.length-1).to_s,
                               "del",
                               @reference]
      end
      hgvs_arr.join

    when :substitution
      if @reference.length > 1
        hgvs_arr = hgvs_arr + ["_",
                               (@position.to_i+@reference.length-1).to_s]
      end
      hgvs_arr << @reference+">"+@mutant
      hgvs_arr.join

    when :insertion
      # Insertions are reported between the position and the following base.
      hgvs_arr << "_" + (@position.to_i+1).to_s
      hgvs_arr << "ins"+@mutant
      hgvs_arr.join
      # TODO - distinguish duplications from insertions? Needs further input from ref.
    end
  end

  # Serialises this object with Oj.
  def to_json
    Oj.dump self
  end

  # Serialises this object with YAML.
  def to_yaml
    YAML.dump self
  end
end
|