bio-sam-mutation 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,68 @@
1
# Mixin to iterate through ordered, paired [operation, value] data and take
# subsets - e.g. broken down CIGAR and MD:Z tags.
module Bio::Alignment::IteratePairs
  private

  # Walks +pairs+ and returns the sub-list of pairs covering +length+ units
  # starting at +offset+, trimming the first and last pair where the window
  # starts or ends partway through them.
  #
  # @param pairs [Array<Array>] ordered [operation, value] pairs; each value is
  #   either an Integer (a count) or a String (whose length is the count)
  # @param offset [#to_i] start of the window, coerced with +to_i+
  # @param length [#to_i] size of the window, coerced with +to_i+
  # @param regexp [Regexp] only pairs whose operation matches contribute to the
  #   running total; the default matches everything
  # @return [Array<Array>] new pairs (shallow copies; the input pairs are not modified)
  # @raise [RuntimeError] if a value is neither a String nor an Integer
  def iterate_pairs(pairs, offset, length, regexp = //)
    offset = offset.to_i
    # Fix: this line used to read `length - length.to_i`, a no-op expression,
    # so a non-Integer length was never coerced and failed at `offset + length`.
    length = length.to_i
    total = 0
    new_array = []
    first = true

    pairs.each do |pair|
      new_pair = pair.dup
      value = pair[1]
      pairlength =
        case value
        when String then value.length
        when Integer then value
        else raise "Value for operation must be a string or integer"
        end

      # Only count pairs where first element matches a regexp.
      # e.g. for CIGAR:
      #   ref M + ref D = ref length
      #   query M + query I = query length
      total += pairlength if pair[0].match(regexp)

      # Just keep going until we get to the start of the subalignment
      next if total < offset

      # If the offset is partway through a pair, need to split it up.
      # The adjusted pair is appended further down.
      if first
        new_pair[1] =
          if value.is_a?(String)
            new_pair[1][(total - offset)..pairlength]
          else
            total - offset + 1 # Add one for bases
          end
      end

      # Once we are at/beyond the end of the desired region:
      if total >= offset + length
        new_pair[1] =
          if first
            # Special case where the whole subalignment is contained within one element.
            # For Strings this slices the value already trimmed above.
            if value.is_a?(String)
              new_pair[1][total - offset - pairlength, length]
            else
              length
            end
          else
            # Adding the last part of the alignment
            previous_total = total - pairlength
            if value.is_a?(Integer)
              offset + length - previous_total - 1 # -1 extra for base arithmetic
            else
              new_pair[1][0..(offset + length - previous_total)]
            end
          end
        new_array << new_pair
        break
      end

      first = false
      new_array << new_pair
    end

    new_array
  end
end
@@ -0,0 +1,176 @@
1
# Extending Bio::DB::Alignment with mutation calling method
Bio::DB::Alignment.class_eval do
  # Aliases included for more intuitive naming when using for genomic alignments:
  alias_method :chr, :rname
  alias_method :opt, :tags # vice versa as opt is the "proper" sam name for the tag fields
  # Set by #mutations to the Bio::Alignment::CIGAR object it built.
  attr_accessor :cigar_obj

  # Adds a tag to this alignment in place (stored in @tags under the tag's
  # name, so an existing tag of the same name is replaced) and regenerates
  # the SAM string.
  #
  # @param new_tag [String, Bio::DB::Tag] a String is parsed with Bio::DB::Tag.new
  # @return whatever #regenerate_string returns
  # @raise [RuntimeError] if new_tag is neither a String nor a Bio::DB::Tag
  def add_tag!(new_tag)
    if new_tag.is_a? String
      new_tag = Bio::DB::Tag.new(new_tag)
    else
      raise "Tag not recognised - pass a string or Bio::DB::Tag object" unless new_tag.is_a? Bio::DB::Tag
    end
    @tags[new_tag.tag] = new_tag

    regenerate_string
  end

  # Non-destructive counterpart of #add_tag!: applies the tag to a copy and
  # returns what #add_tag! returns for that copy.
  #
  # Fix: Object#dup is a shallow copy, so the copy used to share this object's
  # @tags hash and the "non-bang" method silently added the tag to the receiver
  # too. The copy now gets its own hash.
  # NOTE(review): assumes the class does not already deep-copy @tags in
  # initialize_copy - harmless if it does.
  def add_tag(tag)
    copy = dup
    copy.instance_variable_set(:@tags, @tags.dup) if @tags
    copy.add_tag!(tag)
  end

  # Output a representation of the query sequence: deleted bases are shown as
  # "-", inserted bases are lower-cased and wrapped in +ins_chr+, substituted
  # bases are lower-cased.
  #
  # @param offset [Integer] passed to #mutations and used to build the
  #   preceding subalignment
  # @param length [Integer] passed to #mutations; defaults to the read length
  # @param reference_pos [Integer] defaults to @pos - 1
  # @param ins_chr [String] marker placed either side of inserted bases
  # @return [String]
  def query(offset = 1, length = @seq.length, reference_pos = @pos - 1, ins_chr = "_")
    called = mutations(offset, length)
    cigar = Bio::Alignment::CIGAR.new(@cigar, seq, "sam")
    preceding = cigar.subalignment(0, offset - 1)
    preceding_diff = preceding.query_length - (offset - 1)
    pointer = preceding.query_length
    output = []
    deletions = 0
    insertions = 0
    if called
      called.each do |mut|
        # Shift the reported position by the indels already emitted.
        mut.position = mut.position + insertions - deletions + preceding_diff
        case mut.type
        when :deletion
          # position for deletion is the first deleted base
          fillin = mut.position - 1 - reference_pos - 1
          output << @seq[pointer..fillin] if fillin > pointer
          mut.reference.length.times { output << "-" }
          pointer += fillin - pointer + 1
          deletions += mut.reference.length
        when :insertion
          # position for insertion is the base we want
          fillin = mut.position - reference_pos - 1
          output << @seq[pointer..fillin] if fillin > pointer
          output << ins_chr + mut.mutant.downcase + ins_chr
          pointer += fillin - pointer + 1 + mut.mutant.length
          insertions += mut.mutant.length
        when :substitution
          # position for substitution is the first subbed base
          fillin = mut.position - 1 - reference_pos - 1
          output << @seq[pointer..fillin] if fillin > pointer
          output << mut.mutant.downcase
          pointer += fillin - pointer + 1 + mut.mutant.length
        end
      end
    end
    # Remaining sequence
    if offset + length > pointer
      output << @seq[pointer..(offset - 1 + length - 1 - deletions + insertions + preceding_diff)]
    end
    output.join
  end

  # Call mutations within a sub-region of the alignment, using sub-CIGAR and
  # sub-MD:Z data so no reference sequence needs to be supplied.
  #
  # @param offset [Integer] start of the region of interest
  # @param length [Integer, nil] defaults to cigar.reference_length - offset
  # @param translation_start [Integer] subtracted when annotating positions
  # @return [Bio::MutationArray, nil] sorted mutations; nil when the read is
  #   flagged unmapped, when offset + length exceeds the CIGAR reference
  #   length, or when nothing was called
  # NOTE(review): raises NoMethodError if the alignment has no MD tag
  # (@tags["MD"] is nil); behaviour left unchanged.
  def mutations(offset = 1, length = nil, translation_start = 1)
    return nil if @query_unmapped

    cigar = Bio::Alignment::CIGAR.new(@cigar, seq, "sam")
    length ||= cigar.reference_length - offset
    return nil if offset + length > cigar.reference_length

    # Named `read` rather than `seq` so it no longer shadows the #seq method used above.
    read = Bio::Sequence::NA.new(@seq)
    @cigar_obj = cigar
    # Generate subalignments from the CIGAR and MD:Z
    subcigar = cigar.subalignment(offset, length)
    mdz = Bio::DB::Tag::MD.new(@tags["MD"].value).slice(offset, length)

    # Get inserted bases from the read sequence, only within the region of interest
    insertions = []
    insertion_positions = subcigar.positions(/I/)
    unless insertion_positions.empty?
      insertion_positions["I"].each do |ins|
        # Need a -1 as ruby counts characters from zero.
        # ins[2] is used as the position on the query.
        # NOTE(review): ins[1] is used as the slice length here, although the
        # original comment described it as the reference position - confirm
        # against CIGAR#positions.
        insertions << read.seq[(offset + ins[2] - 1), ins[1]]
      end
    end

    called = []
    reference_pos = @pos - 1
    subcigar.pairs.each do |pair|
      case pair[0]
      when "M", "D"
        # Deletions are called from the MD:Z tag below but still advance the reference.
        reference_pos += pair[1]
      when "I"
        mut = Bio::Mutation.new
        mut.type = :insertion
        mut.reference = nil
        mut.position = reference_pos + offset - translation_start
        bases = insertions.shift
        mut.mutant = bases ? bases.upcase : "N"
        mut.seqname = @rname.to_s
        called << mut
      end
    end

    # Now substitutions & deletions - these need the MD tag
    sub_pos = mdz.report(/[sd]/)
    unless sub_pos.empty?
      # Loop-invariant values, previously recomputed for every entry.
      # Need to add in any inserted bases from the CIGAR string using query_length.
      preceding = cigar.subalignment(0, offset - 1)
      # Masked length is not included in the MD:Z string so need to add it
      read_base = preceding.query_length + preceding.masked_length
      # This is the adjustment needed to get the correct annotation:
      substart = @pos + offset - translation_start - 1

      sub_pos.each do |p|
        # p[1] is the reference base(s) from the MD:Z tag; the actual base comes from the read.
        # p[2] / p[3] are the preceding lengths on the reference / read within the subalignment.
        case p[0]
        when "s"
          mut = Bio::Mutation.new
          mut.type = :substitution
          mut.position = substart + p[2] + 1
          mut.reference = p[1].upcase
          mut.mutant = read[read_base + p[3], p[1].length].upcase
          mut.seqname = @rname.to_s
          called << mut
        when "d"
          mut = Bio::Mutation.new
          mut.type = :deletion
          mut.reference = p[1].upcase
          mut.position = substart + p[2] + 1
          mut.mutant = nil
          mut.seqname = @rname.to_s
          called << mut
        end
      end
    end

    called.empty? ? nil : Bio::MutationArray.new(called.sort)
  end

  # Rebuilds sam_string from the individual fields, tab-separated, with each
  # tag rendered as "TAG:TYPE:VALUE".
  def regenerate_string
    tags_string = @tags.map { |_k, v| [v.tag, v.type, v.value].join(":") }
    self.sam_string = [@qname,
                       @flag,
                       @rname,
                       @pos,
                       @mapq,
                       @cigar,
                       @mrnm,
                       @mpos,
                       @isize,
                       @seq,
                       @qual,
                       tags_string].join("\t")
  end
end
@@ -0,0 +1,5 @@
1
Bio::DB::Tag.class_eval do
  # Allows a tag to be built directly from its string form: when a truthy
  # +tag_string+ is supplied it is handed to #set; otherwise nothing is done.
  def initialize(tag_string = nil)
    return unless tag_string

    set(tag_string)
  end
end
@@ -0,0 +1,126 @@
1
# Parsed representation of a SAM MD:Z tag as an ordered list of
# [operation, value] pairs, where the operation is "m" (match, Integer count),
# "s" (substitution, String of reference bases) or "d" (deletion, String of
# deleted reference bases).
class Bio::DB::Tag::MD
  include Bio::Alignment::IteratePairs
  attr_accessor :tag, :pairs, :cumulative

  @@regexp = /MD:Z:([\w^]+)/
  # NOTE(review): unanchored, so any string containing one word character or
  # caret is accepted as a bare tag - confirm whether stricter validation is wanted.
  @@format = /[\w^]+/
  @@splitter = /(?<match>\d+)|(?<substitution>[GATCN]+)|\^(?<deletion>[GATCN]+)/
  # Operations that consume reference sequence:
  @@reference = /[msd]/

  # @param data [String, Bio::DB::Tag] the tag text, with or without the
  #   "MD:Z:" leader, or a tag object whose value is used
  # @raise [RuntimeError] if data is neither of those, or a String matching
  #   neither pattern
  def initialize(data)
    if data.is_a? String
      if (leader_match = data.match(@@regexp))
        @tag = leader_match[1]
      elsif data.match(@@format)
        # Assume tag given without MD:Z: leader
        @tag = data
      else
        raise "Tag not of expected format."
      end
    elsif data.is_a? Bio::DB::Tag
      @tag = data.value
      # Fix: the condition was inverted - it warned precisely when the tag WAS "MD".
      warn "Not an MD tag" unless data.tag == "MD"
    else
      raise "Tag not of expected format."
    end

    # String#scan returns plain arrays of the captures, in the order
    # [match, substitution, deletion]; exactly one is non-nil per hit.
    @pairs = @tag.scan(@@splitter).map do |match, substitution, deletion|
      if match
        ["m", match.to_i]
      elsif substitution
        ["s", substitution]
      else
        ["d", deletion]
      end
    end

    # Each @cumulative entry is [operation, value, reference_offset, read_offset]:
    # the third element is the total preceding length on the reference (i.e.
    # the position of the operation), the fourth is the same for the read.
    @cumulative = []
    reference_offset = 0
    read_offset = 0
    @pairs.each do |operation, value|
      reference_length = operation == "m" ? value : value.length
      # Deleted bases don't appear in the read, so don't count to its length
      read_length = operation == "d" ? 0 : reference_length
      @cumulative << [operation, value, reference_offset, read_offset]
      reference_offset += reference_length
      read_offset += read_length
    end
  end

  # @return [Array<Array>] the @cumulative entries for deletions
  def deletions
    report(/d/)
  end

  # @return [Array<Array>] the @cumulative entries for substitutions
  def substitutions
    report(/s/)
  end

  # Report the positions of given events.
  #
  # @param regexp [Regexp] matched against the operation letter
  # @return [Array<Array>] matching @cumulative entries, in order
  def report(regexp = /[sd]/)
    @cumulative.select { |entry| entry[0] =~ regexp }
  end

  # Reconstruct a MD:Z tag (without the "MD:Z:" leader) from a pairs array.
  #
  # @param array [Array<Array>] [operation, value] pairs; defaults to @pairs
  # @return [String]
  def reconstruct_tag(array = @pairs)
    array.map do |operation, value|
      case operation
      when "m" then value.to_s
      when "s" then value
      when "d" then "^" + value
      end
    end.join("")
  end

  # Sums the total length of the reference sequence represented by the MD:Z tag (or part of)
  #
  # @return [Integer]
  def ref_length
    # A tag that is only digits is a single run of matches.
    return @tag.to_i if @tag =~ /^\d+$/

    # Deletions need to be counted too - drop the carets and count the remaining base characters.
    stripped = @tag.delete("^")
    # reduce is seeded with 0 so an empty split yields 0 rather than nil.
    movements = stripped.split(/[GATCN]+/).map(&:to_i).reduce(0, :+) # Sum numbers
    bases = stripped.split(/\d+/).map(&:length).reduce(0, :+)        # Sum number of base chars
    movements + bases
  end

  # Given an offset in reference sequence and length, return an object
  # corresponding to that subregion of the alignment.
  #
  # @return [Bio::DB::Tag::MD] a new instance built from the reconstructed sub-tag
  def slice(offset, length)
    new_array = iterate_pairs(@pairs, offset, length, @@reference)
    Bio::DB::Tag::MD.new(reconstruct_tag(new_array))
  end
end
124
+
125
+
126
+ # DKNQZ:00025:00303 0 5 112767204 37 60M1D7M2I6M * 0 0 GCAGTAATTTCCCTGGAGTAAAACTGCGGTCAAAAATGTCCCTCCGTTCTTATGGAAGCCGGAAGGAAGTCTGTA CCCCCC@CE>CC<CC@CB;;;;.;;;;;AC;::::+:92A:=CCAEE=?>;=:@<B?:<6<*/*/*/*/911112 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:60^G13
@@ -0,0 +1,24 @@
1
# Used for tracking mutations appearing more than once and caching VEP lookups
class MutantAllele
  attr_accessor :mutations, :count, :example, :seq

  class << self
    # Hash of previous lookup results, keyed by the HGVS string of the mutations.
    # NOTE(review): nothing in this class writes to it; presumably it is
    # populated by callers or by the vep implementation - confirm.
    attr_accessor :previous_lookups
  end
  self.previous_lookups = {}

  # @param mutations [Object, nil] must respond to to_hgvs(ref_type) and
  #   vep(species, ref_type) for #lookup to work
  # @param count [Integer]
  # @param example [Object, nil]
  # @param seq [Object, nil]
  def initialize(mutations: nil, count: 0, example: nil, seq: nil)
    @mutations = mutations
    @count = count
    @example = example
    # Fix: the seq: keyword was accepted but silently dropped.
    @seq = seq
  end

  # Returns the stored entry from previous_lookups when one exists for this
  # allele's HGVS key, otherwise the result of mutations.vep (per the original
  # comment, JSON from Ensembl VEP).
  #
  # @param species [String]
  # @param ref_type [String, nil] passed to to_hgvs and vep
  def lookup(species = "human", ref_type = nil)
    key = mutations.to_hgvs(ref_type)
    cache = MutantAllele.previous_lookups
    return cache[key] if key && cache.key?(key)

    mutations.vep(species, ref_type)
  end
end
@@ -0,0 +1,63 @@
1
# A single called mutation (insertion, deletion or substitution) with enough
# information to render an HGVS-style annotation.
class Bio::Mutation
  include VepHgvs
  attr_accessor :position, :type, :reference, :mutant, :seqname

  # @param params [Hash] any of :position, :type, :reference, :mutant, :seqname.
  #   Missing keys fall back to position 1, type :uninitialized and nil for the rest.
  #   Fix: the defaults used to live in the parameter's default value, so they
  #   were discarded entirely as soon as a caller passed a partial hash.
  def initialize(params = {})
    params = { position: 1, type: :uninitialized, reference: nil, mutant: nil, seqname: nil }.merge(params)
    @position = params[:position]
    @type = params[:type]
    @reference = params[:reference]
    @mutant = params[:mutant]
    @seqname = params[:seqname]
  end

  # Orders mutations by position.
  #
  # @return [Integer, nil] -1, 0 or 1; nil when +other+ has no position or the
  #   positions cannot be compared (previously this raised instead)
  def <=>(other)
    return nil unless other.respond_to?(:position)

    position <=> other.position
  end

  # http://www.hgvs.org/mutnomen/recs.html
  # This gives just the annotation. To convert to a full allele description, needs to be combined
  # with e.g. g. for genomic: g. - can supply this "g", "c" as type to annotate a single mutation directly
  # for compound mutants, need to join an array of annotations e.g. 1:g.[213456A>C;213460_213461delTG]
  #
  # @param reference_type [String, nil] when given, the result is prefixed with
  #   "<seqname>:<reference_type>."
  # @return [String, nil] nil when @type is not :deletion, :substitution or :insertion
  def to_hgvs(reference_type = nil)
    hgvs_arr =
      if reference_type
        [@seqname, ":", reference_type, ".", @position.to_s]
      else
        [@position.to_s]
      end

    case @type
    when :deletion
      if @reference.length == 1
        hgvs_arr << "del" + @reference
      else
        # Multi-base deletions are written as a start_end range.
        hgvs_arr += ["_", (@position.to_i + @reference.length - 1).to_s, "del", @reference]
      end
      hgvs_arr.join
    when :substitution
      if @reference.length > 1
        hgvs_arr += ["_", (@position.to_i + @reference.length - 1).to_s]
      end
      hgvs_arr << @reference + ">" + @mutant
      hgvs_arr.join
    when :insertion
      # Insertions are written between the two flanking positions.
      hgvs_arr << "_" + (@position.to_i + 1).to_s
      hgvs_arr << "ins" + @mutant
      hgvs_arr.join
      # TODO - distinguish duplications from insertions? Needs further input from ref.
    end
  end

  # Serialises this object with Oj. Extra arguments (such as the generator
  # state the json library passes when serialising containers) are accepted
  # and ignored, so calling to_json on a collection of mutations no longer
  # raises ArgumentError.
  def to_json(*_args)
    Oj.dump self
  end

  # Serialises this object with YAML.dump.
  def to_yaml
    YAML.dump self
  end
end