bio-sam-mutation 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ module Bio::Alignment::IteratePairs
2
+ private
3
# Mixin helper: walks an ordered list of [operation, value] pairs (e.g. a
# broken-down CIGAR string or MD:Z tag) and returns the pairs covering the
# window that starts at +offset+ and spans +length+ positions.
#
# A pair's size is its value when the value is an Integer, or its number of
# characters when the value is a String. Only pairs whose operation matches
# +regexp+ advance the running position; the default regexp matches everything.
#
# pairs  - Array of two-element [operation, value] arrays. It is not modified;
#          every emitted pair is a copy.
# offset - start of the window; coerced with to_i.
# length - size of the window; coerced with to_i.
# regexp - Regexp selecting the operations that count towards the position.
#
# Returns a new Array of [operation, value] pairs with the first and last
# pair trimmed to the window.
# Raises RuntimeError if a value is neither a String nor an Integer.
def iterate_pairs(pairs, offset, length, regexp = //)
  offset = offset.to_i
  # This used to read `length - length.to_i`, which computed a value and threw
  # it away, so a non-Integer length was never actually coerced.
  length = length.to_i
  total = 0
  new_array = []
  first = true
  pairs.each do |pair|
    new_pair = pair.dup
    pairlength =
      case pair[1]
      when String then pair[1].length
      when Integer then pair[1]
      else raise "Value for operation must be a string or integer"
      end
    # Only count pairs whose operation matches the regexp, e.g. for CIGAR:
    #   ref M + ref D = ref length
    #   query M + query I = query length
    total += pairlength if pair[0].match(regexp)
    # Keep going until we reach the start of the window.
    next if total < offset

    # The window may start partway through this pair, so trim its front.
    if first
      new_pair[1] =
        if pair[1].is_a?(String)
          new_pair[1][(total - offset)..pairlength]
        else
          total - offset + 1 # add one for bases
        end
    end

    # At or beyond the end of the window: emit the final pair and stop.
    if total >= offset + length
      if first
        # Special case: the whole window sits inside this single pair.
        new_pair[1] =
          if pair[1].is_a?(String)
            new_pair[1][total - offset - pairlength, length]
          else
            length
          end
      else
        # Last pair of a multi-pair window: keep only what is still needed.
        previous_total = total - pairlength
        new_pair[1] =
          if pair[1].is_a?(Integer)
            offset + length - previous_total - 1 # -1 extra for base arithmetic
          else
            new_pair[1][0..(offset + length - previous_total)]
          end
      end
      new_array << new_pair
      break
    end

    first = false
    new_array << new_pair
  end
  new_array
end
68
+ end
@@ -0,0 +1,176 @@
1
+ # Extending Bio::DB::Alignment with mutation calling method
2
+ Bio::DB::Alignment.class_eval do
3
+ # Aliases included for more intuitive naming when using for genomic alignments:
4
+ alias_method :chr, :rname
5
+ alias_method :opt, :tags # vice versa as opt is the "proper" sam name for the tag fields
6
+ attr_accessor :cigar_obj
7
+
8
# Adds a tag to this alignment in place, replacing any existing entry stored
# under the same tag name, then rebuilds the SAM string.
#
# new_tag - either a String (handed to Bio::DB::Tag.new) or a Bio::DB::Tag.
#
# Returns whatever regenerate_string returns.
# Raises RuntimeError for any other kind of argument.
def add_tag!(new_tag)
  tag_object =
    case new_tag
    when String then Bio::DB::Tag.new(new_tag)
    when Bio::DB::Tag then new_tag
    else raise "Tag not recognised - pass a string or Bio::DB::Tag object"
    end

  # Keyed by tag name, so adding an existing tag overwrites it.
  @tags[tag_object.tag] = tag_object
  regenerate_string
end
20
+
21
# Non-destructive counterpart of add_tag!: applies the tag to a copy of this
# alignment and leaves the receiver untouched.
#
# tag - a String or Bio::DB::Tag, as accepted by add_tag!.
#
# Returns the result of add_tag! called on the copy.
def add_tag(tag)
  copy = dup
  # dup is shallow: without this the copy shares our @tags hash and add_tag!
  # on the copy would also insert the tag into the receiver.
  copy.instance_variable_set(:@tags, @tags && @tags.dup)
  copy.add_tag!(tag)
end
24
+
25
# Output a representation of the query sequence with the called mutations
# marked up: each deleted reference base becomes "-", inserted bases are
# lower-cased and wrapped in +ins_chr+, substituted bases are lower-cased.
#
# offset        - start of the region (default 1).
# length        - length of the region (default: length of @seq).
# reference_pos - reference coordinate subtracted from mutation positions to
#                 get an index into @seq (default @pos - 1).
# ins_chr       - String placed on both sides of an insertion (default "_").
#
# Returns the annotated sequence as a String.
def query(offset=1, length=@seq.length, reference_pos=@pos-1, ins_chr="_")
  called = mutations(offset, length)
  lead_in = Bio::Alignment::CIGAR.new(@cigar, seq, "sam").subalignment(0, offset - 1)
  # Difference between query and reference length before the region.
  shift = lead_in.query_length - (offset - 1)
  cursor = lead_in.query_length # index into @seq of the next base to emit
  pieces = []
  deleted = 0
  inserted = 0

  (called || []).each do |mut|
    # Shift the position by the indels emitted so far.
    mut.position = mut.position + inserted - deleted + shift
    case mut.type
    when :deletion
      # position for deletion is the first deleted base
      upto = mut.position - 1 - reference_pos - 1
      # `>=` (was `>`): when exactly one base lies between the cursor and the
      # mutation, that base must still be emitted, since the cursor is moved
      # past it below.
      pieces << @seq[cursor..upto] if upto >= cursor
      pieces << "-" * mut.reference.length
      cursor = upto + 1
      deleted += mut.reference.length
    when :insertion
      # position for insertion is the base we want
      upto = mut.position - reference_pos - 1
      pieces << @seq[cursor..upto] if upto >= cursor
      pieces << ins_chr + mut.mutant.downcase + ins_chr
      cursor = upto + 1 + mut.mutant.length
      inserted += mut.mutant.length
    when :substitution
      # position for substitution is the first subbed base
      upto = mut.position - 1 - reference_pos - 1
      pieces << @seq[cursor..upto] if upto >= cursor
      pieces << mut.mutant.downcase
      cursor = upto + 1 + mut.mutant.length
    end
  end

  # Remaining sequence after the last mutation.
  if offset + length > cursor
    pieces << @seq[cursor..(offset - 1 + length - 1 - deleted + inserted + shift)]
  end
  pieces.join
end
68
+
69
# Call mutations within a window of the alignment.
#
# The window is used to cut a sub-CIGAR and a sub-MD:Z tag. Insertions are
# taken from the sub-CIGAR (bases read from the query sequence);
# substitutions and deletions are taken from the MD:Z tag, which avoids the
# need to supply the reference sequence.
#
# offset            - start of the window (default 1).
# length            - window length; defaults to the CIGAR reference length
#                     minus offset.
# translation_start - subtracted from every reported position (default 1).
#
# Side effect: stores the parsed CIGAR in @cigar_obj.
#
# Returns a Bio::MutationArray built from the sorted mutations, or nil when
# @query_unmapped is set, when the window runs past the CIGAR's reference
# length, or when no mutation was found.
# NOTE(review): an alignment without an MD tag fails on @tags["MD"].value —
# confirm callers always supply one.
def mutations(offset=1, length=nil, translation_start=1)
  return nil if @query_unmapped

  cigar = Bio::Alignment::CIGAR.new(@cigar, seq, "sam")
  length ||= cigar.reference_length - offset
  return nil if offset + length > cigar.reference_length

  read = Bio::Sequence::NA.new(@seq)
  @cigar_obj = cigar

  # Generate subalignments from the CIGAR and MD:Z
  subcigar = cigar.subalignment(offset, length)
  mdz = Bio::DB::Tag::MD.new(@tags["MD"].value).slice(offset, length)

  # Inserted bases from the read sequence, only within the region of interest.
  # ins[2] is used as the start on the query (-1 because Ruby counts
  # characters from zero) and ins[1] as the number of bases to take.
  # NOTE(review): the original comment describes ins[1] as the position on
  # the reference — confirm against CIGAR#positions.
  insertion_positions = subcigar.positions(/I/)
  inserted_bases =
    if insertion_positions.empty?
      []
    else
      insertion_positions["I"].map { |ins| read.seq[offset + ins[2] - 1, ins[1]] }
    end

  found = []
  reference_pos = @pos - 1
  subcigar.pairs.each do |pair|
    case pair[0]
    when "M", "D"
      # Deletions are called from the MD:Z tag below but still advance the
      # reference position here.
      reference_pos += pair[1]
    when "I"
      bases = inserted_bases.shift
      found << Bio::Mutation.new(position: reference_pos + offset - translation_start,
                                 type: :insertion,
                                 reference: nil,
                                 mutant: bases ? bases.upcase : "N",
                                 seqname: @rname.to_s)
    end
  end

  # Now substitutions & deletions - these need the MD tag.
  events = mdz.report(/[sd]/)
  unless events.empty?
    # Event offsets are relative to the subalignment, so add what precedes it.
    # Masked length is not included in the MD:Z string so it is added too.
    # Computed once here; it does not depend on the individual event.
    preceding = cigar.subalignment(0, offset - 1)
    read_start = preceding.query_length + preceding.masked_length
    # Adjustment needed to get the correct annotation position.
    substart = @pos + offset - translation_start - 1

    events.each do |operation, ref_bases, ref_offset, read_offset|
      case operation
      when "s"
        # Reference base comes from the MD:Z tag, the actual base from the read.
        found << Bio::Mutation.new(position: substart + ref_offset + 1,
                                   type: :substitution,
                                   reference: ref_bases.upcase,
                                   mutant: read[read_start + read_offset, ref_bases.length].upcase,
                                   seqname: @rname.to_s)
      when "d"
        found << Bio::Mutation.new(position: substart + ref_offset + 1,
                                   type: :deletion,
                                   reference: ref_bases.upcase,
                                   mutant: nil,
                                   seqname: @rname.to_s)
      end
    end
  end

  found.empty? ? nil : Bio::MutationArray.new(found.sort)
end
160
+
161
# Rebuilds the tab-separated SAM line from the alignment's fields and tags
# and stores it through sam_string=.
#
# The eleven fixed fields come first, in the order qname, flag, rname, pos,
# mapq, cigar, mrnm, mpos, isize, seq, qual, followed by one TAG:TYPE:VALUE
# field per entry in @tags.
#
# Returns the new SAM string.
def regenerate_string
  fields = [@qname, @flag, @rname, @pos, @mapq, @cigar, @mrnm, @mpos, @isize, @seq, @qual]
  tag_fields = @tags.map { |_name, tag| [tag.tag, tag.type, tag.value].join(":") }
  # Append the tags as individual fields. Nesting the (possibly empty) tag
  # array inside the field list made join emit a trailing tab when there were
  # no tags.
  self.sam_string = fields.concat(tag_fields).join("\t")
end
176
+ end
@@ -0,0 +1,5 @@
1
Bio::DB::Tag.class_eval do
  # Lets a tag be built directly from its string form.
  #
  # tag_string - optional String handed to #set; when omitted (or otherwise
  #              falsy) nothing is parsed and the tag is left as constructed.
  def initialize(tag_string = nil)
    return unless tag_string

    set(tag_string)
  end
end
@@ -0,0 +1,126 @@
1
# Parsed representation of a SAM MD:Z tag.
#
# The tag is broken into [operation, value] pairs where the operation is
# "m" (run of matches, Integer value), "s" (substituted reference bases,
# String) or "d" (deleted reference bases, String).
#
# tag        - the tag payload without the MD:Z: leader.
# pairs      - Array of [operation, value] pairs in tag order.
# cumulative - Array of [operation, value, reference_offset, read_offset],
#              where the offsets are the total lengths of the preceding
#              operations on the reference and on the read.
class Bio::DB::Tag::MD
  include Bio::Alignment::IteratePairs
  attr_accessor :tag, :pairs, :cumulative

  # Full tag including the MD:Z: leader; capture 1 is the payload.
  @@regexp = /MD:Z:([\w^]+)/
  # Bare payload without the leader.
  @@format = /[\w^]+/
  # One alternative per operation: match run, substituted bases, deleted bases.
  @@splitter = /(?<match>\d+)|(?<substitution>[GATCN]+)|\^(?<deletion>[GATCN]+)/
  # Operations that consume reference sequence:
  @@reference = /[msd]/

  # data - a String (with or without the MD:Z: leader) or a Bio::DB::Tag.
  #
  # Raises RuntimeError when data is neither, or is a String matching neither
  # pattern. Warns when given a Bio::DB::Tag whose tag name is not "MD".
  def initialize(data)
    case data
    when String
      leader = data.match(@@regexp)
      if leader
        @tag = leader[1]
      elsif data.match(@@format)
        # Assume tag given without MD:Z: leader
        @tag = data
      else
        raise "Tag not of expected format."
      end
    when Bio::DB::Tag
      @tag = data.value
      # The condition used to be inverted: it warned for genuine MD tags and
      # stayed silent for every other tag.
      warn "Not an MD tag" unless data.tag == "MD"
    else
      raise "Tag not of expected format."
    end

    # scan returns one [match, substitution, deletion] capture array per
    # operation (no MatchData, so the names are positional here); exactly one
    # capture is non-nil.
    @pairs = @tag.scan(@@splitter).map do |match, substitution, deletion|
      if match
        ["m", match.to_i]
      elsif substitution
        ["s", substitution]
      elsif deletion
        ["d", deletion]
      end
    end

    @cumulative = []
    reference_offset = 0
    read_offset = 0
    @pairs.each do |operation, value|
      case operation
      when "m"
        reference_span = read_span = value
      when "s"
        reference_span = read_span = value.length
      when "d"
        reference_span = value.length
        # Deleted bases don't appear in the read, so don't count to its length
        read_span = 0
      end
      # Third element is the position of the operation on the reference,
      # fourth the equivalent on the read.
      @cumulative << [operation, value, reference_offset, read_offset]
      reference_offset += reference_span
      read_offset += read_span
    end
  end

  # Returns the cumulative entries for deletions.
  def deletions
    report(/d/)
  end

  # Returns the cumulative entries for substitutions.
  def substitutions
    report(/s/)
  end

  # Report the positions of given events.
  #
  # regexp - matched against each operation letter (default: substitutions
  #          and deletions).
  #
  # Returns an Array of the matching cumulative entries.
  def report(regexp=/[sd]/)
    @cumulative.select { |entry| entry[0] =~ regexp }
  end

  # Reconstruct a MD:Z tag payload (no leader) from a pairs array.
  #
  # array - Array of [operation, value] pairs (default: @pairs). Pairs with
  #         an unknown operation contribute nothing.
  #
  # Returns the payload String.
  def reconstruct_tag(array=@pairs)
    array.map do |operation, value|
      case operation
      when "m" then value.to_s
      when "s" then value
      when "d" then "^" + value
      end
    end.join("")
  end

  # Sums the total length of the reference sequence represented by the MD:Z
  # tag (or part of): all match runs plus one per substituted or deleted base.
  def ref_length
    return @tag.to_i if @tag =~ /^\d+$/

    # Deletions need to be counted - drop the caret and count the remaining
    # base characters together with the substitutions.
    stripped = @tag.gsub(/\^/, "")
    matched = stripped.split(/[GATCN]+/).map(&:to_i).reduce(0, :+) # sum numbers
    altered = stripped.split(/\d+/).map(&:length).reduce(0, :+)    # sum base chars
    matched + altered
  end

  # Given an offset in reference sequence and a length, return a new MD
  # instance corresponding to that subregion of the alignment.
  def slice(offset,length)
    window = iterate_pairs(@pairs, offset, length, @@reference)
    Bio::DB::Tag::MD.new(reconstruct_tag(window))
  end
end
124
+
125
+
126
+ # DKNQZ:00025:00303 0 5 112767204 37 60M1D7M2I6M * 0 0 GCAGTAATTTCCCTGGAGTAAAACTGCGGTCAAAAATGTCCCTCCGTTCTTATGGAAGCCGGAAGGAAGTCTGTA CCCCCC@CE>CC<CC@CB;;;;.;;;;;AC;::::+:92A:=CCAEE=?>;=:@<B?:<6<*/*/*/*/911112 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:60^G13
@@ -0,0 +1,24 @@
1
# Used for tracking mutations appearing more than once and caching VEP lookups.
#
# mutations - object answering to_hgvs(ref_type) and vep(species, ref_type).
# count     - number of times the allele was seen (default 0).
# example   - caller-supplied example (default nil).
# seq       - caller-supplied sequence (default nil).
#
# MutantAllele.previous_lookups is a class-level Hash, keyed by the HGVS
# string, shared by all instances.
class MutantAllele
  attr_accessor :mutations, :count, :example, :seq

  class << self
    attr_accessor :previous_lookups
  end
  self.previous_lookups = {}

  def initialize(mutations: nil, count: 0, example: nil, seq: nil)
    @mutations = mutations
    @count = count
    @example = example
    # The seq keyword was previously accepted but silently dropped.
    @seq = seq
  end

  # Returns JSON from Ensembl VEP, reusing an earlier answer for the same
  # HGVS key when one is cached.
  #
  # species  - passed through to mutations.vep (default "human").
  # ref_type - passed to mutations.to_hgvs and mutations.vep (default nil).
  def lookup(species="human", ref_type=nil)
    key = mutations.to_hgvs(ref_type)
    cache = MutantAllele.previous_lookups
    return cache[key] if key && cache.key?(key)

    result = mutations.vep(species, ref_type)
    # Remember successful lookups; nothing stored an entry here before, so
    # the cache was never hit unless the vep call filled it itself. An entry
    # written by the vep call is left alone.
    cache[key] = result if key && result && !cache.key?(key)
    result
  end
end
@@ -0,0 +1,63 @@
1
# A single called mutation.
#
# position  - position of the mutation (default 1).
# type      - :deletion, :substitution or :insertion (default :uninitialized).
# reference - reference bases (nil for insertions as built in this gem).
# mutant    - mutant bases (nil for deletions as built in this gem).
# seqname   - name of the reference sequence.
class Bio::Mutation
  include VepHgvs
  attr_accessor :position, :type, :reference, :mutant, :seqname

  # params - Hash with any of :position, :type, :reference, :mutant, :seqname.
  #          Missing keys fall back to the defaults; previously the defaults
  #          only applied when no hash was given at all.
  def initialize(params={})
    params = { position: 1, type: :uninitialized, reference: nil, mutant: nil, seqname: nil }.merge(params)
    @position = params[:position]
    @type = params[:type]
    @reference = params[:reference]
    @mutant = params[:mutant]
    @seqname = params[:seqname]
  end

  # Orders mutations by position.
  #
  # Returns -1, 0 or 1, or nil when other has no position or the positions
  # cannot be compared.
  def <=>(other)
    return nil unless other.respond_to?(:position)

    position <=> other.position
  end

  # http://www.hgvs.org/mutnomen/recs.html
  # This gives just the annotation. To convert to a full allele description it
  # needs to be combined with e.g. "g." for genomic; supply "g", "c" etc. as
  # reference_type to annotate a single mutation directly. For compound
  # mutants, join an array of annotations, e.g.
  # 1:g.[213456A>C;213460_213461delTG]
  #
  # reference_type - optional coordinate type; when given the annotation is
  #                  prefixed with "<seqname>:<reference_type>.".
  #
  # Returns the annotation String, or nil for any other @type.
  def to_hgvs(reference_type=nil)
    parts = reference_type ? [@seqname, ":", reference_type, ".", @position.to_s] : [@position.to_s]

    case @type
    when :deletion
      # A single-base deletion is written without a range.
      parts.push("_", (@position.to_i + @reference.length - 1).to_s) unless @reference.length == 1
      parts.push("del", @reference)
      parts.join
    when :substitution
      parts.push("_", (@position.to_i + @reference.length - 1).to_s) if @reference.length > 1
      parts << @reference + ">" + @mutant
      parts.join
    when :insertion
      # Written as the two flanking positions.
      parts << "_" + (@position.to_i + 1).to_s
      parts << "ins" + @mutant
      parts.join
      # TODO - distinguish duplications from insertions? Needs further input from ref.
    end
  end

  # Serialises the mutation with Oj. Extra arguments are accepted and ignored
  # so the method can also be called with the generator-state argument that
  # JSON passes to to_json.
  def to_json(*_args)
    Oj.dump self
  end

  # Serialises the mutation with YAML.dump.
  def to_yaml
    YAML.dump self
  end
end