viral_seq 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,219 @@
1
+
2
+ module ViralSeq
3
+
4
+ # Class for paired-end sequences.
5
+ # @example initialize a new SeqHashPair object from a directory containing paired-end sequences
6
+ # my_seqhashpair = ViralSeq::SeqHashPair.fa('my_seq_directory')
7
+ # @example join the paired-end sequences with an overlap of 100 bp
8
+ # my_seqhashpair.join1(100)
9
+ # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
10
+ # my_seqhashpair.join1(:indiv)
11
+
12
# Container for paired-end (R1/R2) sequences.
#
# @dna_hash maps each sequence name to a two-element Array
# [r1_sequence, r2_sequence]. The join methods merge every pair into one
# sequence and return the result as a ViralSeq::SeqHash.
class SeqHashPair

  # initialize SeqHashPair object with @dna_hash, @title and @file
  # @param dna_hash [Hash] :name => [:r1_sequence_string, :r2_sequence_string]
  # @param title [String] title of the object
  # @param file [Array<String>] the r1 and r2 files the object was built from
  def initialize(dna_hash = {}, title = "", file = [])
    @dna_hash = dna_hash
    @title = title
    @file = file
  end

  # @return [Hash] Hash object for :name => [:r1_sequence_string, :r2_sequence_string]
  attr_accessor :dna_hash

  # @return [String] the title of the SeqHashPair object.
  #   default as the directory basename if the object is initialized using ::fa
  attr_accessor :title

  # @return [Array<String>] the r1 and r2 files that are used to initialize
  #   the SeqHashPair object, if they exist
  attr_accessor :file

  # initialize a new ViralSeq::SeqHashPair object from a directory containing paired sequence files in the FASTA format
  # @param indir [String] directory containing paired sequence files in the FASTA format,
  #
  #   Paired sequence files need to have "r1" and "r2" in their file names
  #
  #   Example for the file structure
  #     ├───lib1
  #         │     lib1_r1.txt
  #         │     lib1_r2.txt
  #   The sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
  # @return [ViralSeq::SeqHashPair] new SeqHashPair object from the paired FASTA sequence files
  # @example initialize a new SeqHashPair object from a directory containing paired-end sequences
  #   my_seqhashpair = ViralSeq::SeqHashPair.fa('spec/sample_paired_seq')
  def self.new_from_fasta(indir)
    r1_file = ""
    r2_file = ""
    # when several files match, the last one listed by Dir wins
    Dir[indir + "/*"].each do |f|
      case File.basename(f)
      when /r1/i then r1_file = f
      when /r2/i then r2_file = f
      end
    end

    # drop the last 3 characters of every name so the R1 and R2 names line up
    strip_read_tag = lambda do |seqs|
      seqs.each_with_object({}) { |(k, v), h| h[k[0..-4]] = v }
    end
    new_seq1 = strip_read_tag.call(ViralSeq::SeqHash.fa(r1_file).dna_hash)
    new_seq2 = strip_read_tag.call(ViralSeq::SeqHash.fa(r2_file).dna_hash)

    # an R1 read without a matching R2 name is kept with nil as its mate
    seq_pair_hash = new_seq1.each_with_object({}) do |(seq_name, seq), pairs|
      pairs[seq_name] = [seq, new_seq2[seq_name]]
    end

    new(seq_pair_hash, File.basename(indir, ".*"), [r1_file, r2_file])
  end # end of .new_from_fasta

  class << self
    alias_method :fa, :new_from_fasta
  end

  # Pair-end join function for KNOWN overlap size.
  # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
  # @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
  #   Pairs whose overlap region exceeds the mismatch cut-off, or whose reads
  #   are shorter than the requested overlap, are left out.
  # @raise [ArgumentError] if overlap is not an Integer or diff is not numeric
  # @example join paired-end sequences with different :diff cut-offs, overlap provided.
  #   my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seqs)
  #   my_seqhashpair.join1(100).dna_hash.keys
  #   => [">pair1"]
  #   my_seqhashpair.join1(100,0.01).dna_hash.keys
  #   => [">pair1", ">pair2"]
  #   my_seqhashpair.join1(100,0.02).dna_hash.keys
  #   => [">pair1", ">pair2", ">pair3"]
  def join1(overlap = 0, diff = 0.0)
    unless overlap.is_a?(Integer)
      raise ArgumentError, ":overlap has to be Integer, input #{overlap} invalid."
    end
    unless diff.is_a?(Integer) || diff.is_a?(Float)
      raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid."
    end

    joined_seq = {}
    dna_hash.each do |seq_name, (r1_seq, r2_seq)|
      if overlap.zero?
        joined_seq[seq_name] = r1_seq + r2_seq
      elsif overlap > r1_seq.size || overlap > r2_seq.size
        # a read shorter than the overlap cannot contain it; slicing would yield nil
        next
      elsif r1_seq[-overlap..-1].compare_with(r2_seq[0, overlap]) <= (overlap * diff)
        joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
      end
    end
    build_joined_seq_hash(joined_seq)
  end # end of join1

  # Pair-end join function for UNKNOWN overlap.
  # @param model [Symbol] models used to determine the overlap, `:con`, `:indiv`
  #
  #   model `:con`: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
  #
  #   note: minimal overlap as 4 bases.
  #   model `:indiv`: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
  # @param diff (see #join1)
  # @return [ViralSeq::SeqHash, nil] joined sequences; nil (after printing the
  #   message) when an ArgumentError is raised for a bad :diff or :model
  # @example join paired-end sequences, overlap NOT provided
  #   paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
  #                  ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
  #                  ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"] }
  #   my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
  #   my_seqhashpair.join2.dna_hash
  #   => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
  #   my_seqhashpair.join2(:indiv).dna_hash
  #   => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
  def join2(model = :con, diff = 0.0)
    unless diff.is_a?(Integer) || diff.is_a?(Float)
      raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid."
    end

    case model
    when :con
      join1(determine_overlap_pid_pair(dna_hash, diff), diff)
    when :indiv
      joined_seq = dna_hash.each_with_object({}) do |(seq_name, (r1_seq, r2_seq)), joined|
        overlap = largest_valid_overlap(r1_seq, r2_seq, diff)
        # no acceptable overlap: the two reads are simply put together
        joined[seq_name] = overlap ? r1_seq + r2_seq[overlap..-1] : r1_seq + r2_seq
      end
      build_joined_seq_hash(joined_seq)
    else
      raise ArgumentError, "Error::Wrong Overlap Model Argument. Given \`#{model}\`, expected `:con` or `:indiv`."
    end
  rescue ArgumentError => e
    # argument problems are reported on stdout instead of being propagated
    puts e
    nil
  end # end of join2

  private

  # determine the consensus overlap size: the most frequent per-pair overlap,
  # the largest one on ties, 0 when there are no pairs.
  def determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
    overlaps = seq_pair_hash.map do |_seq_name, (r1_seq, r2_seq)|
      largest_valid_overlap(r1_seq, r2_seq, diff) || 0
    end
    counts = overlaps.each_with_object(Hash.new(0)) { |overlap, freq| freq[overlap] += 1 }
    top_count = counts.values.max
    counts.select { |_overlap, count| count == top_count }.keys.max || 0
  end # end of determine_overlap_pid_pair

  # largest overlap whose mismatch count stays within overlap * diff,
  # or nil when no tested overlap qualifies.
  def largest_valid_overlap(sequence1, sequence2, diff)
    overlap_matrix(sequence1, sequence2)
      .select { |overlap, diff_nt| diff_nt <= overlap * diff }
      .keys
      .max
  end

  # input a pair of sequences as String, return a Hash object
  # {:overlap_size => number_of_different_positions, ...}
  # minimal overlap set to 4; the overlap can never exceed the shorter read.
  def overlap_matrix(sequence1, sequence2)
    min_overlap = 4
    max_overlap = [sequence1.size, sequence2.size].min
    (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
      matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
    end
  end # end of overlap_matrix

  # wrap joined sequences in a ViralSeq::SeqHash titled "<title>_joined";
  # its file is the directory of the first source file, when there is one.
  def build_joined_seq_hash(joined_seq)
    joined_seq_hash = ViralSeq::SeqHash.new
    joined_seq_hash.dna_hash = joined_seq
    joined_seq_hash.title = title + "_joined"
    joined_seq_hash.file = File.dirname(file[0]) unless file.empty?
    joined_seq_hash
  end

end # end of SeqHashPair
219
+ end # end of ViralSeq
@@ -1,392 +1,615 @@
1
- # lib/sequence.rb
2
- # Includes functions for sequence operations
3
- # Including methods as:
4
- # ViralSeq::AMINO_ACID_LIST
5
- # ViralSeq::Sequence
6
- # ViralSeq::Sequence#rev_complement
7
- # ViralSeq::Sequence#get_aa_sequence
8
- # ViralSeq::Sequence#get_aa_array
9
- # ViralSeq::Sequence#name
10
- # ViralSeq::Sequence#dna_sequence
11
- # ViralSeq::Sequence#aa_sequence
12
- # ViralSeq::Sequence#aa_array
13
- # ViralSeq::amino_acid
14
- # ViralSeq::amino_acid_2
15
- # ViralSeq::to_list
16
- # ViralSeq::uniq_sequence_hash
17
- # ViralSeq::stop_codon_seq_hash
18
- # String#rc
19
- # String#mutation
20
- # String#nt_parser
21
-
22
- # ViralSeq::AMINO_ACID_LIST
23
- # # Array of all amino acid one letter abbreviations
24
-
25
- # ViralSeq::Sequence
26
- # # Sequence class
27
- # =USAGE
28
- # # create a sequence object
29
- # seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
30
- #
31
- # # print dna sequence
32
- # puts seq.dna_sequence
33
- #
34
- # # reserce complement sequence of DNA sequence, return as a string
35
- # seq.rev_complement
36
- #
37
- # # change @dna_sequence to reverse complement DNA sequence
38
- # seq.rev_complement!
39
- #
40
- # # generate amino acid sequences. either return string or array.
41
- # # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
42
- # # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
43
- # seq.get_aa_sequence
44
- # # or
45
- # seq.get_aa_array
46
- #
47
- # # print amino acid sequence
48
- # puts seq.aa_sequence
49
-
50
- # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
51
- # # collapse sequence hash to unique sequence hash.
52
- # # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
53
- # # master_sequence_tag is the master tag for unique sequences
54
- # # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
55
- # =USAGE
56
- # sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
57
- # '>seq4' => 'CCCC', '>seq5' => 'CCCC',
58
- # '>seq6' => 'TTTT' }
59
- # uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
60
- # => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
61
1
 
62
2
  module ViralSeq
63
3
 
64
- # array for all amino acid one letter abbreviations
65
- AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
66
-
67
- # sequence class
4
+ # ViralSeq::Sequence class for sequence operation
5
+ #
6
+ # @example create a sequence object
7
+ # seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
8
+ # => #<ViralSeq::Sequence:0x00007fd03c8c10b8 @name="my_sequence", @dna="ACCTAGGTTCGGAGC", @aa_string="", @aa_array=[]>
9
+ #
10
+ # @example return dna sequence as String
11
+ # seq.dna
12
+ # => "ACCTAGGTTCGGAGC"
13
+ #
14
+ # @example reverse complement sequence of DNA sequence
15
+ # seq.rc
16
+ # => "GCTCCGAACCTAGGT"
17
+ #
18
+ # @example change @dna to reverse complement DNA sequence
19
+ # seq.rc!
20
+ #
21
+ # @example translate the DNA sequence, return values for @aa_string and @aa_array
22
+ # seq = ViralSeq::Sequence.new('my_sequence', 'AWTCGRAGAG')
23
+ # seq.translate(1)
24
+ # seq.aa_string
25
+ # => "##E"
26
+ # seq.aa_array
27
+ # => ["IF", "EG", "E"]
68
28
 
69
29
  class Sequence
30
+ # initialize a ViralSeq::Sequence class with sequence name (default as '>sequence')
31
+ # and DNA sequence as String object
70
32
  def initialize (name = ">sequence",dna_sequence ="")
71
33
  @name = name
72
- @dna_sequence = dna_sequence.upcase
73
- @aa_sequence = ""
34
+ @dna = dna_sequence.upcase
35
+ @aa_string = ""
74
36
  @aa_array = []
75
37
  end
76
38
 
77
- attr_accessor :name, :dna_sequence, :aa_sequence, :aa_array
39
+ # @return [String] sequence tag name
40
+ attr_accessor :name
41
+
42
+ # @return [String] DNA sequence
43
+ attr_accessor :dna
44
+
45
+ # @return [String] amino acid sequence
46
+ attr_accessor :aa_string
47
+
48
+ # @return [Array] amino acid sequence as an Array object,
49
+ # ambiguity dna sequence will be translated in all possible amino acid sequence at the position
50
+ attr_accessor :aa_array
78
51
 
52
+ # @return [String] reverse complement sequence of the @dna.
79
53
  def rev_complement
80
- @dna_sequence.reverse.upcase.tr('ATCG','TAGC')
54
+ @dna.rc
81
55
  end
56
+
57
+ # replace the @dna with reverse complement DNA sequence.
82
58
  def rev_complement!
83
- @dna_sequence = @dna_sequence.reverse.upcase.tr('ATCG','TAGC')
59
+ @dna = @dna.rc
84
60
  end
85
61
 
86
- def get_aa_sequence(initial_position = 0)
87
- @aa_sequence = ""
88
- require_sequence = @dna_sequence[initial_position..-1]
62
+ alias_method :rc, :rev_complement
63
+ alias_method :rc!, :rev_complement!
64
+
65
+ # translate @dna to amino acid sequence.
66
+ # generate values for @aa_string and @aa_array
67
+ # @param initial_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
68
+
69
# Translate @dna into amino acids and store the results.
#
# @aa_string receives one character per complete codon from #amino_acid.
# @aa_array receives one entry per complete codon from #amino_acid_2, with
# gap characters ('-') passed to it as 'N'. Trailing bases that do not fill a
# codon are ignored.
# @param initial_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames.
#   An offset beyond the end of the sequence leaves both results empty.
# @return [nil] results are read from #aa_string and #aa_array
def translate(initial_position = 0)
  # String#[] returns nil when the start lies past the end; treat that as "no bases"
  reading_frame = @dna[initial_position..-1].to_s
  codons = reading_frame.each_char.each_slice(3).select { |bases| bases.size == 3 }.map(&:join)

  @aa_string = codons.map { |codon| amino_acid(codon) }.join
  @aa_array = codons.map { |codon| amino_acid_2(codon.tr('-', 'N')) }
  nil
end
90
+
91
+ # @return [Integer] length of DNA sequence
112
92
  def dna_length
113
- @dna_sequence.length
93
+ @dna.length
114
94
  end
95
+
96
+ # @return [Integer] length of amino acid sequence
115
97
  def aa_length
116
- @aa_sequence.length
98
+ @aa_string.length
117
99
  end
118
- end
119
-
120
- # generate amino acid abbreviations from 3 bases, ambiguity will return "#"
121
- def self.amino_acid (bases)
122
- case bases
123
- when /^TT[TCY]$/
124
- return "F"
125
- when /^TT[AGR]$/
126
- return "L"
127
- when /^CT.$/
128
- return "L"
129
- when /^AT[TCAHYWM]$/
130
- return "I"
131
- when "ATG"
132
- return "M"
133
- when /^GT.$/
134
- return "V"
135
- when /^TC.$/
136
- return "S"
137
- when /^CC.$/
138
- return "P"
139
- when /^AC.$/
140
- return "T"
141
- when /^GC.$/
142
- return "A"
143
- when /^TA[TCY]$/
144
- return "Y"
145
- when /^TA[AGR]$/
146
- return "*"
147
- when /^T[GR]A$/
148
- return "*"
149
- when /^CA[TCY]$/
150
- return "H"
151
- when /^CA[AGR]$/
152
- return "Q"
153
- when /^AA[TCY]$/
154
- return "N"
155
- when /^AA[AGR]$/
156
- return "K"
157
- when /^GA[TCY]$/
158
- return "D"
159
- when /^GA[AGR]$/
160
- return "E"
161
- when /^TG[TCY]$/
162
- return "C"
163
- when "TGG"
164
- return "W"
165
- when /^CG.$/
166
- return "R"
167
- when /^AG[TCY]$/
168
- return "S"
169
- when /^[AM]G[AGR]$/
170
- return "R"
171
- when /^GG.$/
172
- return "G"
173
- when /^[ATW][CGS][CTY]$/
174
- return "S"
175
- when /^[TCY]T[AGR]$/
176
- return "L"
177
- else
178
- return "#"
179
- end
180
- end
181
-
182
- # keep ambiguities, return all possible amino acids.
183
-
184
- def self.amino_acid_2 (bases)
185
- bases_to_aa = []
186
- aa_list = []
187
- base1 = ViralSeq.to_list(bases[0])
188
- base2 = ViralSeq.to_list(bases[1])
189
- base3 = ViralSeq.to_list(bases[2])
190
- l1 = base1.size - 1
191
- l2 = base2.size - 1
192
- l3 = base3.size - 1
193
- (0..l1).each do |n1|
194
- b1 = base1[n1]
195
- (0..l2).each do |n2|
196
- b2 = base2[n2]
197
- (0..l3).each do |n3|
198
- b3 = base3[n3]
199
- bases_all = b1 + b2 + b3
200
- bases_to_aa << bases_all
100
+
101
+ # resistant mutation interpretation for a chosen region from a translated ViralSeq::Sequence object
102
+ # @param option [Symbol] option of region to interpret, `:hcv_ns5a`, `:hiv_pr`, `:nrti`, `:nnrti`, `:hiv_in`
103
+ # @param start_aa [Integer] the starting aa number of the input sequence
104
+ # @return [Hash] return a Hash object for SDRMs identified. :position => [:wildtype_codon, :mutation_codon]
105
+ # @example examine an HIV PR region sequence for drug resistance mutations
106
+ # my_seq_name = 'a_pr_seq'
107
+ # my_seq = 'CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAAATAGGAGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATAATACAGTATTAGAAGACATGGAGTTACCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATCTGTGGGCATAAAACTACAGGTACAGTGTTAATAGGACCTACACCCGTCAACATAATTGGAAGAGATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTT'
108
+ # s = ViralSeq::Sequence.new(my_seq_name, my_seq)
109
+ # s.translate
110
+ # s.sdrm(:hiv_pr)
111
+ # => {30=>["D", "N"], 88=>["N", "D"]}
112
+
113
# Report resistance mutations found in the translated sequence (#aa_array).
#
# Each position listed in sdrm_hash(option) maps to
# [wild_type_aa, [mutation_aa, ...]]. A single-residue call is reported when
# it differs from the wild type and is one of the listed mutations. An
# ambiguous call (more than one character) is reported only when at least one
# of its residues is a listed mutation.
# @param option [Symbol] region to interpret, passed on to sdrm_hash
# @param start_aa [Integer] the aa number of the first element of #aa_array
# @return [Hash] :position => [:wildtype_aa, :observed_aa]
def sdrm(option, start_aa = 1)
  residues = self.aa_array
  mutation_table = sdrm_hash(option)
  out_hash = {}

  residues.each_with_index do |test_aa, array_position|
    position = start_aa + array_position
    next unless mutation_table.key?(position)

    wt_aa, mutations = mutation_table[position]
    if test_aa.size == 1
      next if wt_aa == test_aa
      out_hash[position] = [wt_aa, test_aa] if mutations.include?(test_aa)
    elsif !(test_aa.split("") & mutations).empty?
      # an empty intersection is still a truthy Array, so test emptiness explicitly
      out_hash[position] = [wt_aa, test_aa]
    end
  end

  out_hash
end # end of sdrm
204
140
 
205
- bases_to_aa.each do |base|
206
- case base
207
- when /^TT[TCY]$/
208
- aa = "F"
209
- when /^TT[AGR]$/
210
- aa = "L"
211
- when /^CT.$/
212
- aa = "L"
213
- when /^AT[TCAHYWM]$/
214
- aa = "I"
215
- when "ATG"
216
- aa = "M"
217
- when /^GT.$/
218
- aa = "V"
219
- when /^TC.$/
220
- aa = "S"
221
- when /^CC.$/
222
- aa = "P"
223
- when /^AC.$/
224
- aa = "T"
225
- when /^GC.$/
226
- aa = "A"
227
- when /^TA[TCY]$/
228
- aa = "Y"
229
- when /^TA[AGR]$/
230
- aa = "*"
231
- when /^T[GR]A$/
232
- aa = "*"
233
- when /^CA[TCY]$/
234
- aa = "H"
235
- when /^CA[AGR]$/
236
- aa = "Q"
237
- when /^AA[TCY]$/
238
- aa = "N"
239
- when /^AA[AGR]$/
240
- aa = "K"
241
- when /^GA[TCY]$/
242
- aa = "D"
243
- when /^GA[AGR]$/
244
- aa = "E"
245
- when /^TG[TCY]$/
246
- aa = "C"
247
- when "TGG"
248
- aa = "W"
249
- when /^CG.$/
250
- aa = "R"
251
- when /^AG[TCY]$/
252
- aa = "S"
253
- when /^[AM]G[AGR]$/
254
- aa = "R"
255
- when /^GG.$/
256
- aa = "G"
257
- when /^[ATW][CGS][CTY]$/
258
- aa = "S"
259
- when /^[TCY]T[AGR]$/
260
- aa = "L"
261
- else
262
- aa = "-"
263
- end
264
- aa_list << aa
265
- end
266
- aa_out = aa_list.uniq.join('/')
267
- return aa_out
268
- end
269
-
270
- # parse ambiguity bases, aka %w{W S M K R Y B D H V N}
271
-
272
- def self.to_list(base = "")
273
- list = []
274
- case base
275
- when /[A|T|C|G]/
276
- list << base
277
- when "W"
278
- list = ['A','T']
279
- when "S"
280
- list = ['C','G']
281
- when "M"
282
- list = ['A','C']
283
- when 'K'
284
- list = ['G','C']
285
- when 'R'
286
- list = ['A','G']
287
- when 'Y'
288
- list = ['C','T']
289
- when 'B'
290
- list = ['C','G','T']
291
- when 'D'
292
- list = ['A','G','T']
293
- when 'H'
294
- list = ['A','C','T']
295
- when 'V'
296
- list = ['A','C','G']
297
- when 'N'
298
- list = ['A','T','C','G']
299
- end
300
- return list
301
- end
302
-
303
- # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
304
- # collapse sequence hash to unique sequence hash.
305
- # input_sequence_hash is a sequence hash {:name => :sequence, ...}
306
- # master_sequence_tag is the master tag for unique sequences
307
- # sequences will be named as (master_sequence_tag + "_" + Integer)
308
-
309
- def self.uniq_sequence_hash(seq = {}, sequence_name = "sequence")
310
- uni = ViralSeq.count(seq.values)
311
- new_seq = {}
312
- n = 1
313
- uni.each do |s,c|
314
- name = ">" + sequence_name + "_" + n.to_s + "_" + c.to_s
315
- new_seq[name] = s
316
- n += 1
317
- end
318
- return new_seq
319
- end
320
-
321
- # input a sequence hash, return a sequence hash with stop codons.
322
- def self.stop_codon_seq_hash(seq_hash, rf = 0)
323
- out_seq_hash = {}
324
- seq_hash.each do |k,v|
325
- sequence = Sequence.new(k,v)
326
- sequence.get_aa_array(rf)
327
- if sequence.aa_array.include?("*")
328
- out_seq_hash[k] = v
141
+ # HIV sequence locator function, resembling HIV Sequence Locator from LANL
142
+ # # current version only supports nucleotide sequence, not for amino acid sequence.
143
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
144
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
145
+ # @return [Array] an array of the following info
146
+ # # start_location (Integer)
147
+ # # end_location (Integer)
148
+ # # percentage_of_similarity_to_reference_sequence (Float)
149
+ # # containing_indel? (Boolean)
150
+ # # aligned_input_sequence (String)
151
+ # # aligned_reference_sequence (String)
152
+ # @example identify the location of the input sequence on the NL43 genome
153
+ # sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
154
+ # s = ViralSeq::Sequence.new('my_sequence', sequence)
155
+ # loc = s.locator(:NL43)
156
+ # h = ViralSeq::SeqHash.new; h.dna_hash['NL43'] = loc[5]; h.dna_hash[s.name] = loc[4]
157
+ # rs_string = h.to_rsphylip.split("\n")[1..-1].join("\n") # get a relaxed phylip format string for display of alignment.
158
+ # puts "The input sequence \"#{s.name}\" is located on the NL43 nt sequence from #{loc[0].to_s} to #{loc[1].to_s}.\nIt is #{loc[2].to_s}% similar to the reference.\nIt #{loc[3]? "does" : "does not"} have indels.\nThe alignment is\n#{rs_string}"
159
+ # => The input sequence "my_sequence" is located on the NL43 nt sequence from 2333 to 2433.
160
+ # => It is 98.0% similar to the reference.
161
+ # => It does not have indels.
162
+ # => The alignment is
163
+ # => NL43 AGCAGATGAT ACAGTATTAG AAGAAATGAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAGTATGAT C
164
+ # => my_sequence AGCAGATGAT ACAGTATTAG AAGAAATAAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAATATGAT C
165
+ # @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
166
+
167
# Locate self.dna on a reference genome by repeated alignment.
# The reference comes from ViralSeq::RefSeq.get(ref_option) (called before the
# begin/rescue, so its errors are not rescued here); every alignment goes
# through ViralSeq::Muscle.align(ref, seq, path_to_muscle), whose element 0 is
# used as the aligned reference and element 1 as the aligned query.
# l1 / l2 accumulate how many reference positions are cut from the head / tail.
# Returns [start, end, similarity_percent, indel_flag, aligned_query, aligned_reference];
# an array of seven zeros when the remaining window is empty; nil (after
# printing diagnostics) when a StandardError is raised inside the begin block.
+ def locator(ref_option = :HXB2, path_to_muscle = false)
168
+ seq = self.dna
169
+ ori_ref = ViralSeq::RefSeq.get(ref_option)
170
+
171
+ begin
172
+ ori_ref_l = ori_ref.size
173
+ l1 = 0
174
+ l2 = 0
175
+
176
# first pass: align the query against the whole reference
+ aln_seq = ViralSeq::Muscle.align(ori_ref, seq, path_to_muscle)
177
+ aln_test = aln_seq[1]
178
# $1 = leading gaps, $2 = aligned core of the query, $3 = trailing gaps
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
179
+ gap_begin = $1.size
180
+ gap_end = $3.size
181
+ aln_test2 = $2
182
+ ref = aln_seq[0]
183
+ ref = ref[gap_begin..(-gap_end-1)]
184
+ ref_size = ref.size
185
# window still more than 1.3x the query length: narrow it around the longest
# uninterrupted A/C/G/T run of the aligned query, then realign
+ if ref_size > 1.3*(seq.size)
186
+ l1 = l1 + gap_begin
187
+ l2 = l2 + gap_end
188
+ max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
189
+ aln_test2 =~ /#{max_seq}/
190
+ before_aln_seq = $`
191
+ before_aln = $`.size
192
+ post_aln_seq = $'
193
+ post_aln = $'.size
194
+ before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
195
+ b1 = (1.3 * before_aln_seq_size).to_i
196
+ post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
197
+ b2 = (1.3 * post_aln_seq_size).to_i
198
+ if (before_aln > seq.size) and (post_aln <= seq.size)
199
+ ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
200
+ l1 = l1 + (before_aln - b1)
201
+ elsif (post_aln > seq.size) and (before_aln <= seq.size)
202
+ ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
203
+ l2 = l2 + post_aln - b2
204
+ elsif (post_aln > seq.size) and (before_aln > seq.size)
205
+ ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
206
+ l1 = l1 + (before_aln - b1)
207
+ l2 = l2 + (post_aln - b2)
208
+ end
209
+
210
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
211
+ aln_test = aln_seq[1]
212
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
213
+ gap_begin = $1.size
214
+ gap_end = $3.size
215
+ ref = aln_seq[0]
216
+ ref = ref[gap_begin..(-gap_end-1)]
217
+ end
218
+
219
+ aln_test = aln_seq[1]
220
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
221
+ gap_begin = $1.size
222
+ gap_end = $3.size
223
+ aln_test = $2
224
# s1 / g1: first word run of the aligned query and the gap run after it;
# s2 / g2: last word run and the gap run before it
+ aln_test =~ /^(\w+)(\-*)\w/
225
+ s1 = $1.size
226
+ g1 = $2.size
227
+ aln_test =~ /\w(\-*)(\w+)$/
228
+ s2 = $2.size
229
+ g2 = $1.size
230
+
231
+ l1 = l1 + gap_begin
232
+ l2 = l2 + gap_end
233
+ repeat = 0
234
+
235
# a terminal fragment separated by a gap more than twice its own length is
# cut off the reference window and the alignment is repeated (repeat = 1)
+ if g1 == g2 and (s1 + g1 + s2) == ref.size
236
+ if s1 > s2 and g2 > 2*s2
237
+ ref = ref[0..(-g2-1)]
238
+ repeat = 1
239
+ l2 = l2 + g2
240
+ elsif s1 < s2 and g1 > 2*s1
241
+ ref = ref[g1..-1]
242
+ repeat = 1
243
+ l1 = l1 + g1
244
+ end
245
+ else
246
+ if g1 > 2*s1
247
+ ref = ref[g1..-1]
248
+ repeat = 1
249
+ l1 = l1 + g1
250
+ end
251
+ if g2 > 2*s2
252
+ ref = ref[0..(-g2 - 1)]
253
+ repeat = 1
254
+ l2 = l2 + g2
255
+ end
256
+ end
257
+
258
# realign against the trimmed window until neither terminal gap exceeds
# twice the length of the fragment beside it
+ while repeat == 1
259
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
260
+ aln_test = aln_seq[1]
261
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
262
+ gap_begin = $1.size
263
+ gap_end = $3.size
264
+ aln_test = $2
265
+ aln_test =~ /^(\w+)(\-*)\w/
266
+ s1 = $1.size
267
+ g1 = $2.size
268
+ aln_test =~ /\w(\-*)(\w+)$/
269
+ s2 = $2.size
270
+ g2 = $1.size
271
+ ref = aln_seq[0]
272
+ ref = ref[gap_begin..(-gap_end-1)]
273
+ l1 = l1 + gap_begin
274
+ l2 = l2 + gap_end
275
+ repeat = 0
276
+ if g1 > 2*s1
277
+ ref = ref[g1..-1]
278
+ repeat = 1
279
+ l1 = l1 + g1
280
+ end
281
+ if g2 > 2*s2
282
+ ref = ref[0..(-g2 - 1)]
283
+ repeat = 1
284
+ l2 = l2 + g2
285
+ end
286
+ end
# rebuild the window from the untouched reference using the accumulated offsets
+ ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
288
+
289
+
290
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
291
+ aln_test = aln_seq[1]
292
+ ref = aln_seq[0]
293
+
294
+ #refine alignment
295
+
296
# gaps at the start of the aligned reference widen the window at the head;
# otherwise gaps at its end shrink it at the tail (only one of the two is applied)
+ if ref =~ /^(\-+)/
297
+ l1 = l1 - $1.size
298
+ elsif ref =~ /(\-+)$/
299
+ l2 = l2 + $1.size
300
+ end
301
+
302
+ if (ori_ref_l - l2 - 1) >= l1
303
+ ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
304
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
305
+ aln_test = aln_seq[1]
306
+ ref = aln_seq[0]
307
+
308
# similarity = share of alignment columns where both characters are identical
+ ref_size = ref.size
309
+ sim_count = 0
310
+ (0..(ref_size-1)).each do |n|
311
+ ref_base = ref[n]
312
+ test_base = aln_test[n]
313
+ sim_count += 1 if ref_base == test_base
314
+ end
315
+ similarity = (sim_count/ref_size.to_f*100).round(1)
316
+
317
# convert the 0-based offsets into 1-based start / end positions
+ loc_p1 = l1 + 1
318
+ loc_p2 = ori_ref_l - l2
319
+ if seq.size != (loc_p2 - loc_p1 + 1)
320
+ indel = true
321
+ elsif aln_test.include?("-")
322
+ indel = true
323
+ else
324
+ indel = false
325
+ end
326
+ return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
327
+ else
328
# NOTE(review): seven zeros here versus six elements on success — confirm what callers expect
+ return [0,0,0,0,0,0,0]
329
+ end
330
+ rescue => e
331
+ puts "Unexpected error occured."
332
+ puts "Exception Class: #{ e.class.name }"
333
+ puts "Exception Message: #{ e.message }"
334
+ puts "Exception Backtrace: #{ e.backtrace[0] }"
335
# NOTE(review): the message names ViralSeq.sequence_locator rather than this method — presumably a leftover; confirm
+ puts "ViralSeq.sequence_locator returns nil"
336
+ return nil
329
337
  end
330
- end
331
- return out_seq_hash
332
- end
333
-
334
- end
335
-
336
- # functions added to Class::String for direct operation on sequence if it is a String object
337
- # String.rc
338
- # # reverse complement
339
- # # example
340
- # "ACAGA".rc
341
- # => "TCTGT"
342
- #
343
- # String.mutation(error_rate)
344
- # # mutate a nt sequence (String class) randomly
345
- # # must define error rate, default value 0.01, aka 1%
346
- # =USAGE
347
- # # example
348
- # seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
349
- # seq.mutation(0.05)
350
- # => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
351
- #
352
- # String.nt_parser
353
- # # parse the nucleotide sequences as a String object and return a Regexp object for possible matches
354
- # =USAGE
355
- # "ATRWCG".nt_parser
356
- # => /AT[A|G][A|T]CG/
357
-
358
- class String
359
- # direct function of calling reverse complement on String class
360
- def rc
361
- self.reverse.tr("ACTG","TGAC")
362
- end
363
-
364
- def mutation(error_rate = 0.01)
365
- new_string = ""
366
- self.split("").each do |nt|
367
- pool = ["A","C","T","G"]
368
- pool.delete(nt)
369
- s = error_rate * 10000
370
- r = rand(10000)
371
- if r < s
372
- nt = pool.sample
338
+ end # end of locator
339
+
340
+ # Given start and end positions on the reference genome, return a sub-sequence of the target sequence in that range
341
+ # @param p1 [Integer] start position number on the reference genome
342
+ # @param p2 [Integer] end position number on the reference genome
343
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
344
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
345
+ # @return [ViralSeq::Sequence, nil] a new ViralSeq::Sequence object that of input range on the reference genome or nil
346
+ # if either the start or end position is beyond the range of the target sequence.
347
+ # @example trim a sequence to fit in the range of [2333, 2433] on the HXB2 nt reference
348
+ # seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
349
+ # s = ViralSeq::Sequence.new('my_seq', seq)
350
+ # s.sequence_clip(2333, 2433, :HXB2).dna
351
+ # => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
352
+
353
# Cut out the part of this sequence that covers reference positions p1..p2.
#
# The sequence is first placed on the reference with #locator; the aligned
# reference is then walked from both ends, counting alignment columns until
# the reference coordinate reaches p1 (from the left) and p2 (from the right).
# Gap columns in the reference advance the column count but not the coordinate.
# @param p1 [Integer] start position number on the reference genome
# @param p2 [Integer] end position number on the reference genome
# @param ref_option [Symbol], name of reference genomes, passed on to #locator
# @param path_to_muscle [String], path to the muscle executable, passed on to #locator
# @return [ViralSeq::Sequence, nil] the clipped sequence with alignment gaps
#   removed; nil when p1..p2 is not inside the located range or when #locator
#   did not produce an alignment
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  loc = self.locator(ref_option, path_to_muscle)
  # locator returns nil on error and an all-zero array when it finds no window;
  # neither carries the aligned strings needed below
  return nil unless loc && loc[4].is_a?(String) && loc[5].is_a?(String)

  l1 = loc[0]
  l2 = loc[1]
  return nil unless p1 >= l1 && p2 <= l2

  seq = loc[4]
  ref = loc[5]

  # g1: number of alignment columns to drop from the left
  g1 = 0
  ref.each_char do |char|
    break if l1 == p1
    g1 += 1
    l1 += 1 unless char == "-"
  end

  # g2: negative-index distance of the last kept column from the right
  g2 = 1
  ref.reverse.each_char do |char|
    break if l2 == p2
    g2 += 1
    l2 -= 1 unless char == "-"
  end

  ViralSeq::Sequence.new(self.name, seq[g1..(-g2)].tr("-", ""))
end
376
- return new_string
377
- end
378
-
379
- def nt_parser
380
- match = ""
381
- self.each_char.each do |base|
382
- base_array = ViralSeq.to_list(base)
383
- if base_array.size == 1
384
- match += base_array[0]
377
+
378
+ # start of private functions
379
+ private
380
+
381
+ # generate amino acid abbreviations from 3 bases, ambiguity will return "#"
382
# Translate one codon into its one-letter amino acid code.
#
# IUPAC ambiguity codes are accepted wherever every expansion of the codon
# encodes the same residue (e.g. "TTY" => "F", "YTR" => "L", "MGR" => "R",
# "TRA" => "*"). Any other codon returns "#".
#
# The former catch-all /^[ATW][CGS][CTY]$/ => "S" has been removed: every
# codon that reached it (WCY, WGY, ASY, TSY, WSY ...) also expands to a
# threonine (ACY) or cysteine (TGY) codon, so it is ambiguous and must be
# reported as "#".
#
# @param bases [String] a three-character codon
# @return [String] one-letter amino acid, "*" for a stop codon, or "#" when
#   the codon is ambiguous or not recognized
def amino_acid(bases)
  case bases
  when /^TT[TCY]$/ then "F"
  when /^TT[AGR]$/, /^CT.$/, /^[TCY]T[AGR]$/ then "L"
  when /^AT[TCAHYWM]$/ then "I"
  when "ATG" then "M"
  when /^GT.$/ then "V"
  when /^TC.$/, /^AG[TCY]$/ then "S"
  when /^CC.$/ then "P"
  when /^AC.$/ then "T"
  when /^GC.$/ then "A"
  when /^TA[TCY]$/ then "Y"
  when /^TA[AGR]$/, /^T[GR]A$/ then "*"
  when /^CA[TCY]$/ then "H"
  when /^CA[AGR]$/ then "Q"
  when /^AA[TCY]$/ then "N"
  when /^AA[AGR]$/ then "K"
  when /^GA[TCY]$/ then "D"
  when /^GA[AGR]$/ then "E"
  when /^TG[TCY]$/ then "C"
  when "TGG" then "W"
  when /^CG.$/, /^[AM]G[AGR]$/ then "R"
  when /^GG.$/ then "G"
  else "#"
  end
end
442
+
443
+ # keep ambiguities, return all possible amino acids.
444
+
445
# Translate a codon while keeping ambiguity: every base is expanded with
# String#to_list (defined elsewhere in this library; presumably it returns
# the concrete bases an IUPAC code stands for), every combination of the
# three expansions is translated, and the distinct residues are concatenated
# in first-seen order.
#
# A combination that cannot be translated contributes "-". As in #amino_acid,
# the former /^[ATW][CGS][CTY]$/ => "S" fallback is gone because such codons
# are not unambiguously serine.
#
# @param bases [String] a three-character codon, may contain ambiguity codes
# @return [String] all distinct one-letter amino acids the codon can encode;
#   empty when any base expands to nothing
def amino_acid_2(bases)
  first = bases[0].to_list
  second = bases[1].to_list
  third = bases[2].to_list

  # Array#product walks the combinations in the same order as three nested
  # loops (first base outermost), so residue order in the result is stable.
  codons = first.product(second, third).map { |b1, b2, b3| b1 + b2 + b3 }

  residues = codons.map do |codon|
    case codon
    when /^TT[TCY]$/ then "F"
    when /^TT[AGR]$/, /^CT.$/, /^[TCY]T[AGR]$/ then "L"
    when /^AT[TCAHYWM]$/ then "I"
    when "ATG" then "M"
    when /^GT.$/ then "V"
    when /^TC.$/, /^AG[TCY]$/ then "S"
    when /^CC.$/ then "P"
    when /^AC.$/ then "T"
    when /^GC.$/ then "A"
    when /^TA[TCY]$/ then "Y"
    when /^TA[AGR]$/, /^T[GR]A$/ then "*"
    when /^CA[TCY]$/ then "H"
    when /^CA[AGR]$/ then "Q"
    when /^AA[TCY]$/ then "N"
    when /^AA[AGR]$/ then "K"
    when /^GA[TCY]$/ then "D"
    when /^GA[AGR]$/ then "E"
    when /^TG[TCY]$/ then "C"
    when "TGG" then "W"
    when /^CG.$/, /^[AM]G[AGR]$/ then "R"
    when /^GG.$/ then "G"
    else "-"
    end
  end

  residues.uniq.join
end
530
+
531
+ # sdrm position hash
532
# Surveillance drug-resistance mutation (SDRM) table for one target region.
#
# @param options [Symbol] one of :hcv_ns5a, :nrti, :nnrti, :hiv_pr, :hiv_in
# @return [Hash{Integer => Array}] amino acid position =>
#   [wild-type residue, [resistance-associated residues]]; a new Hash is
#   built on every call
# @raise [RuntimeError] when +options+ is not one of the supported symbols
def sdrm_hash(options)
  case options
  when :hcv_ns5a
    {
      28 => ['M', %w[T]],
      30 => ['L', %w[H K R Q A S D]],
      31 => ['L', %w[M V F]],
      32 => ['P', %w[L]],
      44 => ['K', %w[R]],
      58 => ['H', %w[D P S]],
      64 => ['T', %w[A S]],
      77 => ['P', %w[A S]],
      78 => ['R', %w[K]],
      79 => ['T', %w[A]],
      83 => ['T', %w[M]],
      85 => ['S', %w[N H Y]],
      92 => ['A', %w[P T K E]],
      93 => ['Y', %w[C F H N]],
      107 => ['K', %w[T S]],
      121 => ['I', %w[V]],
      135 => ['T', %w[A]]
    }
  when :nrti
    {
      41 => ['M', %w[L]],
      65 => ['K', %w[R]],
      67 => ['D', %w[N G E]],
      69 => ['T', %w[D]],
      70 => ['K', %w[R E]],
      74 => ['L', %w[V I]],
      75 => ['V', %w[M T A S]],
      77 => ['F', %w[L]],
      115 => ['Y', %w[F]],
      116 => ['F', %w[Y]],
      151 => ['Q', %w[M]],
      184 => ['M', %w[V I]],
      210 => ['L', %w[W]],
      215 => ['T', %w[Y F I C D V E]],
      219 => ['K', %w[Q E N R]]
    }
  when :nnrti
    {
      100 => ['L', %w[I]],
      101 => ['K', %w[E P]],
      103 => ['K', %w[N S]],
      106 => ['V', %w[M A]],
      179 => ['V', %w[F D]],
      181 => ['Y', %w[C I V]],
      188 => ['Y', %w[L H C]],
      190 => ['G', %w[A S E]],
      225 => ['P', %w[H]],
      230 => ['M', %w[L]]
    }
  when :hiv_pr
    {
      23 => ['L', %w[I]],
      24 => ['L', %w[I]],
      30 => ['D', %w[N]],
      32 => ['V', %w[I]],
      46 => ['M', %w[I L]],
      47 => ['I', %w[V A]],
      48 => ['G', %w[V M]],
      50 => ['I', %w[V L]],
      53 => ['F', %w[L]],
      54 => ['I', %w[V L M T A S]],
      73 => ['G', %w[S T C A]],
      76 => ['L', %w[V]],
      82 => ['V', %w[A T S F L C M]],
      83 => ['N', %w[D]],
      84 => ['I', %w[V A C]],
      88 => ['N', %w[D S]],
      90 => ['L', %w[M]]
    }
  when :hiv_in
    {
      66 => ['T', %w[A I K]],
      74 => ['L', %w[M]],
      92 => ['E', %w[Q]],
      95 => ['Q', %w[K]],
      97 => ['T', %w[A]],
      121 => ['F', %w[Y]],
      140 => ['G', %w[A S C]],
      143 => ['Y', %w[C H R]],
      147 => ['S', %w[G]],
      148 => ['Q', %w[H K R]],
      155 => ['N', %w[S H]]
    }
  else
    raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
  end
end
390
- Regexp.new match
391
- end
392
- end
614
+ end # end of ViralSeq::Sequence
615
+ end # end of ViralSeq