viral_seq 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,219 @@
1
+
2
+ module ViralSeq
3
+
4
+ # Class for paired-end sequences.
5
+ # @example initialize a new SeqHashPair object from a directory containing paired-end sequences
6
+ # my_seqhashpair = ViralSeq::SeqHashPair.fa('my_seq_directory')
7
+ # @example join the paired-end sequences with an overlap of 100 bp
8
+ # my_seqhashpair.join1(100)
9
+ # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
10
+ # my_seqhashpair.join2(:indiv)
11
+
12
+ class SeqHashPair
13
+
14
+ # Build a SeqHashPair from a Hash of paired reads (stored in @dna_hash), a title (@title) and the source files (@file); every argument is optional.
15
+
16
+ def initialize (dna_hash = {}, title = "", file = [])
17
+ @dna_hash = dna_hash # shape per the accessor docs: name => [r1_sequence, r2_sequence]
18
+ @title = title
19
+ @file = file # [r1_file, r2_file] when built by .new_from_fasta, otherwise whatever the caller passes
20
+ end
21
+
22
+ # @return [Hash] Hash object for :name => [:r1_sequence_string, :r2_sequence_string]
23
+
24
+ attr_accessor :dna_hash
25
+
26
+ # @return [String] the title of the SeqHashPair object.
27
+ # defaults to the directory basename if the SeqHashPair object is initialized using ::fa
28
+
29
+ attr_accessor :title
30
+
31
+ # @return [Array<String>] the r1 and r2 files that are used to initialize the SeqHashPair object, if they exist
32
+
33
+ attr_accessor :file
34
+
35
+ # initialize a new ViralSeq::SeqHashPair object from a directory containing paired sequence files in the FASTA format
36
+ # @param indir [String] directory containing paired sequence files in the FASTA format,
37
+ #
38
+ # Paired sequence files need to have "r1" and "r2" in their file names
39
+ #
40
+ # Example for the file structure
41
+ # ├───lib1
42
+ # │ lib1_r1.txt
43
+ # │ lib1_r2.txt
44
+ # The sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
45
+ # @return [ViralSeq::SeqHashPair] new SeqHashPair object from the paired FASTA sequence files
46
+ # @example initialize a new SeqHashPair object from a directory containing paired-end sequences
47
+ # my_seqhashpair = ViralSeq::SeqHashPair.fa('spec/sample_paired_seq')
48
+
49
+ def self.new_from_fasta(indir) # pairs the R1 and R2 records found in the files under indir into one SeqHashPair
50
+ files = Dir[indir + "/*"] # every entry directly under indir
51
+ r1_file = "" # stays "" when no file name contains "r1"
52
+ r2_file = "" # stays "" when no file name contains "r2"
53
+ files.each do |f|
54
+ if File.basename(f) =~ /r1/i # case-insensitive match on the file name only; a later match overwrites an earlier one
55
+ r1_file = f
56
+ elsif File.basename(f) =~ /r2/i
57
+ r2_file = f
58
+ end
59
+ end
60
+
61
+ seq1 = ViralSeq::SeqHash.fa(r1_file).dna_hash # NOTE(review): SeqHash.fa is defined elsewhere; presumably parses a FASTA file into name => sequence - confirm
62
+ seq2 = ViralSeq::SeqHash.fa(r2_file).dna_hash
63
+
64
+ new_seq1 = seq1.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v} # drop the last 3 characters of each name so R1 and R2 names line up
65
+ new_seq2 = seq2.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
66
+
67
+ seq_pair_hash = {}
68
+
69
+ new_seq1.each do |seq_name,seq| # driven by the R1 names; R2-only records are not carried over
70
+ seq_pair_hash[seq_name] = [seq, new_seq2[seq_name]] # the R2 slot is nil when no R2 name matches
71
+ end
72
+ seq_hash = ViralSeq::SeqHashPair.new
73
+ seq_hash.dna_hash = seq_pair_hash
74
+ seq_hash.title = File.basename(indir,".*") # directory base name with any extension removed
75
+ seq_hash.file = [r1_file, r2_file]
76
+ return seq_hash
77
+ end # end of .new_from_fasta
78
+
79
+ class << self
80
+ alias_method :fa, :new_from_fasta # SeqHashPair.fa is shorthand for SeqHashPair.new_from_fasta
81
+ end
82
+
83
+ # Pair-end join function for KNOWN overlap size.
84
+ # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
85
+ # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
86
+ # @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
87
+ # @example join paired-end sequences with different :diff cut-offs, overlap provided.
88
+ # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
89
+ # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
90
+ # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
91
+ # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
92
+ # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
93
+ # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
94
+ # my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seqs)
95
+ # my_seqhashpair.join1(100).dna_hash.keys
96
+ # => [">pair1"]
97
+ # my_seqhashpair.join1(100,0.01).dna_hash.keys
98
+ # => [">pair1", ">pair2"]
99
+ # my_seqhashpair.join1(100,0.02).dna_hash.keys
100
+ # => [">pair1", ">pair2", ">pair3"]
101
+
102
+ def join1(overlap = 0, diff = 0.0)
103
+ seq_pair_hash = self.dna_hash
104
+ raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
105
+ raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
106
+ joined_seq = {}
107
+ seq_pair_hash.each do |seq_name, seq_pair|
108
+ r1_seq = seq_pair[0]
109
+ r2_seq = seq_pair[1]
110
+ if overlap.zero?
111
+ joined_seq[seq_name] = r1_seq + r2_seq
112
+ elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
113
+ joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
114
+ else
115
+ next
116
+ end
117
+ end
118
+ joined_seq_hash = ViralSeq::SeqHash.new
119
+ joined_seq_hash.dna_hash = joined_seq
120
+ joined_seq_hash.title = self.title + "_joined"
121
+ joined_seq_hash.file = File.dirname(self.file[0]) if self.file.size > 0
122
+ return joined_seq_hash
123
+ end # end of join1
124
+
125
+
126
+ # Pair-end join function for UNKNOWN overlap.
127
+ # @param model [Symbol] models used to determine the overlap, `:con`, `:indiv`
128
+ #
129
+ # model `:con`: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
130
+ #
131
+ # note: minimal overlap as 4 bases.
132
+ # model `:indiv`: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
133
+ # @param diff (see #join1)
134
+ # @return (see #join1)
135
+ # @example join paired-end sequences, overlap NOT provided
136
+ # paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
137
+ # ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
138
+ # ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"] }
139
+ # my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
140
+ # my_seqhashpair.join2.dna_hash
141
+ # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
142
+ # my_seqhashpair.join2(:indiv).dna_hash
143
+ # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
144
+
145
+ def join2(model = :con, diff = 0.0) # joins pairs when the overlap size is unknown; returns nil (after printing the message) on an ArgumentError
146
+ seq_pair_hash = self.dna_hash
147
+ begin
148
+ raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
149
+ if model == :con
150
+ overlap = determine_overlap_pid_pair(seq_pair_hash, diff) # one consensus overlap size for the whole set
151
+ return self.join1(overlap, diff) # an ArgumentError raised inside join1 is also caught by the rescue below
152
+ elsif model == :indiv
153
+ joined_seq = {}
154
+ seq_pair_hash.each do |seq_name, seq_pair|
155
+ overlap_list = [] # overlap sizes that pass the mismatch cut-off for this pair
156
+ overlap_matrix(seq_pair[0], seq_pair[1]).each do |overlap1, diff_nt|
157
+ cut_off_base = overlap1 * diff # mismatch allowance scales with the overlap size
158
+ overlap_list << overlap1 if diff_nt <= cut_off_base
159
+ end
160
+ if overlap_list.empty?
161
+ joined_seq[seq_name] = seq_pair[0] + seq_pair[1] # no acceptable overlap: plain concatenation
162
+ else
163
+ overlap = overlap_list.max # prefer the largest acceptable overlap
164
+ joined_seq[seq_name] = seq_pair[0] + seq_pair[1][overlap..-1]
165
+ end
166
+ end
167
+ joined_seq_hash = ViralSeq::SeqHash.new
168
+ joined_seq_hash.dna_hash = joined_seq
169
+ joined_seq_hash.title = self.title + "_joined"
170
+ joined_seq_hash.file = File.dirname(self.file[0]) if self.file.size > 0 # records the directory of the first source file
171
+ return joined_seq_hash
172
+ else
173
+ raise ArgumentError.new("Error::Wrong Overlap Model Argument. Given \`#{model}\`, expected `:con` or `:indiv`.")
174
+ end
175
+ rescue ArgumentError => e
176
+ puts e # the error is reported on stdout rather than propagated to the caller
177
+ return nil
178
+ end
179
+ end # end of join2
180
+
181
+ private
182
+ # Pick one consensus overlap size for all pairs in seq_pair_hash: take each pair's largest acceptable overlap (0 when none) and return the most frequent value, preferring the larger size on ties.
183
+ def determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
184
+ overlaps = [] # one chosen overlap size per sequence pair
185
+ seq_pair_hash.each do |_seq_name, seq_pair|
186
+ overlap_list = []
187
+ matrix = overlap_matrix(seq_pair[0], seq_pair[1]) # {overlap_size => mismatch value}
188
+ matrix.each do |overlap, diff_nt|
189
+ cut_off_base = overlap * diff # mismatch allowance scales with the overlap size
190
+ overlap_list << overlap if diff_nt <= cut_off_base
191
+ end
192
+ if overlap_list.empty?
193
+ overlaps << 0 # no candidate passed the cut-off for this pair
194
+ else
195
+ overlaps << overlap_list.max
196
+ end
197
+ end
198
+ count_overlaps = overlaps.count_freq # NOTE(review): count_freq is defined elsewhere; presumably returns {value => occurrences} - confirm
199
+ max_value = count_overlaps.values.max
200
+ max_overlap_list = []
201
+ count_overlaps.each {|overlap, counts| max_overlap_list << overlap if counts == max_value}
202
+ max_overlap_list.max # nil when nothing was tallied, presumably the case for an empty seq_pair_hash - confirm
203
+ end # end of determine_overlap_pid_pair
204
+
205
+ # input a pair of sequences as String, return a Hash object of overlapping Hash object
206
+ # {:overlap_size => number_of_differnt_positions, ...}
207
+ # {minimal overlap set to 4. }
208
+ def overlap_matrix(sequence1, sequence2)
209
+ min_overlap = 4
210
+ max_overlap = [sequence1.size, sequence2.size].max
211
+ matrix_hash = {}
212
+ (min_overlap..max_overlap).each do |overlap|
213
+ matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
214
+ end
215
+ return matrix_hash
216
+ end # end of overlap_matrix
217
+
218
+ end # end of SeqHashPair
219
+ end # end of ViralSeq
@@ -1,392 +1,615 @@
1
- # lib/sequence.rb
2
- # Includes functions for sequence operations
3
- # Including methods as:
4
- # ViralSeq::AMINO_ACID_LIST
5
- # ViralSeq::Sequence
6
- # ViralSeq::Sequence#rev_complement
7
- # ViralSeq::Sequence#get_aa_sequence
8
- # ViralSeq::Sequence#get_aa_array
9
- # ViralSeq::Sequence#name
10
- # ViralSeq::Sequence#dna_sequence
11
- # ViralSeq::Sequence#aa_sequence
12
- # ViralSeq::Sequence#aa_array
13
- # ViralSeq::amino_acid
14
- # ViralSeq::amino_acid_2
15
- # ViralSeq::to_list
16
- # ViralSeq::uniq_sequence_hash
17
- # ViralSeq::stop_codon_seq_hash
18
- # String#rc
19
- # String#mutation
20
- # String#nt_parser
21
-
22
- # ViralSeq::AMINO_ACID_LIST
23
- # # Array of all amino acid one letter abbreviations
24
-
25
- # ViralSeq::Sequence
26
- # # Sequence class
27
- # =USAGE
28
- # # create a sequence object
29
- # seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
30
- #
31
- # # print dna sequence
32
- # puts seq.dna_sequence
33
- #
34
- # # reserce complement sequence of DNA sequence, return as a string
35
- # seq.rev_complement
36
- #
37
- # # change @dna_sequence to reverse complement DNA sequence
38
- # seq.rev_complement!
39
- #
40
- # # generate amino acid sequences. either return string or array.
41
- # # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
42
- # # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
43
- # seq.get_aa_sequence
44
- # # or
45
- # seq.get_aa_array
46
- #
47
- # # print amino acid sequence
48
- # puts seq.aa_sequence
49
-
50
- # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
51
- # # collapse sequence hash to unique sequence hash.
52
- # # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
53
- # # master_sequence_tag is the master tag for unique sequences
54
- # # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
55
- # =USAGE
56
- # sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
57
- # '>seq4' => 'CCCC', '>seq5' => 'CCCC',
58
- # '>seq6' => 'TTTT' }
59
- # uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
60
- # => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
61
1
 
62
2
  module ViralSeq
63
3
 
64
- # array for all amino acid one letter abbreviations
65
- AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
66
-
67
- # sequence class
4
+ # ViralSeq::Sequence class for sequence operation
5
+ #
6
+ # @example create a sequence object
7
+ # seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
8
+ # => #<ViralSeq::Sequence:0x00007fd03c8c10b8 @name="my_sequence", @dna="ACCTAGGTTCGGAGC", @aa_string="", @aa_array=[]>
9
+ #
10
+ # @example return dna sequence as String
11
+ # seq.dna
12
+ # => "ACCTAGGTTCGGAGC"
13
+ #
14
+ # @example reverse complement sequence of DNA sequence
15
+ # seq.rc
16
+ # => "GCTCCGAACCTAGGT"
17
+ #
18
+ # @example change @dna to reverse complement DNA sequence
19
+ # seq.rc!
20
+ #
21
+ # @example translate the DNA sequence, return values for @aa_string and @aa_array
22
+ # seq = ViralSeq::Sequence.new('my_sequence', 'AWTCGRAGAG')
23
+ # seq.translate(1)
24
+ # seq.aa_string
25
+ # => "##E"
26
+ # seq.aa_array
27
+ # => ["IF", "EG", "E"]
68
28
 
69
29
  class Sequence
30
+ # initialize a ViralSeq::Sequence class with sequence name (default as '>sequence')
31
+ # and DNA sequence as String object
70
32
  def initialize (name = ">sequence",dna_sequence ="")
71
33
  @name = name
72
- @dna_sequence = dna_sequence.upcase
73
- @aa_sequence = ""
34
+ @dna = dna_sequence.upcase
35
+ @aa_string = ""
74
36
  @aa_array = []
75
37
  end
76
38
 
77
- attr_accessor :name, :dna_sequence, :aa_sequence, :aa_array
39
+ # @return [String] sequence tag name
40
+ attr_accessor :name
41
+
42
+ # @return [String] DNA sequence
43
+ attr_accessor :dna
44
+
45
+ # @return [String] amino acid sequence
46
+ attr_accessor :aa_string
47
+
48
+ # @return [Array] amino acid sequence as an Array object,
49
+ # ambiguity dna sequence will be translated in all possible amino acid sequence at the position
50
+ attr_accessor :aa_array
78
51
 
52
+ # @return [String] reverse complement sequence of the @dna.
79
53
  def rev_complement
80
- @dna_sequence.reverse.upcase.tr('ATCG','TAGC')
54
+ @dna.rc
81
55
  end
56
+
57
+ # replace the @dna with reverse complement DNA sequence.
82
58
  def rev_complement!
83
- @dna_sequence = @dna_sequence.reverse.upcase.tr('ATCG','TAGC')
59
+ @dna = @dna.rc
84
60
  end
85
61
 
86
- def get_aa_sequence(initial_position = 0)
87
- @aa_sequence = ""
88
- require_sequence = @dna_sequence[initial_position..-1]
62
+ alias_method :rc, :rev_complement
63
+ alias_method :rc!, :rev_complement!
64
+
65
+ # translate @dna to amino acid sequence.
66
+ # generate values for @aa_string and @aa_array
67
+ # @param initial_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
68
+
69
+ def translate(initial_position = 0)
70
+ @aa_string = ""
71
+ require_sequence = @dna[initial_position..-1]
89
72
  base_array = []
90
73
  require_sequence.each_char {|base| base_array << base}
91
74
  while (base_array.length>=3) do
92
75
  base_3= ""
93
76
  3.times {base_3 += base_array.shift}
94
- @aa_sequence << amino_acid(base_3)
77
+ @aa_string << amino_acid(base_3)
95
78
  end
96
- return @aa_sequence
97
- end
98
79
 
99
- # get amino acid calls, return a array.keep ambiguity calls.
100
- def get_aa_array(initial_position = 0)
101
80
  @aa_array = []
102
- require_sequence = @dna_sequence[initial_position..-1].tr('-','N')
81
+ require_sequence = @dna[initial_position..-1].tr('-','N')
103
82
  base_array = []
104
83
  require_sequence.each_char {|base| base_array << base}
105
84
  while (base_array.length>=3) do
106
85
  base_3= ""
107
86
  3.times{base_3 += base_array.shift}
108
- @aa_array<< ViralSeq.amino_acid_2(base_3)
87
+ @aa_array<< amino_acid_2(base_3)
109
88
  end
110
- return @aa_array
111
89
  end
90
+
91
+ # @return [Integer] length of DNA sequence
112
92
  def dna_length
113
- @dna_sequence.length
93
+ @dna.length
114
94
  end
95
+
96
+ # @return [Integer] length of amino acid sequence
115
97
  def aa_length
116
- @aa_sequence.length
98
+ @aa_string.length
117
99
  end
118
- end
119
-
120
- # generate amino acid abbreviations from 3 bases, ambiguity will return "#"
121
- def self.amino_acid (bases)
122
- case bases
123
- when /^TT[TCY]$/
124
- return "F"
125
- when /^TT[AGR]$/
126
- return "L"
127
- when /^CT.$/
128
- return "L"
129
- when /^AT[TCAHYWM]$/
130
- return "I"
131
- when "ATG"
132
- return "M"
133
- when /^GT.$/
134
- return "V"
135
- when /^TC.$/
136
- return "S"
137
- when /^CC.$/
138
- return "P"
139
- when /^AC.$/
140
- return "T"
141
- when /^GC.$/
142
- return "A"
143
- when /^TA[TCY]$/
144
- return "Y"
145
- when /^TA[AGR]$/
146
- return "*"
147
- when /^T[GR]A$/
148
- return "*"
149
- when /^CA[TCY]$/
150
- return "H"
151
- when /^CA[AGR]$/
152
- return "Q"
153
- when /^AA[TCY]$/
154
- return "N"
155
- when /^AA[AGR]$/
156
- return "K"
157
- when /^GA[TCY]$/
158
- return "D"
159
- when /^GA[AGR]$/
160
- return "E"
161
- when /^TG[TCY]$/
162
- return "C"
163
- when "TGG"
164
- return "W"
165
- when /^CG.$/
166
- return "R"
167
- when /^AG[TCY]$/
168
- return "S"
169
- when /^[AM]G[AGR]$/
170
- return "R"
171
- when /^GG.$/
172
- return "G"
173
- when /^[ATW][CGS][CTY]$/
174
- return "S"
175
- when /^[TCY]T[AGR]$/
176
- return "L"
177
- else
178
- return "#"
179
- end
180
- end
181
-
182
- # keep ambiguities, return all possible amino acids.
183
-
184
- def self.amino_acid_2 (bases)
185
- bases_to_aa = []
186
- aa_list = []
187
- base1 = ViralSeq.to_list(bases[0])
188
- base2 = ViralSeq.to_list(bases[1])
189
- base3 = ViralSeq.to_list(bases[2])
190
- l1 = base1.size - 1
191
- l2 = base2.size - 1
192
- l3 = base3.size - 1
193
- (0..l1).each do |n1|
194
- b1 = base1[n1]
195
- (0..l2).each do |n2|
196
- b2 = base2[n2]
197
- (0..l3).each do |n3|
198
- b3 = base3[n3]
199
- bases_all = b1 + b2 + b3
200
- bases_to_aa << bases_all
100
+
101
+ # resistant mutation interpretation for a chosen region from a translated ViralSeq::Sequence object
102
+ # @param option [Symbol] option of region to interpret, `:hcv_ns5a`, `:hiv_pr`, `:nrti`, `:nnrti`, `:hiv_in`
103
+ # @param start_aa [Integer] the starting aa number of the input sequence
104
+ # @return [Hash] return a Hash object for SDRMs identified. :position => [:wildtype_codon, :mutation_codon]
105
+ # @example examine an HIV PR region sequence for drug resistance mutations
106
+ # my_seq_name = 'a_pr_seq'
107
+ # my_seq = 'CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAAATAGGAGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATAATACAGTATTAGAAGACATGGAGTTACCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATCTGTGGGCATAAAACTACAGGTACAGTGTTAATAGGACCTACACCCGTCAACATAATTGGAAGAGATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTT'
108
+ # s = ViralSeq::Sequence.new(my_seq_name, my_seq)
109
+ # s.translate
110
+ # s.sdrm(:hiv_pr)
111
+ # => {30=>["D", "N"], 88=>["N", "D"]}
112
+
113
+ def sdrm(option, start_aa = 1)
114
+ aa_array = self.aa_array
115
+ out_hash = {}
116
+ sdrm = sdrm_hash(option)
117
+ aa_length = aa_array.size
118
+ end_aa = start_aa + aa_length - 1
119
+ (start_aa..end_aa).each do |position|
120
+ array_position = position - start_aa
121
+ if sdrm.keys.include?(position)
122
+ wt_aa = sdrm[position][0]
123
+ test_aa = aa_array[array_position]
124
+ if test_aa.size == 1
125
+ unless wt_aa == test_aa
126
+ if sdrm[position][1].include?(test_aa)
127
+ out_hash[position] = [wt_aa,test_aa]
128
+ end
129
+ end
130
+ else
131
+ test_aa_array = test_aa.split("")
132
+ if (test_aa_array & sdrm[position][1])
133
+ out_hash[position] = [wt_aa,test_aa]
134
+ end
135
+ end
201
136
  end
202
137
  end
203
- end
138
+ return out_hash
139
+ end # end of #hcv_ns5a
204
140
 
205
- bases_to_aa.each do |base|
206
- case base
207
- when /^TT[TCY]$/
208
- aa = "F"
209
- when /^TT[AGR]$/
210
- aa = "L"
211
- when /^CT.$/
212
- aa = "L"
213
- when /^AT[TCAHYWM]$/
214
- aa = "I"
215
- when "ATG"
216
- aa = "M"
217
- when /^GT.$/
218
- aa = "V"
219
- when /^TC.$/
220
- aa = "S"
221
- when /^CC.$/
222
- aa = "P"
223
- when /^AC.$/
224
- aa = "T"
225
- when /^GC.$/
226
- aa = "A"
227
- when /^TA[TCY]$/
228
- aa = "Y"
229
- when /^TA[AGR]$/
230
- aa = "*"
231
- when /^T[GR]A$/
232
- aa = "*"
233
- when /^CA[TCY]$/
234
- aa = "H"
235
- when /^CA[AGR]$/
236
- aa = "Q"
237
- when /^AA[TCY]$/
238
- aa = "N"
239
- when /^AA[AGR]$/
240
- aa = "K"
241
- when /^GA[TCY]$/
242
- aa = "D"
243
- when /^GA[AGR]$/
244
- aa = "E"
245
- when /^TG[TCY]$/
246
- aa = "C"
247
- when "TGG"
248
- aa = "W"
249
- when /^CG.$/
250
- aa = "R"
251
- when /^AG[TCY]$/
252
- aa = "S"
253
- when /^[AM]G[AGR]$/
254
- aa = "R"
255
- when /^GG.$/
256
- aa = "G"
257
- when /^[ATW][CGS][CTY]$/
258
- aa = "S"
259
- when /^[TCY]T[AGR]$/
260
- aa = "L"
261
- else
262
- aa = "-"
263
- end
264
- aa_list << aa
265
- end
266
- aa_out = aa_list.uniq.join('/')
267
- return aa_out
268
- end
269
-
270
- # parse ambiguity bases, aka %w{W S M K R Y B D H V N}
271
-
272
- def self.to_list(base = "")
273
- list = []
274
- case base
275
- when /[A|T|C|G]/
276
- list << base
277
- when "W"
278
- list = ['A','T']
279
- when "S"
280
- list = ['C','G']
281
- when "M"
282
- list = ['A','C']
283
- when 'K'
284
- list = ['G','C']
285
- when 'R'
286
- list = ['A','G']
287
- when 'Y'
288
- list = ['C','T']
289
- when 'B'
290
- list = ['C','G','T']
291
- when 'D'
292
- list = ['A','G','T']
293
- when 'H'
294
- list = ['A','C','T']
295
- when 'V'
296
- list = ['A','C','G']
297
- when 'N'
298
- list = ['A','T','C','G']
299
- end
300
- return list
301
- end
302
-
303
- # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
304
- # collapse sequence hash to unique sequence hash.
305
- # input_sequence_hash is a sequence hash {:name => :sequence, ...}
306
- # master_sequence_tag is the master tag for unique sequences
307
- # sequences will be named as (master_sequence_tag + "_" + Integer)
308
-
309
- def self.uniq_sequence_hash(seq = {}, sequence_name = "sequence")
310
- uni = ViralSeq.count(seq.values)
311
- new_seq = {}
312
- n = 1
313
- uni.each do |s,c|
314
- name = ">" + sequence_name + "_" + n.to_s + "_" + c.to_s
315
- new_seq[name] = s
316
- n += 1
317
- end
318
- return new_seq
319
- end
320
-
321
- # input a sequence hash, return a sequence hash with stop codons.
322
- def self.stop_codon_seq_hash(seq_hash, rf = 0)
323
- out_seq_hash = {}
324
- seq_hash.each do |k,v|
325
- sequence = Sequence.new(k,v)
326
- sequence.get_aa_array(rf)
327
- if sequence.aa_array.include?("*")
328
- out_seq_hash[k] = v
141
+ # HIV sequence locator function, resembling HIV Sequence Locator from LANL
142
+ # # current version only supports nucleotide sequence, not for amino acid sequence.
143
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
144
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
145
+ # @return [Array] an array of the following info
146
+ # # start_location (Integer)
147
+ # # end_location (Integer)
148
+ # # percentage_of_similarity_to_reference_sequence (Float)
149
+ # # containing_indel? (Boolean)
150
+ # # aligned_input_sequence (String)
151
+ # # aligned_reference_sequence (String)
152
+ # @example identify the location of the input sequence on the NL43 genome
153
+ # sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
154
+ # s = ViralSeq::Sequence.new('my_sequence', sequence)
155
+ # loc = s.locator(:NL43)
156
+ # h = ViralSeq::SeqHash.new; h.dna_hash['NL43'] = loc[5]; h.dna_hash[s.name] = loc[4]
157
+ # rs_string = h.to_rsphylip.split("\n")[1..-1].join("\n") # get a relaxed phylip format string for display of alignment.
158
+ # puts "The input sequence \"#{s.name}\" is located on the NL43 nt sequence from #{loc[0].to_s} to #{loc[1].to_s}.\nIt is #{loc[2].to_s}% similar to the reference.\nIt #{loc[3]? "does" : "does not"} have indels.\nThe alignment is\n#{rs_string}"
159
+ # => The input sequence "my_sequence" is located on the NL43 nt sequence from 2333 to 2433.
160
+ # => It is 98.0% similar to the reference.
161
+ # => It does not have indels.
162
+ # => The alignment is
163
+ # => NL43 AGCAGATGAT ACAGTATTAG AAGAAATGAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAGTATGAT C
164
+ # => my_sequence AGCAGATGAT ACAGTATTAG AAGAAATAAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAATATGAT C
165
+ # @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
166
+
167
+ def locator(ref_option = :HXB2, path_to_muscle = false)
168
+ seq = self.dna
169
+ ori_ref = ViralSeq::RefSeq.get(ref_option)
170
+
171
+ begin
172
+ ori_ref_l = ori_ref.size
173
+ l1 = 0
174
+ l2 = 0
175
+
176
+ aln_seq = ViralSeq::Muscle.align(ori_ref, seq, path_to_muscle)
177
+ aln_test = aln_seq[1]
178
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
179
+ gap_begin = $1.size
180
+ gap_end = $3.size
181
+ aln_test2 = $2
182
+ ref = aln_seq[0]
183
+ ref = ref[gap_begin..(-gap_end-1)]
184
+ ref_size = ref.size
185
+ if ref_size > 1.3*(seq.size)
186
+ l1 = l1 + gap_begin
187
+ l2 = l2 + gap_end
188
+ max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
189
+ aln_test2 =~ /#{max_seq}/
190
+ before_aln_seq = $`
191
+ before_aln = $`.size
192
+ post_aln_seq = $'
193
+ post_aln = $'.size
194
+ before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
195
+ b1 = (1.3 * before_aln_seq_size).to_i
196
+ post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
197
+ b2 = (1.3 * post_aln_seq_size).to_i
198
+ if (before_aln > seq.size) and (post_aln <= seq.size)
199
+ ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
200
+ l1 = l1 + (before_aln - b1)
201
+ elsif (post_aln > seq.size) and (before_aln <= seq.size)
202
+ ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
203
+ l2 = l2 + post_aln - b2
204
+ elsif (post_aln > seq.size) and (before_aln > seq.size)
205
+ ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
206
+ l1 = l1 + (before_aln - b1)
207
+ l2 = l2 + (post_aln - b2)
208
+ end
209
+
210
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
211
+ aln_test = aln_seq[1]
212
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
213
+ gap_begin = $1.size
214
+ gap_end = $3.size
215
+ ref = aln_seq[0]
216
+ ref = ref[gap_begin..(-gap_end-1)]
217
+ end
218
+
219
+ aln_test = aln_seq[1]
220
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
221
+ gap_begin = $1.size
222
+ gap_end = $3.size
223
+ aln_test = $2
224
+ aln_test =~ /^(\w+)(\-*)\w/
225
+ s1 = $1.size
226
+ g1 = $2.size
227
+ aln_test =~ /\w(\-*)(\w+)$/
228
+ s2 = $2.size
229
+ g2 = $1.size
230
+
231
+ l1 = l1 + gap_begin
232
+ l2 = l2 + gap_end
233
+ repeat = 0
234
+
235
+ if g1 == g2 and (s1 + g1 + s2) == ref.size
236
+ if s1 > s2 and g2 > 2*s2
237
+ ref = ref[0..(-g2-1)]
238
+ repeat = 1
239
+ l2 = l2 + g2
240
+ elsif s1 < s2 and g1 > 2*s1
241
+ ref = ref[g1..-1]
242
+ repeat = 1
243
+ l1 = l1 + g1
244
+ end
245
+ else
246
+ if g1 > 2*s1
247
+ ref = ref[g1..-1]
248
+ repeat = 1
249
+ l1 = l1 + g1
250
+ end
251
+ if g2 > 2*s2
252
+ ref = ref[0..(-g2 - 1)]
253
+ repeat = 1
254
+ l2 = l2 + g2
255
+ end
256
+ end
257
+
258
+ while repeat == 1
259
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
260
+ aln_test = aln_seq[1]
261
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
262
+ gap_begin = $1.size
263
+ gap_end = $3.size
264
+ aln_test = $2
265
+ aln_test =~ /^(\w+)(\-*)\w/
266
+ s1 = $1.size
267
+ g1 = $2.size
268
+ aln_test =~ /\w(\-*)(\w+)$/
269
+ s2 = $2.size
270
+ g2 = $1.size
271
+ ref = aln_seq[0]
272
+ ref = ref[gap_begin..(-gap_end-1)]
273
+ l1 = l1 + gap_begin
274
+ l2 = l2 + gap_end
275
+ repeat = 0
276
+ if g1 > 2*s1
277
+ ref = ref[g1..-1]
278
+ repeat = 1
279
+ l1 = l1 + g1
280
+ end
281
+ if g2 > 2*s2
282
+ ref = ref[0..(-g2 - 1)]
283
+ repeat = 1
284
+ l2 = l2 + g2
285
+ end
286
+ end
287
+ ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
288
+
289
+
290
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
291
+ aln_test = aln_seq[1]
292
+ ref = aln_seq[0]
293
+
294
+ #refine alignment
295
+
296
+ if ref =~ /^(\-+)/
297
+ l1 = l1 - $1.size
298
+ elsif ref =~ /(\-+)$/
299
+ l2 = l2 + $1.size
300
+ end
301
+
302
+ if (ori_ref_l - l2 - 1) >= l1
303
+ ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
304
+ aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
305
+ aln_test = aln_seq[1]
306
+ ref = aln_seq[0]
307
+
308
+ ref_size = ref.size
309
+ sim_count = 0
310
+ (0..(ref_size-1)).each do |n|
311
+ ref_base = ref[n]
312
+ test_base = aln_test[n]
313
+ sim_count += 1 if ref_base == test_base
314
+ end
315
+ similarity = (sim_count/ref_size.to_f*100).round(1)
316
+
317
+ loc_p1 = l1 + 1
318
+ loc_p2 = ori_ref_l - l2
319
+ if seq.size != (loc_p2 - loc_p1 + 1)
320
+ indel = true
321
+ elsif aln_test.include?("-")
322
+ indel = true
323
+ else
324
+ indel = false
325
+ end
326
+ return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
327
+ else
328
+ return [0,0,0,0,0,0,0]
329
+ end
330
+ rescue => e
331
+ puts "Unexpected error occured."
332
+ puts "Exception Class: #{ e.class.name }"
333
+ puts "Exception Message: #{ e.message }"
334
+ puts "Exception Backtrace: #{ e.backtrace[0] }"
335
+ puts "ViralSeq.sequence_locator returns nil"
336
+ return nil
329
337
  end
330
- end
331
- return out_seq_hash
332
- end
333
-
334
- end
335
-
336
- # functions added to Class::String for direct operation on sequence if it is a String object
337
- # String.rc
338
- # # reverse complement
339
- # # example
340
- # "ACAGA".rc
341
- # => "TCTGT"
342
- #
343
- # String.mutation(error_rate)
344
- # # mutate a nt sequence (String class) randomly
345
- # # must define error rate, default value 0.01, aka 1%
346
- # =USAGE
347
- # # example
348
- # seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
349
- # seq.mutation(0.05)
350
- # => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
351
- #
352
- # String.nt_parser
353
- # # parse the nucleotide sequences as a String object and return a Regexp object for possible matches
354
- # =USAGE
355
- # "ATRWCG".nt_parser
356
- # => /AT[A|G][A|T]CG/
357
-
358
- class String
359
- # direct function of calling reverse complement on String class
360
- def rc
361
- self.reverse.tr("ACTG","TGAC")
362
- end
363
-
364
- def mutation(error_rate = 0.01)
365
- new_string = ""
366
- self.split("").each do |nt|
367
- pool = ["A","C","T","G"]
368
- pool.delete(nt)
369
- s = error_rate * 10000
370
- r = rand(10000)
371
- if r < s
372
- nt = pool.sample
338
+ end # end of locator
339
+
340
+ # Given start and end positions on the reference genome, return a sub-sequence of the target sequence in that range
341
+ # @param p1 [Integer] start position number on the reference genome
342
+ # @param p2 [Integer] end position number on the reference genome
343
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
344
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
345
+ # @return [ViralSeq::Sequence, nil] a new ViralSeq::Sequence object trimmed to the input range on the reference genome, or nil
345
+ # if either the start or end position is beyond the range of the target sequence.
347
+ # @example trim a sequence to fit in the range of [2333, 2433] on the HXB2 nt reference
348
+ # seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
349
+ # s = ViralSeq::Sequence.new('my_seq', seq)
350
+ # s.sequence_clip(2333, 2433, :HXB2).dna
351
+ # => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
352
+
353
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  # As used below, the locator result carries the first/last reference
  # positions covered by this sequence at [0]/[1], and two gap-padded ("-")
  # strings at [4] (this sequence) and [5] (the reference).
  # NOTE(review): the meaning of the remaining fields is not visible here.
  hit = locator(ref_option, path_to_muscle)
  ref_pos_head = hit[0]
  ref_pos_tail = hit[1]

  # Both comparisons are evaluated before deciding, then the request is
  # rejected unless it lies fully inside the located range.
  starts_inside = p1 >= ref_pos_head
  ends_inside = p2 <= ref_pos_tail
  return nil unless starts_inside && ends_inside

  aligned_seq = hit[4]
  aligned_ref = hit[5]

  # Count the columns to drop from the left: walk the reference row until the
  # running reference coordinate reaches p1; gap columns do not advance it.
  skip_head = 0
  aligned_ref.each_char do |column|
    break if ref_pos_head == p1
    skip_head += 1
    ref_pos_head += 1 unless column == "-"
  end

  # Same walk from the right. The counter starts at 1 so that it can be used
  # directly as a negative end index (-1 is the last column).
  skip_tail = 1
  aligned_ref.chars.reverse_each do |column|
    break if ref_pos_tail == p2
    skip_tail += 1
    ref_pos_tail -= 1 unless column == "-"
  end

  clipped = aligned_seq[skip_head..-skip_tail].tr("-", "")
  ViralSeq::Sequence.new(name, clipped)
end
376
- return new_string
377
- end
378
-
379
- def nt_parser
380
- match = ""
381
- self.each_char.each do |base|
382
- base_array = ViralSeq.to_list(base)
383
- if base_array.size == 1
384
- match += base_array[0]
377
+
378
+ # start of private functions
379
+ private
380
+
381
+ # generate amino acid abbreviations from 3 bases, ambiguity will return "#"
382
# Translate a single codon into its one-letter amino acid code.
#
# IUPAC ambiguity codes are accepted as long as every expansion of the codon
# encodes the same residue (e.g. "YTR" is always Leu, "MGR" is always Arg,
# "TRA" is always a stop). Anything else yields "#".
#
# The previous catch-all /^[ATW][CGS][CTY]$/ => "S" has been removed: every
# codon that could reach it (ASY, TSY, WCY, WGY, WSY) also expands to Thr
# and/or Cys codons, so reporting Ser contradicted the "ambiguity returns #"
# contract.
#
# @param bases [String] a three-character codon
# @return [String] one-letter amino acid, "*" for a stop codon, or "#" when
#   the codon is ambiguous or not recognised
def amino_acid(bases)
  # A "." in the third position accepts any character there (four-fold
  # degenerate codon families).
  case bases
  when /^TT[TCY]$/ then "F"
  when /^TT[AGR]$/, /^CT.$/, /^[TCY]T[AGR]$/ then "L"
  when /^AT[TCAHYWM]$/ then "I"
  when "ATG" then "M"
  when /^GT.$/ then "V"
  when /^TC.$/, /^AG[TCY]$/ then "S"
  when /^CC.$/ then "P"
  when /^AC.$/ then "T"
  when /^GC.$/ then "A"
  when /^TA[TCY]$/ then "Y"
  when /^TA[AGR]$/, /^T[GR]A$/ then "*"
  when /^CA[TCY]$/ then "H"
  when /^CA[AGR]$/ then "Q"
  when /^AA[TCY]$/ then "N"
  when /^AA[AGR]$/ then "K"
  when /^GA[TCY]$/ then "D"
  when /^GA[AGR]$/ then "E"
  when /^TG[TCY]$/ then "C"
  when "TGG" then "W"
  when /^CG.$/, /^[AM]G[AGR]$/ then "R"
  when /^GG.$/ then "G"
  else "#"
  end
end # end of amino_acid
442
+
443
+ # keep ambiguities, return all possible amino acids.
444
+
445
# Translate a codon that may contain ambiguity codes into every amino acid it
# can encode. Each of the three positions is expanded with String#to_list
# (defined elsewhere; expected to return an Array of single-base Strings), all
# combinations are translated, and the distinct results are concatenated in
# first-seen order. Codons that match no rule contribute "-".
def amino_acid_2(bases)
  first_choices = bases[0].to_list
  second_choices = bases[1].to_list
  third_choices = bases[2].to_list

  # Cartesian product, first position varying slowest.
  codons = first_choices.product(second_choices, third_choices).map do |b1, b2, b3|
    b1 + b2 + b3
  end

  residues = codons.map do |codon|
    # Branch order matters: the late /^[ATW][CGS][CTY]$/ rule overlaps several
    # earlier ones and must stay behind them.
    case codon
    when /^TT[TCY]$/ then "F"
    when /^TT[AGR]$/, /^CT.$/ then "L"
    when /^AT[TCAHYWM]$/ then "I"
    when "ATG" then "M"
    when /^GT.$/ then "V"
    when /^TC.$/ then "S"
    when /^CC.$/ then "P"
    when /^AC.$/ then "T"
    when /^GC.$/ then "A"
    when /^TA[TCY]$/ then "Y"
    when /^TA[AGR]$/, /^T[GR]A$/ then "*"
    when /^CA[TCY]$/ then "H"
    when /^CA[AGR]$/ then "Q"
    when /^AA[TCY]$/ then "N"
    when /^AA[AGR]$/ then "K"
    when /^GA[TCY]$/ then "D"
    when /^GA[AGR]$/ then "E"
    when /^TG[TCY]$/ then "C"
    when "TGG" then "W"
    when /^CG.$/ then "R"
    when /^AG[TCY]$/ then "S"
    when /^[AM]G[AGR]$/ then "R"
    when /^GG.$/ then "G"
    when /^[ATW][CGS][CTY]$/ then "S"
    when /^[TCY]T[AGR]$/ then "L"
    else "-"
    end
  end

  residues.uniq.join
end # end of #amino_acid_2
530
+
531
+ # sdrm position hash
532
# Build the mutation lookup table for one target region.
# Each entry maps a position number to a two-element array: the first element
# is a one-letter residue and the second is a list of one-letter residues.
# NOTE(review): presumably [wild type, [resistance-associated substitutions]]
# at that amino acid position -- confirm against the callers.
# A fresh Hash is built on every call; an unknown option raises a RuntimeError.
def sdrm_hash(options)
  case options
  when :hcv_ns5a
    {
      28 => ['M', ['T']],
      30 => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
      31 => ['L', ['M', 'V', 'F']],
      32 => ['P', ['L']],
      44 => ['K', ['R']],
      58 => ['H', ['D', 'P', 'S']],
      64 => ['T', ['A', 'S']],
      77 => ['P', ['A', 'S']],
      78 => ['R', ['K']],
      79 => ['T', ['A']],
      83 => ['T', ['M']],
      85 => ['S', ['N', 'H', 'Y']],
      92 => ['A', ['P', 'T', 'K', 'E']],
      93 => ['Y', ['C', 'F', 'H', 'N']],
      107 => ['K', ['T', 'S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }
  when :nrti
    {
      41 => ['M', ['L']],
      65 => ['K', ['R']],
      67 => ['D', ['N', 'G', 'E']],
      69 => ['T', ['D']],
      70 => ['K', ['R', 'E']],
      74 => ['L', ['V', 'I']],
      75 => ['V', ['M', 'T', 'A', 'S']],
      77 => ['F', ['L']],
      115 => ['Y', ['F']],
      116 => ['F', ['Y']],
      151 => ['Q', ['M']],
      184 => ['M', ['V', 'I']],
      210 => ['L', ['W']],
      215 => ['T', ['Y', 'F', 'I', 'C', 'D', 'V', 'E']],
      219 => ['K', ['Q', 'E', 'N', 'R']]
    }
  when :nnrti
    {
      100 => ['L', ['I']],
      101 => ['K', ['E', 'P']],
      103 => ['K', ['N', 'S']],
      106 => ['V', ['M', 'A']],
      179 => ['V', ['F', 'D']],
      181 => ['Y', ['C', 'I', 'V']],
      188 => ['Y', ['L', 'H', 'C']],
      190 => ['G', ['A', 'S', 'E']],
      225 => ['P', ['H']],
      230 => ['M', ['L']]
    }
  when :hiv_pr
    {
      23 => ['L', ['I']],
      24 => ['L', ['I']],
      30 => ['D', ['N']],
      32 => ['V', ['I']],
      46 => ['M', ['I', 'L']],
      47 => ['I', ['V', 'A']],
      48 => ['G', ['V', 'M']],
      50 => ['I', ['V', 'L']],
      53 => ['F', ['L']],
      54 => ['I', ['V', 'L', 'M', 'T', 'A', 'S']],
      73 => ['G', ['S', 'T', 'C', 'A']],
      76 => ['L', ['V']],
      82 => ['V', ['A', 'T', 'S', 'F', 'L', 'C', 'M']],
      83 => ['N', ['D']],
      84 => ['I', ['V', 'A', 'C']],
      88 => ['N', ['D', 'S']],
      90 => ['L', ['M']]
    }
  when :hiv_in
    {
      66 => ['T', ['A', 'I', 'K']],
      74 => ['L', ['M']],
      92 => ['E', ['Q']],
      95 => ['Q', ['K']],
      97 => ['T', ['A']],
      121 => ['F', ['Y']],
      140 => ['G', ['A', 'S', 'C']],
      143 => ['Y', ['C', 'H', 'R']],
      147 => ['S', ['G']],
      148 => ['Q', ['H', 'K', 'R']],
      155 => ['N', ['S', 'H']]
    }
  else
    raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
  end
end
390
- Regexp.new match
391
- end
392
- end
614
+ end # end of ViralSeq::Sequence
615
+ end # end of ViralSeq