viral_seq 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
@@ -0,0 +1,219 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
# Class for paired-end sequences.
|
5
|
+
# @example initialize a new SeqHashPair object from a directory containing paired-end sequences
|
6
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.fa('my_seq_directory')
|
7
|
+
# @example join the paired-end sequences with an overlap of 100 bp
|
8
|
+
# my_seqhashpair.join1(100)
|
9
|
+
# @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
|
10
|
+
# my_seqhashpair.join2(:indiv)
|
11
|
+
|
12
|
+
class SeqHashPair
|
13
|
+
|
14
|
+
# initialize SeqHashPair object with @dna_hash, @title and @file
|
15
|
+
|
16
|
+
# Build a SeqHashPair from read pairs that are already in memory.
#
# @param dna_hash [Hash] :name => [:r1_sequence_string, :r2_sequence_string]
# @param title [String] label for this set of read pairs
# @param file [Array] the files the pairs were read from; empty by default
def initialize(dna_hash = {}, title = "", file = [])
  @dna_hash = dna_hash
  @title = title
  @file = file
end
|
21
|
+
|
22
|
+
# @return [Hash] Hash object for :name => [:r1_sequence_string, :r2_sequence_string]
|
23
|
+
|
24
|
+
attr_accessor :dna_hash
|
25
|
+
|
26
|
+
# @return [String] the title of the SeqHashPair object.
|
27
|
+
# defaults to the directory basename if the SeqHashPair object is initialized using ::fa
|
28
|
+
|
29
|
+
attr_accessor :title
|
30
|
+
|
31
|
+
# @return [Array<String>] the r1 and r2 files that are used to initialize the SeqHashPair object, if they exist
|
32
|
+
|
33
|
+
attr_accessor :file
|
34
|
+
|
35
|
+
# initialize a new ViralSeq::SeqHashPair object from a directory containing paired sequence files in the FASTA format
|
36
|
+
# @param indir [String] directory containing paired sequence files in the FASTA format,
|
37
|
+
#
|
38
|
+
# Paired sequence files need to have "r1" and "r2" in their file names
|
39
|
+
#
|
40
|
+
# Example for the file structure
|
41
|
+
# ├───lib1
|
42
|
+
# │ lib1_r1.txt
|
43
|
+
# │ lib1_r2.txt
|
44
|
+
# The sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
|
45
|
+
# @return [ViralSeq::SeqHashPair] new SeqHashPair object from the paired FASTA sequence files
|
46
|
+
# @example initialize a new SeqHashPair object from a directory containing paired-end sequences
|
47
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.fa('spec/sample_paired_seq')
|
48
|
+
|
49
|
+
def self.new_from_fasta(indir)
  # Pick the R1 and R2 files by the "r1"/"r2" marker (case-insensitive) in
  # their base names; when several files match, the last one listed wins.
  r1_file = ""
  r2_file = ""
  Dir[indir + "/*"].each do |path|
    case File.basename(path)
    when /r1/i then r1_file = path
    when /r2/i then r2_file = path
    end
  end

  r1_raw = ViralSeq::SeqHash.fa(r1_file).dna_hash
  r2_raw = ViralSeq::SeqHash.fa(r2_file).dna_hash

  # Drop the last three characters of every name so the R1 and R2 reads of
  # the same template share one key.
  strip_read_tag = lambda do |reads|
    reads.each_with_object({}) { |(name, bases), trimmed| trimmed[name[0..-4]] = bases }
  end
  r1_reads = strip_read_tag.call(r1_raw)
  r2_reads = strip_read_tag.call(r2_raw)

  # Pairs are driven by the R1 names; an R1 read without a partner gets nil
  # in the R2 slot.
  pairs = r1_reads.each_with_object({}) do |(name, r1_bases), paired|
    paired[name] = [r1_bases, r2_reads[name]]
  end

  paired_set = ViralSeq::SeqHashPair.new
  paired_set.dna_hash = pairs
  paired_set.title = File.basename(indir, ".*")
  paired_set.file = [r1_file, r2_file]
  paired_set
end # end of .new_from_fasta

class << self
  alias_method :fa, :new_from_fasta
end
|
82
|
+
|
83
|
+
# Pair-end join function for KNOWN overlap size.
|
84
|
+
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
85
|
+
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
|
86
|
+
# @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
|
87
|
+
# @example join paired-end sequences with different :diff cut-offs, overlap provided.
|
88
|
+
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
89
|
+
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
90
|
+
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
91
|
+
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
92
|
+
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
93
|
+
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
94
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seqs)
|
95
|
+
# my_seqhashpair.join1(100).dna_hash.keys
|
96
|
+
# => [">pair1"]
|
97
|
+
# my_seqhashpair.join1(100,0.01).dna_hash.keys
|
98
|
+
# => [">pair1", ">pair2"]
|
99
|
+
# my_seqhashpair.join1(100,0.02).dna_hash.keys
|
100
|
+
# => [">pair1", ">pair2", ">pair3"]
|
101
|
+
|
102
|
+
def join1(overlap = 0, diff = 0.0)
  seq_pair_hash = self.dna_hash
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)

  # Largest number of mismatching positions tolerated inside the overlap.
  max_mismatch = overlap * diff
  joined_seq = {}
  seq_pair_hash.each do |seq_name, seq_pair|
    r1_seq = seq_pair[0]
    r2_seq = seq_pair[1]
    # A pair with a missing mate cannot be joined; drop it instead of letting
    # one incomplete pair abort the whole run.
    next if r1_seq.nil? || r2_seq.nil?

    if overlap.zero?
      joined_seq[seq_name] = r1_seq + r2_seq
      next
    end

    # A read shorter than the requested overlap cannot contain it. Slicing
    # such a read yields nil, so treat the pair like any other pair that
    # fails the overlap check and leave it out of the result.
    next if r1_seq.size < overlap || r2_seq.size < overlap
    next unless r1_seq[-overlap..-1].compare_with(r2_seq[0, overlap]) <= max_mismatch

    joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
  end

  joined_seq_hash = ViralSeq::SeqHash.new
  joined_seq_hash.dna_hash = joined_seq
  joined_seq_hash.title = self.title + "_joined"
  joined_seq_hash.file = File.dirname(self.file[0]) if self.file.size > 0
  return joined_seq_hash
end # end of join1
|
124
|
+
|
125
|
+
|
126
|
+
# Pair-end join function for UNKNOWN overlap.
|
127
|
+
# @param model [Symbol] models used to determine the overlap, `:con`, `:indiv`
|
128
|
+
#
|
129
|
+
# model `:con`: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
130
|
+
#
|
131
|
+
# note: minimal overlap as 4 bases.
|
132
|
+
# model `:indiv`: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
133
|
+
# @param diff (see #join1)
|
134
|
+
# @return (see #join1)
|
135
|
+
# @example join paired-end sequences, overlap NOT provided
|
136
|
+
# paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
|
137
|
+
# ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
|
138
|
+
# ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"] }
|
139
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
|
140
|
+
# my_seqhashpair.join2.dna_hash
|
141
|
+
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
142
|
+
# my_seqhashpair.join2(:indiv).dna_hash
|
143
|
+
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
144
|
+
|
145
|
+
def join2(model = :con, diff = 0.0)
  # :con delegates to join1 with one overlap shared by all pairs; :indiv picks
  # the largest acceptable overlap for each pair and falls back to plain
  # concatenation when a pair has none.
  pairs = self.dna_hash
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.")
  end

  return self.join1(determine_overlap_pid_pair(pairs, diff), diff) if model == :con

  unless model == :indiv
    raise ArgumentError.new("Error::Wrong Overlap Model Argument. Given \`#{model}\`, expected `:con` or `:indiv`.")
  end

  merged = {}
  pairs.each do |seq_name, seq_pair|
    # Keep only the overlap sizes whose mismatch count is within the allowance.
    fitting = overlap_matrix(seq_pair[0], seq_pair[1]).select do |size, mismatches|
      mismatches <= size * diff
    end
    if fitting.empty?
      merged[seq_name] = seq_pair[0] + seq_pair[1]
    else
      best = fitting.keys.max
      merged[seq_name] = seq_pair[0] + seq_pair[1][best..-1]
    end
  end

  result = ViralSeq::SeqHash.new
  result.dna_hash = merged
  result.title = self.title + "_joined"
  result.file = File.dirname(self.file[0]) if self.file.size > 0
  result
rescue ArgumentError => e
  # Argument problems are printed and signalled with nil instead of being raised.
  puts e
  nil
end # end of join2
|
180
|
+
|
181
|
+
private
|
182
|
+
# determine overlap size from @dna_hash
|
183
|
+
# Pick the one overlap size that best describes a whole set of read pairs.
# For each pair the largest overlap whose mismatch count stays within
# `overlap * diff` is taken (0 when none qualifies). The size seen in the most
# pairs wins, and ties go to the larger size.
#
# @param seq_pair_hash [Hash] :name => [:r1_sequence_string, :r2_sequence_string]
# @param diff [Integer, Float] maximum mismatch rate tolerated inside the overlap
# @return [Integer] the consensus overlap size; 0 when there are no pairs
def determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
  # With no pairs there is nothing to vote on. Returning 0 ("no overlap")
  # keeps the result an Integer instead of handing nil to the caller.
  return 0 if seq_pair_hash.empty?

  overlaps = seq_pair_hash.map do |_seq_name, seq_pair|
    passing = overlap_matrix(seq_pair[0], seq_pair[1]).select do |overlap, diff_nt|
      diff_nt <= overlap * diff
    end
    passing.empty? ? 0 : passing.keys.max
  end

  count_overlaps = overlaps.count_freq
  max_value = count_overlaps.values.max
  count_overlaps.select { |_overlap, counts| counts == max_value }.keys.max
end # end of determine_overlap_pid_pair
|
204
|
+
|
205
|
+
# input a pair of sequences as String, return a Hash object of overlapping Hash object
|
206
|
+
# {:overlap_size => number_of_different_positions, ...}
|
207
|
+
# {minimal overlap set to 4. }
|
208
|
+
# Score every candidate overlap between the tail of one read and the head of
# the other.
#
# @param sequence1 [String] read whose last `overlap` characters are compared
# @param sequence2 [String] read whose first `overlap` characters are compared
# @return [Hash] :overlap_size => number_of_different_positions, for every size
#   from 4 up to the length of the shorter read; empty when the shorter read
#   has fewer than 4 characters
def overlap_matrix(sequence1, sequence2)
  min_overlap = 4
  # An overlap cannot be longer than the shorter read. Bounding by the longer
  # read made `sequence1[-overlap..-1]` nil whenever sequence1 was the shorter
  # one, which raised NoMethodError on the comparison.
  max_overlap = [sequence1.size, sequence2.size].min
  (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
    matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
  end
end # end of overlap_matrix
|
217
|
+
|
218
|
+
end # end of SeqHashPair
|
219
|
+
end # end of ViralSeq
|
data/lib/viral_seq/sequence.rb
CHANGED
@@ -1,392 +1,615 @@
|
|
1
|
-
# lib/sequence.rb
|
2
|
-
# Includes functions for sequence operations
|
3
|
-
# Including methods as:
|
4
|
-
# ViralSeq::AMINO_ACID_LIST
|
5
|
-
# ViralSeq::Sequence
|
6
|
-
# ViralSeq::Sequence#rev_complement
|
7
|
-
# ViralSeq::Sequence#get_aa_sequence
|
8
|
-
# ViralSeq::Sequence#get_aa_array
|
9
|
-
# ViralSeq::Sequence#name
|
10
|
-
# ViralSeq::Sequence#dna_sequence
|
11
|
-
# ViralSeq::Sequence#aa_sequence
|
12
|
-
# ViralSeq::Sequence#aa_array
|
13
|
-
# ViralSeq::amino_acid
|
14
|
-
# ViralSeq::amino_acid_2
|
15
|
-
# ViralSeq::to_list
|
16
|
-
# ViralSeq::uniq_sequence_hash
|
17
|
-
# ViralSeq::stop_codon_seq_hash
|
18
|
-
# String#rc
|
19
|
-
# String#mutation
|
20
|
-
# String#nt_parser
|
21
|
-
|
22
|
-
# ViralSeq::AMINO_ACID_LIST
|
23
|
-
# # Array of all amino acid one letter abbreviations
|
24
|
-
|
25
|
-
# ViralSeq::Sequence
|
26
|
-
# # Sequence class
|
27
|
-
# =USAGE
|
28
|
-
# # create a sequence object
|
29
|
-
# seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
|
30
|
-
#
|
31
|
-
# # print dna sequence
|
32
|
-
# puts seq.dna_sequence
|
33
|
-
#
|
34
|
-
# # reserce complement sequence of DNA sequence, return as a string
|
35
|
-
# seq.rev_complement
|
36
|
-
#
|
37
|
-
# # change @dna_sequence to reverse complement DNA sequence
|
38
|
-
# seq.rev_complement!
|
39
|
-
#
|
40
|
-
# # generate amino acid sequences. either return string or array.
|
41
|
-
# # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
|
42
|
-
# # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
|
43
|
-
# seq.get_aa_sequence
|
44
|
-
# # or
|
45
|
-
# seq.get_aa_array
|
46
|
-
#
|
47
|
-
# # print amino acid sequence
|
48
|
-
# puts seq.aa_sequence
|
49
|
-
|
50
|
-
# ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
|
51
|
-
# # collapse sequence hash to unique sequence hash.
|
52
|
-
# # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
|
53
|
-
# # master_sequence_tag is the master tag for unique sequences
|
54
|
-
# # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
|
55
|
-
# =USAGE
|
56
|
-
# sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
|
57
|
-
# '>seq4' => 'CCCC', '>seq5' => 'CCCC',
|
58
|
-
# '>seq6' => 'TTTT' }
|
59
|
-
# uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
|
60
|
-
# => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
|
61
1
|
|
62
2
|
module ViralSeq
|
63
3
|
|
64
|
-
#
|
65
|
-
|
66
|
-
|
67
|
-
#
|
4
|
+
# ViralSeq::Sequence class for sequence operation
|
5
|
+
#
|
6
|
+
# @example create a sequence object
|
7
|
+
# seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
|
8
|
+
# => #<ViralSeq::Sequence:0x00007fd03c8c10b8 @name="my_sequence", @dna="ACCTAGGTTCGGAGC", @aa_string="", @aa_array=[]>
|
9
|
+
#
|
10
|
+
# @example return dna sequence as String
|
11
|
+
# seq.dna
|
12
|
+
# => "ACCTAGGTTCGGAGC"
|
13
|
+
#
|
14
|
+
# @example reverse complement sequence of DNA sequence
|
15
|
+
# seq.rc
|
16
|
+
# => "GCTCCGAACCTAGGT"
|
17
|
+
#
|
18
|
+
# @example change @dna to reverse complement DNA sequence
|
19
|
+
# seq.rc!
|
20
|
+
#
|
21
|
+
# @example translate the DNA sequence, return values for @aa_string and @aa_array
|
22
|
+
# seq = ViralSeq::Sequence.new('my_sequence', 'AWTCGRAGAG')
|
23
|
+
# seq.translate(1)
|
24
|
+
# seq.aa_string
|
25
|
+
# => "##E"
|
26
|
+
# seq.aa_array
|
27
|
+
# => ["IF", "EG", "E"]
|
68
28
|
|
69
29
|
class Sequence
|
30
|
+
# initialize a ViralSeq::Sequence class with sequence name (default as '>sequence')
|
31
|
+
# and DNA sequence as String object
|
70
32
|
def initialize (name = ">sequence",dna_sequence ="")
|
71
33
|
@name = name
|
72
|
-
@
|
73
|
-
@
|
34
|
+
@dna = dna_sequence.upcase
|
35
|
+
@aa_string = ""
|
74
36
|
@aa_array = []
|
75
37
|
end
|
76
38
|
|
77
|
-
|
39
|
+
# @return [String] sequence tag name
|
40
|
+
attr_accessor :name
|
41
|
+
|
42
|
+
# @return [String] DNA sequence
|
43
|
+
attr_accessor :dna
|
44
|
+
|
45
|
+
# @return [String] amino acid sequence
|
46
|
+
attr_accessor :aa_string
|
47
|
+
|
48
|
+
# @return [Array] amino acid sequence as an Array object,
|
49
|
+
# ambiguity dna sequence will be translated in all possible amino acid sequence at the position
|
50
|
+
attr_accessor :aa_array
|
78
51
|
|
52
|
+
# @return [String] reverse complement sequence of the @dna.
|
79
53
|
def rev_complement
|
80
|
-
@
|
54
|
+
@dna.rc
|
81
55
|
end
|
56
|
+
|
57
|
+
# replace the @dna with reverse complement DNA sequence.
|
82
58
|
def rev_complement!
|
83
|
-
@
|
59
|
+
@dna = @dna.rc
|
84
60
|
end
|
85
61
|
|
86
|
-
|
87
|
-
|
88
|
-
|
62
|
+
alias_method :rc, :rev_complement
|
63
|
+
alias_method :rc!, :rev_complement!
|
64
|
+
|
65
|
+
# translate @dna to amino acid sequence.
|
66
|
+
# generate values for @aa_string and @aa_array
|
67
|
+
# @param initial_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
|
68
|
+
|
69
|
+
def translate(initial_position = 0)
|
70
|
+
@aa_string = ""
|
71
|
+
require_sequence = @dna[initial_position..-1]
|
89
72
|
base_array = []
|
90
73
|
require_sequence.each_char {|base| base_array << base}
|
91
74
|
while (base_array.length>=3) do
|
92
75
|
base_3= ""
|
93
76
|
3.times {base_3 += base_array.shift}
|
94
|
-
@
|
77
|
+
@aa_string << amino_acid(base_3)
|
95
78
|
end
|
96
|
-
return @aa_sequence
|
97
|
-
end
|
98
79
|
|
99
|
-
# get amino acid calls, return a array.keep ambiguity calls.
|
100
|
-
def get_aa_array(initial_position = 0)
|
101
80
|
@aa_array = []
|
102
|
-
require_sequence = @
|
81
|
+
require_sequence = @dna[initial_position..-1].tr('-','N')
|
103
82
|
base_array = []
|
104
83
|
require_sequence.each_char {|base| base_array << base}
|
105
84
|
while (base_array.length>=3) do
|
106
85
|
base_3= ""
|
107
86
|
3.times{base_3 += base_array.shift}
|
108
|
-
@aa_array<<
|
87
|
+
@aa_array<< amino_acid_2(base_3)
|
109
88
|
end
|
110
|
-
return @aa_array
|
111
89
|
end
|
90
|
+
|
91
|
+
# @return [Integer] length of DNA sequence
|
112
92
|
def dna_length
|
113
|
-
@
|
93
|
+
@dna.length
|
114
94
|
end
|
95
|
+
|
96
|
+
# @return [Integer] length of amino acid sequence
|
115
97
|
def aa_length
|
116
|
-
@
|
98
|
+
@aa_string.length
|
117
99
|
end
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
return "N"
|
155
|
-
when /^AA[AGR]$/
|
156
|
-
return "K"
|
157
|
-
when /^GA[TCY]$/
|
158
|
-
return "D"
|
159
|
-
when /^GA[AGR]$/
|
160
|
-
return "E"
|
161
|
-
when /^TG[TCY]$/
|
162
|
-
return "C"
|
163
|
-
when "TGG"
|
164
|
-
return "W"
|
165
|
-
when /^CG.$/
|
166
|
-
return "R"
|
167
|
-
when /^AG[TCY]$/
|
168
|
-
return "S"
|
169
|
-
when /^[AM]G[AGR]$/
|
170
|
-
return "R"
|
171
|
-
when /^GG.$/
|
172
|
-
return "G"
|
173
|
-
when /^[ATW][CGS][CTY]$/
|
174
|
-
return "S"
|
175
|
-
when /^[TCY]T[AGR]$/
|
176
|
-
return "L"
|
177
|
-
else
|
178
|
-
return "#"
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
# keep ambiguities, return all possible amino acids.
|
183
|
-
|
184
|
-
def self.amino_acid_2 (bases)
|
185
|
-
bases_to_aa = []
|
186
|
-
aa_list = []
|
187
|
-
base1 = ViralSeq.to_list(bases[0])
|
188
|
-
base2 = ViralSeq.to_list(bases[1])
|
189
|
-
base3 = ViralSeq.to_list(bases[2])
|
190
|
-
l1 = base1.size - 1
|
191
|
-
l2 = base2.size - 1
|
192
|
-
l3 = base3.size - 1
|
193
|
-
(0..l1).each do |n1|
|
194
|
-
b1 = base1[n1]
|
195
|
-
(0..l2).each do |n2|
|
196
|
-
b2 = base2[n2]
|
197
|
-
(0..l3).each do |n3|
|
198
|
-
b3 = base3[n3]
|
199
|
-
bases_all = b1 + b2 + b3
|
200
|
-
bases_to_aa << bases_all
|
100
|
+
|
101
|
+
# resistant mutation interpretation for a chosen region from a translated ViralSeq::Sequence object
|
102
|
+
# @param option [Symbol] option of region to interpret, `:hcv_ns5a`, `:hiv_pr`, `:nrti`, `:nnrti`, `:hiv_in`
|
103
|
+
# @param start_aa [Integer] the starting aa number of the input sequence
|
104
|
+
# @return [Hash] return a Hash object for SDRMs identified. :position => [:wildtype_codon, :mutation_codon]
|
105
|
+
# @example examine an HIV PR region sequence for drug resistance mutations
|
106
|
+
# my_seq_name = 'a_pr_seq'
|
107
|
+
# my_seq = 'CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAAATAGGAGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATAATACAGTATTAGAAGACATGGAGTTACCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATCTGTGGGCATAAAACTACAGGTACAGTGTTAATAGGACCTACACCCGTCAACATAATTGGAAGAGATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTT'
|
108
|
+
# s = ViralSeq::Sequence.new(my_seq_name, my_seq)
|
109
|
+
# s.translate
|
110
|
+
# s.sdrm(:hiv_pr)
|
111
|
+
# => {30=>["D", "N"], 88=>["N", "D"]}
|
112
|
+
|
113
|
+
def sdrm(option, start_aa = 1)
  # Each entry of the mutation list is read as
  # position => [wild_type_residue, [resistance_residues, ...]].
  mutation_list = sdrm_hash(option)
  out_hash = {}

  self.aa_array.each_with_index do |test_aa, array_position|
    position = start_aa + array_position
    next unless mutation_list.key?(position)

    wt_aa = mutation_list[position][0]
    resistant_aas = mutation_list[position][1]
    next if test_aa == wt_aa

    # test_aa holds one letter for an unambiguous codon and several letters
    # for an ambiguous one. Report the position only when at least one of
    # those letters is a listed resistance residue. The intersection has to
    # be tested for emptiness because an empty Array is still truthy.
    next if (test_aa.split("") & resistant_aas).empty?

    out_hash[position] = [wt_aa, test_aa]
  end

  return out_hash
end # end of sdrm
|
204
140
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
141
|
+
# HIV sequence locator function, resembling HIV Sequence Locator from LANL
|
142
|
+
# # current version only supports nucleotide sequence, not for amino acid sequence.
|
143
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
144
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
145
|
+
# @return [Array] an array of the following info
|
146
|
+
# # start_location (Integer)
|
147
|
+
# # end_location (Integer)
|
148
|
+
# # percentage_of_similarity_to_reference_sequence (Float)
|
149
|
+
# # containing_indel? (Boolean)
|
150
|
+
# # aligned_input_sequence (String)
|
151
|
+
# # aligned_reference_sequence (String)
|
152
|
+
# @example identify the location of the input sequence on the NL43 genome
|
153
|
+
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
154
|
+
# s = ViralSeq::Sequence.new('my_sequence', sequence)
|
155
|
+
# loc = s.locator(:NL43)
|
156
|
+
# h = ViralSeq::SeqHash.new; h.dna_hash['NL43'] = loc[5]; h.dna_hash[s.name] = loc[4]
|
157
|
+
# rs_string = h.to_rsphylip.split("\n")[1..-1].join("\n") # get a relaxed phylip format string for display of alignment.
|
158
|
+
# puts "The input sequence \"#{s.name}\" is located on the NL43 nt sequence from #{loc[0].to_s} to #{loc[1].to_s}.\nIt is #{loc[2].to_s}% similar to the reference.\nIt #{loc[3]? "does" : "does not"} have indels.\nThe alignment is\n#{rs_string}"
|
159
|
+
# => The input sequence "my_sequence" is located on the NL43 nt sequence from 2333 to 2433.
|
160
|
+
# => It is 98.0% similar to the reference.
|
161
|
+
# => It does not have indels.
|
162
|
+
# => The alignment is
|
163
|
+
# => NL43 AGCAGATGAT ACAGTATTAG AAGAAATGAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAGTATGAT C
|
164
|
+
# => my_sequence AGCAGATGAT ACAGTATTAG AAGAAATAAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAATATGAT C
|
165
|
+
# @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
|
166
|
+
|
167
|
+
def locator(ref_option = :HXB2, path_to_muscle = false)
|
168
|
+
seq = self.dna
|
169
|
+
ori_ref = ViralSeq::RefSeq.get(ref_option)
|
170
|
+
|
171
|
+
begin
|
172
|
+
ori_ref_l = ori_ref.size
|
173
|
+
l1 = 0
|
174
|
+
l2 = 0
|
175
|
+
|
176
|
+
aln_seq = ViralSeq::Muscle.align(ori_ref, seq, path_to_muscle)
|
177
|
+
aln_test = aln_seq[1]
|
178
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
179
|
+
gap_begin = $1.size
|
180
|
+
gap_end = $3.size
|
181
|
+
aln_test2 = $2
|
182
|
+
ref = aln_seq[0]
|
183
|
+
ref = ref[gap_begin..(-gap_end-1)]
|
184
|
+
ref_size = ref.size
|
185
|
+
if ref_size > 1.3*(seq.size)
|
186
|
+
l1 = l1 + gap_begin
|
187
|
+
l2 = l2 + gap_end
|
188
|
+
max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
|
189
|
+
aln_test2 =~ /#{max_seq}/
|
190
|
+
before_aln_seq = $`
|
191
|
+
before_aln = $`.size
|
192
|
+
post_aln_seq = $'
|
193
|
+
post_aln = $'.size
|
194
|
+
before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
|
195
|
+
b1 = (1.3 * before_aln_seq_size).to_i
|
196
|
+
post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
|
197
|
+
b2 = (1.3 * post_aln_seq_size).to_i
|
198
|
+
if (before_aln > seq.size) and (post_aln <= seq.size)
|
199
|
+
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
|
200
|
+
l1 = l1 + (before_aln - b1)
|
201
|
+
elsif (post_aln > seq.size) and (before_aln <= seq.size)
|
202
|
+
ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
|
203
|
+
l2 = l2 + post_aln - b2
|
204
|
+
elsif (post_aln > seq.size) and (before_aln > seq.size)
|
205
|
+
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
|
206
|
+
l1 = l1 + (before_aln - b1)
|
207
|
+
l2 = l2 + (post_aln - b2)
|
208
|
+
end
|
209
|
+
|
210
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
211
|
+
aln_test = aln_seq[1]
|
212
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
213
|
+
gap_begin = $1.size
|
214
|
+
gap_end = $3.size
|
215
|
+
ref = aln_seq[0]
|
216
|
+
ref = ref[gap_begin..(-gap_end-1)]
|
217
|
+
end
|
218
|
+
|
219
|
+
aln_test = aln_seq[1]
|
220
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
221
|
+
gap_begin = $1.size
|
222
|
+
gap_end = $3.size
|
223
|
+
aln_test = $2
|
224
|
+
aln_test =~ /^(\w+)(\-*)\w/
|
225
|
+
s1 = $1.size
|
226
|
+
g1 = $2.size
|
227
|
+
aln_test =~ /\w(\-*)(\w+)$/
|
228
|
+
s2 = $2.size
|
229
|
+
g2 = $1.size
|
230
|
+
|
231
|
+
l1 = l1 + gap_begin
|
232
|
+
l2 = l2 + gap_end
|
233
|
+
repeat = 0
|
234
|
+
|
235
|
+
if g1 == g2 and (s1 + g1 + s2) == ref.size
|
236
|
+
if s1 > s2 and g2 > 2*s2
|
237
|
+
ref = ref[0..(-g2-1)]
|
238
|
+
repeat = 1
|
239
|
+
l2 = l2 + g2
|
240
|
+
elsif s1 < s2 and g1 > 2*s1
|
241
|
+
ref = ref[g1..-1]
|
242
|
+
repeat = 1
|
243
|
+
l1 = l1 + g1
|
244
|
+
end
|
245
|
+
else
|
246
|
+
if g1 > 2*s1
|
247
|
+
ref = ref[g1..-1]
|
248
|
+
repeat = 1
|
249
|
+
l1 = l1 + g1
|
250
|
+
end
|
251
|
+
if g2 > 2*s2
|
252
|
+
ref = ref[0..(-g2 - 1)]
|
253
|
+
repeat = 1
|
254
|
+
l2 = l2 + g2
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
while repeat == 1
|
259
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
260
|
+
aln_test = aln_seq[1]
|
261
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
262
|
+
gap_begin = $1.size
|
263
|
+
gap_end = $3.size
|
264
|
+
aln_test = $2
|
265
|
+
aln_test =~ /^(\w+)(\-*)\w/
|
266
|
+
s1 = $1.size
|
267
|
+
g1 = $2.size
|
268
|
+
aln_test =~ /\w(\-*)(\w+)$/
|
269
|
+
s2 = $2.size
|
270
|
+
g2 = $1.size
|
271
|
+
ref = aln_seq[0]
|
272
|
+
ref = ref[gap_begin..(-gap_end-1)]
|
273
|
+
l1 = l1 + gap_begin
|
274
|
+
l2 = l2 + gap_end
|
275
|
+
repeat = 0
|
276
|
+
if g1 > 2*s1
|
277
|
+
ref = ref[g1..-1]
|
278
|
+
repeat = 1
|
279
|
+
l1 = l1 + g1
|
280
|
+
end
|
281
|
+
if g2 > 2*s2
|
282
|
+
ref = ref[0..(-g2 - 1)]
|
283
|
+
repeat = 1
|
284
|
+
l2 = l2 + g2
|
285
|
+
end
|
286
|
+
end
|
287
|
+
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
288
|
+
|
289
|
+
|
290
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
291
|
+
aln_test = aln_seq[1]
|
292
|
+
ref = aln_seq[0]
|
293
|
+
|
294
|
+
#refine alignment
|
295
|
+
|
296
|
+
if ref =~ /^(\-+)/
|
297
|
+
l1 = l1 - $1.size
|
298
|
+
elsif ref =~ /(\-+)$/
|
299
|
+
l2 = l2 + $1.size
|
300
|
+
end
|
301
|
+
|
302
|
+
if (ori_ref_l - l2 - 1) >= l1
|
303
|
+
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
304
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
305
|
+
aln_test = aln_seq[1]
|
306
|
+
ref = aln_seq[0]
|
307
|
+
|
308
|
+
ref_size = ref.size
|
309
|
+
sim_count = 0
|
310
|
+
(0..(ref_size-1)).each do |n|
|
311
|
+
ref_base = ref[n]
|
312
|
+
test_base = aln_test[n]
|
313
|
+
sim_count += 1 if ref_base == test_base
|
314
|
+
end
|
315
|
+
similarity = (sim_count/ref_size.to_f*100).round(1)
|
316
|
+
|
317
|
+
loc_p1 = l1 + 1
|
318
|
+
loc_p2 = ori_ref_l - l2
|
319
|
+
if seq.size != (loc_p2 - loc_p1 + 1)
|
320
|
+
indel = true
|
321
|
+
elsif aln_test.include?("-")
|
322
|
+
indel = true
|
323
|
+
else
|
324
|
+
indel = false
|
325
|
+
end
|
326
|
+
return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
|
327
|
+
else
|
328
|
+
return [0,0,0,0,0,0,0]
|
329
|
+
end
|
330
|
+
rescue => e
|
331
|
+
puts "Unexpected error occured."
|
332
|
+
puts "Exception Class: #{ e.class.name }"
|
333
|
+
puts "Exception Message: #{ e.message }"
|
334
|
+
puts "Exception Backtrace: #{ e.backtrace[0] }"
|
335
|
+
puts "ViralSeq.sequence_locator returns nil"
|
336
|
+
return nil
|
329
337
|
end
|
330
|
-
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
end
|
335
|
-
|
336
|
-
#
|
337
|
-
#
|
338
|
-
#
|
339
|
-
#
|
340
|
-
# "
|
341
|
-
#
|
342
|
-
#
|
343
|
-
#
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
pool = ["A","C","T","G"]
|
368
|
-
pool.delete(nt)
|
369
|
-
s = error_rate * 10000
|
370
|
-
r = rand(10000)
|
371
|
-
if r < s
|
372
|
-
nt = pool.sample
|
338
|
+
end # end of locator
|
339
|
+
|
340
|
+
# Given start and end positions on the reference genome, return a sub-sequence of the target sequence in that range
|
341
|
+
# @param p1 [Integer] start position number on the reference genome
|
342
|
+
# @param p2 [Integer] end position number on the reference genome
|
343
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
344
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
345
|
+
# @return [ViralSeq::Sequence, nil] a new ViralSeq::Sequence object covering the requested range on the reference genome, or nil
|
346
|
+
# if either the start or end position is beyond the range of the target sequence.
|
347
|
+
# @example trim a sequence to fit in the range of [2333, 2433] on the HXB2 nt reference
|
348
|
+
# seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
|
349
|
+
# s = ViralSeq::Sequence.new('my_seq', seq)
|
350
|
+
# s.sequence_clip(2333, 2433, :HXB2).dna
|
351
|
+
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
352
|
+
|
353
|
+
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  # Clip this sequence to the reference-genome range [p1, p2].
  # Returns a new ViralSeq::Sequence holding the clipped, gap-free bases,
  # or nil when the requested range is not fully covered by the located
  # range of this sequence.
  loc = locator(ref_option, path_to_muscle)
  ref_start = loc[0] # first reference position covered by this sequence
  ref_end = loc[1]   # last reference position covered by this sequence

  # Short-circuit boolean check: the requested range must sit inside the located range.
  return nil unless p1 >= ref_start && p2 <= ref_end

  aligned_seq = loc[4]
  aligned_ref = loc[5]

  # Walk the aligned reference from the left until reference position p1 is
  # reached; gap columns consume an alignment column but no reference position.
  head_skip = 0
  aligned_ref.each_char do |char|
    break if ref_start == p1
    head_skip += 1
    ref_start += 1 unless char == "-"
  end

  # Same walk from the right end; the counter starts at 1 so that its negation
  # is directly usable as an inclusive end index (-1 is the last column).
  tail_index = 1
  aligned_ref.reverse.each_char do |char|
    break if ref_end == p2
    tail_index += 1
    ref_end -= 1 unless char == "-"
  end

  ViralSeq::Sequence.new(name, aligned_seq[head_skip..(-tail_index)].tr("-", ""))
end
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
377
|
+
|
378
|
+
# start of private functions
|
379
|
+
private
|
380
|
+
|
381
|
+
# generate amino acid abbreviations from 3 bases, ambiguity will return "#"
|
382
|
+
def amino_acid(bases)
  # Translate one codon (a 3-base string, IUPAC ambiguity codes allowed) into
  # a one-letter amino acid abbreviation. "*" marks a stop codon. Returns "#"
  # when the codon does not resolve to a single amino acid.
  #
  # Branches are tested top to bottom, so earlier patterns win. The former
  # catch-all /^[ATW][CGS][CTY]$/ => "S" has been removed: every codon that
  # reached it (e.g. WGY = AGY/TGY, WCY = ACY/TCY) mixes Ser with Cys or Thr,
  # so it must fall through to "#".
  case bases
  when /^TT[TCY]$/ then "F"
  when /^TT[AGR]$/ then "L"
  when /^CT.$/ then "L"
  when /^AT[TCAHYWM]$/ then "I"
  when "ATG" then "M"
  when /^GT.$/ then "V"
  when /^TC.$/ then "S"
  when /^CC.$/ then "P"
  when /^AC.$/ then "T"
  when /^GC.$/ then "A"
  when /^TA[TCY]$/ then "Y"
  when /^TA[AGR]$/ then "*"
  when /^T[GR]A$/ then "*"
  when /^CA[TCY]$/ then "H"
  when /^CA[AGR]$/ then "Q"
  when /^AA[TCY]$/ then "N"
  when /^AA[AGR]$/ then "K"
  when /^GA[TCY]$/ then "D"
  when /^GA[AGR]$/ then "E"
  when /^TG[TCY]$/ then "C"
  when "TGG" then "W"
  when /^CG.$/ then "R"
  when /^AG[TCY]$/ then "S"
  when /^[AM]G[AGR]$/ then "R" # AGR and CGR both encode Arg
  when /^GG.$/ then "G"
  when /^[TCY]T[AGR]$/ then "L" # TTR and CTR both encode Leu
  else "#"
  end
end # end of amino_acid
|
442
|
+
|
443
|
+
# keep ambiguities, return all possible amino acids.
|
444
|
+
|
445
|
+
def amino_acid_2(bases)
  # Translate one codon while keeping ambiguities: each of the three bases is
  # expanded with String#to_list (defined elsewhere in this gem; presumably it
  # maps an IUPAC code to its possible bases -- confirm there), every
  # combination is translated, and the distinct results are concatenated in
  # first-seen order. Combinations matching no pattern contribute "-".
  first_options = bases[0].to_list
  second_options = bases[1].to_list
  third_options = bases[2].to_list

  # Array#product enumerates with the first base outermost and the third base
  # innermost, the same order as three nested loops.
  codons = first_options.product(second_options, third_options).map { |b1, b2, b3| b1 + b2 + b3 }

  residues = codons.map do |codon|
    # The former catch-all /^[ATW][CGS][CTY]$/ => "S" has been removed: the
    # unambiguous codons it matched are all handled by earlier branches, and
    # anything else it matched mixes Ser with Thr or Cys.
    case codon
    when /^TT[TCY]$/ then "F"
    when /^TT[AGR]$/ then "L"
    when /^CT.$/ then "L"
    when /^AT[TCAHYWM]$/ then "I"
    when "ATG" then "M"
    when /^GT.$/ then "V"
    when /^TC.$/ then "S"
    when /^CC.$/ then "P"
    when /^AC.$/ then "T"
    when /^GC.$/ then "A"
    when /^TA[TCY]$/ then "Y"
    when /^TA[AGR]$/ then "*"
    when /^T[GR]A$/ then "*"
    when /^CA[TCY]$/ then "H"
    when /^CA[AGR]$/ then "Q"
    when /^AA[TCY]$/ then "N"
    when /^AA[AGR]$/ then "K"
    when /^GA[TCY]$/ then "D"
    when /^GA[AGR]$/ then "E"
    when /^TG[TCY]$/ then "C"
    when "TGG" then "W"
    when /^CG.$/ then "R"
    when /^AG[TCY]$/ then "S"
    when /^[AM]G[AGR]$/ then "R"
    when /^GG.$/ then "G"
    when /^[TCY]T[AGR]$/ then "L"
    else "-"
    end
  end

  residues.uniq.join
end # end of #amino_acid_2
|
530
|
+
|
531
|
+
# sdrm position hash
|
532
|
+
def sdrm_hash(options)
  # Look up the surveillance drug-resistance mutation table for one target.
  # Keys are amino acid positions; each value is
  # [wild-type residue, [resistance-associated residues]].
  # Raises a RuntimeError for an unsupported option.
  case options
  when :hcv_ns5a
    {
      28 => ['M', %w[T]],
      30 => ['L', %w[H K R Q A S D]],
      31 => ['L', %w[M V F]],
      32 => ['P', %w[L]],
      44 => ['K', %w[R]],
      58 => ['H', %w[D P S]],
      64 => ['T', %w[A S]],
      77 => ['P', %w[A S]],
      78 => ['R', %w[K]],
      79 => ['T', %w[A]],
      83 => ['T', %w[M]],
      85 => ['S', %w[N H Y]],
      92 => ['A', %w[P T K E]],
      93 => ['Y', %w[C F H N]],
      107 => ['K', %w[T S]],
      121 => ['I', %w[V]],
      135 => ['T', %w[A]]
    }
  when :nrti
    {
      41 => ['M', %w[L]],
      65 => ['K', %w[R]],
      67 => ['D', %w[N G E]],
      69 => ['T', %w[D]],
      70 => ['K', %w[R E]],
      74 => ['L', %w[V I]],
      75 => ['V', %w[M T A S]],
      77 => ['F', %w[L]],
      115 => ['Y', %w[F]],
      116 => ['F', %w[Y]],
      151 => ['Q', %w[M]],
      184 => ['M', %w[V I]],
      210 => ['L', %w[W]],
      215 => ['T', %w[Y F I C D V E]],
      219 => ['K', %w[Q E N R]]
    }
  when :nnrti
    {
      100 => ['L', %w[I]],
      101 => ['K', %w[E P]],
      103 => ['K', %w[N S]],
      106 => ['V', %w[M A]],
      179 => ['V', %w[F D]],
      181 => ['Y', %w[C I V]],
      188 => ['Y', %w[L H C]],
      190 => ['G', %w[A S E]],
      225 => ['P', %w[H]],
      230 => ['M', %w[L]]
    }
  when :hiv_pr
    {
      23 => ['L', %w[I]],
      24 => ['L', %w[I]],
      30 => ['D', %w[N]],
      32 => ['V', %w[I]],
      46 => ['M', %w[I L]],
      47 => ['I', %w[V A]],
      48 => ['G', %w[V M]],
      50 => ['I', %w[V L]],
      53 => ['F', %w[L]],
      54 => ['I', %w[V L M T A S]],
      73 => ['G', %w[S T C A]],
      76 => ['L', %w[V]],
      82 => ['V', %w[A T S F L C M]],
      83 => ['N', %w[D]],
      84 => ['I', %w[V A C]],
      88 => ['N', %w[D S]],
      90 => ['L', %w[M]]
    }
  when :hiv_in
    {
      66 => ['T', %w[A I K]],
      74 => ['L', %w[M]],
      92 => ['E', %w[Q]],
      95 => ['Q', %w[K]],
      97 => ['T', %w[A]],
      121 => ['F', %w[Y]],
      140 => ['G', %w[A S C]],
      143 => ['Y', %w[C H R]],
      147 => ['S', %w[G]],
      148 => ['Q', %w[H K R]],
      155 => ['N', %w[S H]]
    }
  else
    raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
  end
end
|
390
|
-
|
391
|
-
|
392
|
-
end
|
614
|
+
end # end of ViralSeq::Sequence
|
615
|
+
end # end of ViralSeq
|