viral_seq 0.3.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
@@ -0,0 +1,219 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
# Class for paired-end sequences.
|
5
|
+
# @example initialize a new SeqHashPair object from a directory containing paired-end sequences
|
6
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.fa('my_seq_directory')
|
7
|
+
# @example join the paired-end sequences with an overlap of 100 bp
|
8
|
+
# my_seqhashpair.join1(100)
|
9
|
+
# @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
|
10
|
+
# my_seqhashpair.join2(:indiv)
|
11
|
+
|
12
|
+
class SeqHashPair
|
13
|
+
|
14
|
+
# initialize SeqHashPair object with @dna_hash, @title and @file
|
15
|
+
|
16
|
+
def initialize (dna_hash = {}, title = "", file = [])
|
17
|
+
@dna_hash = dna_hash
|
18
|
+
@title = title
|
19
|
+
@file = file
|
20
|
+
end
|
21
|
+
|
22
|
+
# @return [Hash] Hash object for :name => [:r1_sequence_string, :r2_sequence_string]
|
23
|
+
|
24
|
+
attr_accessor :dna_hash
|
25
|
+
|
26
|
+
# @return [String] the title of the SeqHashPair object.
|
27
|
+
# Defaults to the directory basename if the SeqHashPair object is initialized using ::fa
|
28
|
+
|
29
|
+
attr_accessor :title
|
30
|
+
|
31
|
+
# @return [Array<String>] the r1 and r2 files that are used to initialize the SeqHashPair object, if they exist
|
32
|
+
|
33
|
+
attr_accessor :file
|
34
|
+
|
35
|
+
# initialize a new ViralSeq::SeqHashPair object from a directory containing paired sequence files in the FASTA format
|
36
|
+
# @param indir [String] directory containing paired sequence files in the FASTA format,
|
37
|
+
#
|
38
|
+
# Paired sequence files need to have "r1" and "r2" in their file names
|
39
|
+
#
|
40
|
+
# Example for the file structure
|
41
|
+
# ├───lib1
|
42
|
+
# │ lib1_r1.txt
|
43
|
+
# │ lib1_r2.txt
|
44
|
+
# The names of paired sequences should differ only in their last 3 characters, which distinguish the r1 and r2 sequences.
|
45
|
+
# @return [ViralSeq::SeqHashPair] new SeqHashPair object from the paired FASTA sequence files
|
46
|
+
# @example initialize a new SeqHashPair object from a directory containing paired-end sequences
|
47
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.fa('spec/sample_paired_seq')
|
48
|
+
|
49
|
+
def self.new_from_fasta(indir)
|
50
|
+
files = Dir[indir + "/*"]
|
51
|
+
r1_file = ""
|
52
|
+
r2_file = ""
|
53
|
+
files.each do |f|
|
54
|
+
if File.basename(f) =~ /r1/i
|
55
|
+
r1_file = f
|
56
|
+
elsif File.basename(f) =~ /r2/i
|
57
|
+
r2_file = f
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
seq1 = ViralSeq::SeqHash.fa(r1_file).dna_hash
|
62
|
+
seq2 = ViralSeq::SeqHash.fa(r2_file).dna_hash
|
63
|
+
|
64
|
+
new_seq1 = seq1.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
65
|
+
new_seq2 = seq2.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
66
|
+
|
67
|
+
seq_pair_hash = {}
|
68
|
+
|
69
|
+
new_seq1.each do |seq_name,seq|
|
70
|
+
seq_pair_hash[seq_name] = [seq, new_seq2[seq_name]]
|
71
|
+
end
|
72
|
+
seq_hash = ViralSeq::SeqHashPair.new
|
73
|
+
seq_hash.dna_hash = seq_pair_hash
|
74
|
+
seq_hash.title = File.basename(indir,".*")
|
75
|
+
seq_hash.file = [r1_file, r2_file]
|
76
|
+
return seq_hash
|
77
|
+
end # end of .new_from_fasta
|
78
|
+
|
79
|
+
class << self
|
80
|
+
alias_method :fa, :new_from_fasta
|
81
|
+
end
|
82
|
+
|
83
|
+
# Pair-end join function for KNOWN overlap size.
|
84
|
+
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
85
|
+
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. Defaults to 0.0, i.e. no mismatch allowed.
|
86
|
+
# @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
|
87
|
+
# @example join paired-end sequences with different :diff cut-offs, overlap provided.
|
88
|
+
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
89
|
+
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
90
|
+
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
91
|
+
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
92
|
+
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
93
|
+
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
94
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seqs)
|
95
|
+
# my_seqhashpair.join1(100).dna_hash.keys
|
96
|
+
# => [">pair1"]
|
97
|
+
# my_seqhashpair.join1(100,0.01).dna_hash.keys
|
98
|
+
# => [">pair1", ">pair2"]
|
99
|
+
# my_seqhashpair.join1(100,0.02).dna_hash.keys
|
100
|
+
# => [">pair1", ">pair2", ">pair3"]
|
101
|
+
|
102
|
+
def join1(overlap = 0, diff = 0.0)
|
103
|
+
seq_pair_hash = self.dna_hash
|
104
|
+
raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
|
105
|
+
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
106
|
+
joined_seq = {}
|
107
|
+
seq_pair_hash.each do |seq_name, seq_pair|
|
108
|
+
r1_seq = seq_pair[0]
|
109
|
+
r2_seq = seq_pair[1]
|
110
|
+
if overlap.zero?
|
111
|
+
joined_seq[seq_name] = r1_seq + r2_seq
|
112
|
+
elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
|
113
|
+
joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
|
114
|
+
else
|
115
|
+
next
|
116
|
+
end
|
117
|
+
end
|
118
|
+
joined_seq_hash = ViralSeq::SeqHash.new
|
119
|
+
joined_seq_hash.dna_hash = joined_seq
|
120
|
+
joined_seq_hash.title = self.title + "_joined"
|
121
|
+
joined_seq_hash.file = File.dirname(self.file[0]) if self.file.size > 0
|
122
|
+
return joined_seq_hash
|
123
|
+
end # end of join1
|
124
|
+
|
125
|
+
|
126
|
+
# Pair-end join function for UNKNOWN overlap.
|
127
|
+
# @param model [Symbol] models used to determine the overlap, `:con`, `:indiv`
|
128
|
+
#
|
129
|
+
# model `:con`: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
130
|
+
#
|
131
|
+
# note: the minimal overlap is 4 bases.
|
132
|
+
# model `:indiv`: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
133
|
+
# @param diff (see #join1)
|
134
|
+
# @return (see #join1)
|
135
|
+
# @example join paired-end sequences, overlap NOT provided
|
136
|
+
# paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
|
137
|
+
# ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
|
138
|
+
# ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"] }
|
139
|
+
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
|
140
|
+
# my_seqhashpair.join2.dna_hash
|
141
|
+
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
142
|
+
# my_seqhashpair.join2(:indiv).dna_hash
|
143
|
+
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
144
|
+
|
145
|
+
def join2(model = :con, diff = 0.0)
|
146
|
+
seq_pair_hash = self.dna_hash
|
147
|
+
begin
|
148
|
+
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
149
|
+
if model == :con
|
150
|
+
overlap = determine_overlap_pid_pair(seq_pair_hash, diff)
|
151
|
+
return self.join1(overlap, diff)
|
152
|
+
elsif model == :indiv
|
153
|
+
joined_seq = {}
|
154
|
+
seq_pair_hash.each do |seq_name, seq_pair|
|
155
|
+
overlap_list = []
|
156
|
+
overlap_matrix(seq_pair[0], seq_pair[1]).each do |overlap1, diff_nt|
|
157
|
+
cut_off_base = overlap1 * diff
|
158
|
+
overlap_list << overlap1 if diff_nt <= cut_off_base
|
159
|
+
end
|
160
|
+
if overlap_list.empty?
|
161
|
+
joined_seq[seq_name] = seq_pair[0] + seq_pair[1]
|
162
|
+
else
|
163
|
+
overlap = overlap_list.max
|
164
|
+
joined_seq[seq_name] = seq_pair[0] + seq_pair[1][overlap..-1]
|
165
|
+
end
|
166
|
+
end
|
167
|
+
joined_seq_hash = ViralSeq::SeqHash.new
|
168
|
+
joined_seq_hash.dna_hash = joined_seq
|
169
|
+
joined_seq_hash.title = self.title + "_joined"
|
170
|
+
joined_seq_hash.file = File.dirname(self.file[0]) if self.file.size > 0
|
171
|
+
return joined_seq_hash
|
172
|
+
else
|
173
|
+
raise ArgumentError.new("Error::Wrong Overlap Model Argument. Given \`#{model}\`, expected `:con` or `:indiv`.")
|
174
|
+
end
|
175
|
+
rescue ArgumentError => e
|
176
|
+
puts e
|
177
|
+
return nil
|
178
|
+
end
|
179
|
+
end # end of join2
|
180
|
+
|
181
|
+
private
|
182
|
+
# determine overlap size from @dna_hash
|
183
|
+
def determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
|
184
|
+
overlaps = []
|
185
|
+
seq_pair_hash.each do |_seq_name, seq_pair|
|
186
|
+
overlap_list = []
|
187
|
+
matrix = overlap_matrix(seq_pair[0], seq_pair[1])
|
188
|
+
matrix.each do |overlap, diff_nt|
|
189
|
+
cut_off_base = overlap * diff
|
190
|
+
overlap_list << overlap if diff_nt <= cut_off_base
|
191
|
+
end
|
192
|
+
if overlap_list.empty?
|
193
|
+
overlaps << 0
|
194
|
+
else
|
195
|
+
overlaps << overlap_list.max
|
196
|
+
end
|
197
|
+
end
|
198
|
+
count_overlaps = overlaps.count_freq
|
199
|
+
max_value = count_overlaps.values.max
|
200
|
+
max_overlap_list = []
|
201
|
+
count_overlaps.each {|overlap, counts| max_overlap_list << overlap if counts == max_value}
|
202
|
+
max_overlap_list.max
|
203
|
+
end # end of determine_overlap_pid_pair
|
204
|
+
|
205
|
+
# input a pair of sequences as Strings, return a Hash object describing each candidate overlap
|
206
|
+
# {:overlap_size => number_of_different_positions, ...}
|
207
|
+
# (the minimal overlap is set to 4.)
|
208
|
+
def overlap_matrix(sequence1, sequence2)
|
209
|
+
min_overlap = 4
|
210
|
+
max_overlap = [sequence1.size, sequence2.size].max
|
211
|
+
matrix_hash = {}
|
212
|
+
(min_overlap..max_overlap).each do |overlap|
|
213
|
+
matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
|
214
|
+
end
|
215
|
+
return matrix_hash
|
216
|
+
end # end of overlap_matrix
|
217
|
+
|
218
|
+
end # end of SeqHashPair
|
219
|
+
end # end of ViralSeq
|
data/lib/viral_seq/sequence.rb
CHANGED
@@ -1,392 +1,615 @@
|
|
1
|
-
# lib/sequence.rb
|
2
|
-
# Includes functions for sequence operations
|
3
|
-
# Including methods as:
|
4
|
-
# ViralSeq::AMINO_ACID_LIST
|
5
|
-
# ViralSeq::Sequence
|
6
|
-
# ViralSeq::Sequence#rev_complement
|
7
|
-
# ViralSeq::Sequence#get_aa_sequence
|
8
|
-
# ViralSeq::Sequence#get_aa_array
|
9
|
-
# ViralSeq::Sequence#name
|
10
|
-
# ViralSeq::Sequence#dna_sequence
|
11
|
-
# ViralSeq::Sequence#aa_sequence
|
12
|
-
# ViralSeq::Sequence#aa_array
|
13
|
-
# ViralSeq::amino_acid
|
14
|
-
# ViralSeq::amino_acid_2
|
15
|
-
# ViralSeq::to_list
|
16
|
-
# ViralSeq::uniq_sequence_hash
|
17
|
-
# ViralSeq::stop_codon_seq_hash
|
18
|
-
# String#rc
|
19
|
-
# String#mutation
|
20
|
-
# String#nt_parser
|
21
|
-
|
22
|
-
# ViralSeq::AMINO_ACID_LIST
|
23
|
-
# # Array of all amino acid one letter abbreviations
|
24
|
-
|
25
|
-
# ViralSeq::Sequence
|
26
|
-
# # Sequence class
|
27
|
-
# =USAGE
|
28
|
-
# # create a sequence object
|
29
|
-
# seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
|
30
|
-
#
|
31
|
-
# # print dna sequence
|
32
|
-
# puts seq.dna_sequence
|
33
|
-
#
|
34
|
-
# # reserce complement sequence of DNA sequence, return as a string
|
35
|
-
# seq.rev_complement
|
36
|
-
#
|
37
|
-
# # change @dna_sequence to reverse complement DNA sequence
|
38
|
-
# seq.rev_complement!
|
39
|
-
#
|
40
|
-
# # generate amino acid sequences. either return string or array.
|
41
|
-
# # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
|
42
|
-
# # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
|
43
|
-
# seq.get_aa_sequence
|
44
|
-
# # or
|
45
|
-
# seq.get_aa_array
|
46
|
-
#
|
47
|
-
# # print amino acid sequence
|
48
|
-
# puts seq.aa_sequence
|
49
|
-
|
50
|
-
# ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
|
51
|
-
# # collapse sequence hash to unique sequence hash.
|
52
|
-
# # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
|
53
|
-
# # master_sequence_tag is the master tag for unique sequences
|
54
|
-
# # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
|
55
|
-
# =USAGE
|
56
|
-
# sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
|
57
|
-
# '>seq4' => 'CCCC', '>seq5' => 'CCCC',
|
58
|
-
# '>seq6' => 'TTTT' }
|
59
|
-
# uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
|
60
|
-
# => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
|
61
1
|
|
62
2
|
module ViralSeq
|
63
3
|
|
64
|
-
#
|
65
|
-
|
66
|
-
|
67
|
-
#
|
4
|
+
# ViralSeq::Sequence class for sequence operation
|
5
|
+
#
|
6
|
+
# @example create a sequence object
|
7
|
+
# seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
|
8
|
+
# => #<ViralSeq::Sequence:0x00007fd03c8c10b8 @name="my_sequence", @dna="ACCTAGGTTCGGAGC", @aa_string="", @aa_array=[]>
|
9
|
+
#
|
10
|
+
# @example return dna sequence as String
|
11
|
+
# seq.dna
|
12
|
+
# => "ACCTAGGTTCGGAGC"
|
13
|
+
#
|
14
|
+
# @example reverse complement sequence of DNA sequence
|
15
|
+
# seq.rc
|
16
|
+
# => "GCTCCGAACCTAGGT"
|
17
|
+
#
|
18
|
+
# @example change @dna to reverse complement DNA sequence
|
19
|
+
# seq.rc!
|
20
|
+
#
|
21
|
+
# @example translate the DNA sequence, return values for @aa_string and @aa_array
|
22
|
+
# seq = ViralSeq::Sequence.new('my_sequence', 'AWTCGRAGAG')
|
23
|
+
# seq.translate(1)
|
24
|
+
# seq.aa_string
|
25
|
+
# => "##E"
|
26
|
+
# seq.aa_array
|
27
|
+
# => ["IF", "EG", "E"]
|
68
28
|
|
69
29
|
class Sequence
|
30
|
+
# initialize a ViralSeq::Sequence class with sequence name (default as '>sequence')
|
31
|
+
# and DNA sequence as String object
|
70
32
|
def initialize (name = ">sequence",dna_sequence ="")
|
71
33
|
@name = name
|
72
|
-
@
|
73
|
-
@
|
34
|
+
@dna = dna_sequence.upcase
|
35
|
+
@aa_string = ""
|
74
36
|
@aa_array = []
|
75
37
|
end
|
76
38
|
|
77
|
-
|
39
|
+
# @return [String] sequence tag name
|
40
|
+
attr_accessor :name
|
41
|
+
|
42
|
+
# @return [String] DNA sequence
|
43
|
+
attr_accessor :dna
|
44
|
+
|
45
|
+
# @return [String] amino acid sequence
|
46
|
+
attr_accessor :aa_string
|
47
|
+
|
48
|
+
# @return [Array] amino acid sequence as an Array object,
|
49
|
+
# an ambiguous DNA sequence will be translated into all possible amino acids at that position
|
50
|
+
attr_accessor :aa_array
|
78
51
|
|
52
|
+
# @return [String] reverse complement sequence of the @dna.
|
79
53
|
def rev_complement
|
80
|
-
@
|
54
|
+
@dna.rc
|
81
55
|
end
|
56
|
+
|
57
|
+
# replace the @dna with reverse complement DNA sequence.
|
82
58
|
def rev_complement!
|
83
|
-
@
|
59
|
+
@dna = @dna.rc
|
84
60
|
end
|
85
61
|
|
86
|
-
|
87
|
-
|
88
|
-
|
62
|
+
alias_method :rc, :rev_complement
|
63
|
+
alias_method :rc!, :rev_complement!
|
64
|
+
|
65
|
+
# translate @dna to amino acid sequence.
|
66
|
+
# generate values for @aa_string and @aa_array
|
67
|
+
# @param initial_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
|
68
|
+
|
69
|
+
def translate(initial_position = 0)
|
70
|
+
@aa_string = ""
|
71
|
+
require_sequence = @dna[initial_position..-1]
|
89
72
|
base_array = []
|
90
73
|
require_sequence.each_char {|base| base_array << base}
|
91
74
|
while (base_array.length>=3) do
|
92
75
|
base_3= ""
|
93
76
|
3.times {base_3 += base_array.shift}
|
94
|
-
@
|
77
|
+
@aa_string << amino_acid(base_3)
|
95
78
|
end
|
96
|
-
return @aa_sequence
|
97
|
-
end
|
98
79
|
|
99
|
-
# get amino acid calls, return a array.keep ambiguity calls.
|
100
|
-
def get_aa_array(initial_position = 0)
|
101
80
|
@aa_array = []
|
102
|
-
require_sequence = @
|
81
|
+
require_sequence = @dna[initial_position..-1].tr('-','N')
|
103
82
|
base_array = []
|
104
83
|
require_sequence.each_char {|base| base_array << base}
|
105
84
|
while (base_array.length>=3) do
|
106
85
|
base_3= ""
|
107
86
|
3.times{base_3 += base_array.shift}
|
108
|
-
@aa_array<<
|
87
|
+
@aa_array<< amino_acid_2(base_3)
|
109
88
|
end
|
110
|
-
return @aa_array
|
111
89
|
end
|
90
|
+
|
91
|
+
# @return [Integer] length of DNA sequence
|
112
92
|
def dna_length
|
113
|
-
@
|
93
|
+
@dna.length
|
114
94
|
end
|
95
|
+
|
96
|
+
# @return [Integer] length of amino acid sequence
|
115
97
|
def aa_length
|
116
|
-
@
|
98
|
+
@aa_string.length
|
117
99
|
end
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
return "N"
|
155
|
-
when /^AA[AGR]$/
|
156
|
-
return "K"
|
157
|
-
when /^GA[TCY]$/
|
158
|
-
return "D"
|
159
|
-
when /^GA[AGR]$/
|
160
|
-
return "E"
|
161
|
-
when /^TG[TCY]$/
|
162
|
-
return "C"
|
163
|
-
when "TGG"
|
164
|
-
return "W"
|
165
|
-
when /^CG.$/
|
166
|
-
return "R"
|
167
|
-
when /^AG[TCY]$/
|
168
|
-
return "S"
|
169
|
-
when /^[AM]G[AGR]$/
|
170
|
-
return "R"
|
171
|
-
when /^GG.$/
|
172
|
-
return "G"
|
173
|
-
when /^[ATW][CGS][CTY]$/
|
174
|
-
return "S"
|
175
|
-
when /^[TCY]T[AGR]$/
|
176
|
-
return "L"
|
177
|
-
else
|
178
|
-
return "#"
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
# keep ambiguities, return all possible amino acids.
|
183
|
-
|
184
|
-
def self.amino_acid_2 (bases)
|
185
|
-
bases_to_aa = []
|
186
|
-
aa_list = []
|
187
|
-
base1 = ViralSeq.to_list(bases[0])
|
188
|
-
base2 = ViralSeq.to_list(bases[1])
|
189
|
-
base3 = ViralSeq.to_list(bases[2])
|
190
|
-
l1 = base1.size - 1
|
191
|
-
l2 = base2.size - 1
|
192
|
-
l3 = base3.size - 1
|
193
|
-
(0..l1).each do |n1|
|
194
|
-
b1 = base1[n1]
|
195
|
-
(0..l2).each do |n2|
|
196
|
-
b2 = base2[n2]
|
197
|
-
(0..l3).each do |n3|
|
198
|
-
b3 = base3[n3]
|
199
|
-
bases_all = b1 + b2 + b3
|
200
|
-
bases_to_aa << bases_all
|
100
|
+
|
101
|
+
# resistant mutation interpretation for a chosen region from a translated ViralSeq::Sequence object
|
102
|
+
# @param option [Symbol] option of region to interpret, `:hcv_ns5a`, `:hiv_pr`, `:nrti`, `:nnrti`, `:hiv_in`
|
103
|
+
# @param start_aa [Integer] the starting aa number of the input sequence
|
104
|
+
# @return [Hash] a Hash object of the SDRMs identified. :position => [:wildtype_amino_acid, :mutation_amino_acid]
|
105
|
+
# @example examine an HIV PR region sequence for drug resistance mutations
|
106
|
+
# my_seq_name = 'a_pr_seq'
|
107
|
+
# my_seq = 'CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAAATAGGAGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATAATACAGTATTAGAAGACATGGAGTTACCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATCTGTGGGCATAAAACTACAGGTACAGTGTTAATAGGACCTACACCCGTCAACATAATTGGAAGAGATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTT'
|
108
|
+
# s = ViralSeq::Sequence.new(my_seq_name, my_seq)
|
109
|
+
# s.translate
|
110
|
+
# s.sdrm(:hiv_pr)
|
111
|
+
# => {30=>["D", "N"], 88=>["N", "D"]}
|
112
|
+
|
113
|
+
def sdrm(option, start_aa = 1)
|
114
|
+
aa_array = self.aa_array
|
115
|
+
out_hash = {}
|
116
|
+
sdrm = sdrm_hash(option)
|
117
|
+
aa_length = aa_array.size
|
118
|
+
end_aa = start_aa + aa_length - 1
|
119
|
+
(start_aa..end_aa).each do |position|
|
120
|
+
array_position = position - start_aa
|
121
|
+
if sdrm.keys.include?(position)
|
122
|
+
wt_aa = sdrm[position][0]
|
123
|
+
test_aa = aa_array[array_position]
|
124
|
+
if test_aa.size == 1
|
125
|
+
unless wt_aa == test_aa
|
126
|
+
if sdrm[position][1].include?(test_aa)
|
127
|
+
out_hash[position] = [wt_aa,test_aa]
|
128
|
+
end
|
129
|
+
end
|
130
|
+
else
|
131
|
+
test_aa_array = test_aa.split("")
|
132
|
+
if (test_aa_array & sdrm[position][1])
|
133
|
+
out_hash[position] = [wt_aa,test_aa]
|
134
|
+
end
|
135
|
+
end
|
201
136
|
end
|
202
137
|
end
|
203
|
-
|
138
|
+
return out_hash
|
139
|
+
end # end of #sdrm
|
204
140
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
141
|
+
# HIV sequence locator function, resembling HIV Sequence Locator from LANL
|
142
|
+
# # the current version only supports nucleotide sequences, not amino acid sequences.
|
143
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
144
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
145
|
+
# @return [Array] an array of the following info
|
146
|
+
# # start_location (Integer)
|
147
|
+
# # end_location (Integer)
|
148
|
+
# # percentage_of_similarity_to_reference_sequence (Float)
|
149
|
+
# # containing_indel? (Boolean)
|
150
|
+
# # aligned_input_sequence (String)
|
151
|
+
# # aligned_reference_sequence (String)
|
152
|
+
# @example identify the location of the input sequence on the NL43 genome
|
153
|
+
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
154
|
+
# s = ViralSeq::Sequence.new('my_sequence', sequence)
|
155
|
+
# loc = s.locator(:NL43)
|
156
|
+
# h = ViralSeq::SeqHash.new; h.dna_hash['NL43'] = loc[5]; h.dna_hash[s.name] = loc[4]
|
157
|
+
# rs_string = h.to_rsphylip.split("\n")[1..-1].join("\n") # get a relaxed phylip format string for display of alignment.
|
158
|
+
# puts "The input sequence \"#{s.name}\" is located on the NL43 nt sequence from #{loc[0].to_s} to #{loc[1].to_s}.\nIt is #{loc[2].to_s}% similar to the reference.\nIt #{loc[3]? "does" : "does not"} have indels.\nThe alignment is\n#{rs_string}"
|
159
|
+
# => The input sequence "my_sequence" is located on the NL43 nt sequence from 2333 to 2433.
|
160
|
+
# => It is 98.0% similar to the reference.
|
161
|
+
# => It does not have indels.
|
162
|
+
# => The alignment is
|
163
|
+
# => NL43 AGCAGATGAT ACAGTATTAG AAGAAATGAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAGTATGAT C
|
164
|
+
# => my_sequence AGCAGATGAT ACAGTATTAG AAGAAATAAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAATATGAT C
|
165
|
+
# @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
|
166
|
+
|
167
|
+
def locator(ref_option = :HXB2, path_to_muscle = false)
|
168
|
+
seq = self.dna
|
169
|
+
ori_ref = ViralSeq::RefSeq.get(ref_option)
|
170
|
+
|
171
|
+
begin
|
172
|
+
ori_ref_l = ori_ref.size
|
173
|
+
l1 = 0
|
174
|
+
l2 = 0
|
175
|
+
|
176
|
+
aln_seq = ViralSeq::Muscle.align(ori_ref, seq, path_to_muscle)
|
177
|
+
aln_test = aln_seq[1]
|
178
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
179
|
+
gap_begin = $1.size
|
180
|
+
gap_end = $3.size
|
181
|
+
aln_test2 = $2
|
182
|
+
ref = aln_seq[0]
|
183
|
+
ref = ref[gap_begin..(-gap_end-1)]
|
184
|
+
ref_size = ref.size
|
185
|
+
if ref_size > 1.3*(seq.size)
|
186
|
+
l1 = l1 + gap_begin
|
187
|
+
l2 = l2 + gap_end
|
188
|
+
max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
|
189
|
+
aln_test2 =~ /#{max_seq}/
|
190
|
+
before_aln_seq = $`
|
191
|
+
before_aln = $`.size
|
192
|
+
post_aln_seq = $'
|
193
|
+
post_aln = $'.size
|
194
|
+
before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
|
195
|
+
b1 = (1.3 * before_aln_seq_size).to_i
|
196
|
+
post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
|
197
|
+
b2 = (1.3 * post_aln_seq_size).to_i
|
198
|
+
if (before_aln > seq.size) and (post_aln <= seq.size)
|
199
|
+
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
|
200
|
+
l1 = l1 + (before_aln - b1)
|
201
|
+
elsif (post_aln > seq.size) and (before_aln <= seq.size)
|
202
|
+
ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
|
203
|
+
l2 = l2 + post_aln - b2
|
204
|
+
elsif (post_aln > seq.size) and (before_aln > seq.size)
|
205
|
+
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
|
206
|
+
l1 = l1 + (before_aln - b1)
|
207
|
+
l2 = l2 + (post_aln - b2)
|
208
|
+
end
|
209
|
+
|
210
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
211
|
+
aln_test = aln_seq[1]
|
212
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
213
|
+
gap_begin = $1.size
|
214
|
+
gap_end = $3.size
|
215
|
+
ref = aln_seq[0]
|
216
|
+
ref = ref[gap_begin..(-gap_end-1)]
|
217
|
+
end
|
218
|
+
|
219
|
+
aln_test = aln_seq[1]
|
220
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
221
|
+
gap_begin = $1.size
|
222
|
+
gap_end = $3.size
|
223
|
+
aln_test = $2
|
224
|
+
aln_test =~ /^(\w+)(\-*)\w/
|
225
|
+
s1 = $1.size
|
226
|
+
g1 = $2.size
|
227
|
+
aln_test =~ /\w(\-*)(\w+)$/
|
228
|
+
s2 = $2.size
|
229
|
+
g2 = $1.size
|
230
|
+
|
231
|
+
l1 = l1 + gap_begin
|
232
|
+
l2 = l2 + gap_end
|
233
|
+
repeat = 0
|
234
|
+
|
235
|
+
if g1 == g2 and (s1 + g1 + s2) == ref.size
|
236
|
+
if s1 > s2 and g2 > 2*s2
|
237
|
+
ref = ref[0..(-g2-1)]
|
238
|
+
repeat = 1
|
239
|
+
l2 = l2 + g2
|
240
|
+
elsif s1 < s2 and g1 > 2*s1
|
241
|
+
ref = ref[g1..-1]
|
242
|
+
repeat = 1
|
243
|
+
l1 = l1 + g1
|
244
|
+
end
|
245
|
+
else
|
246
|
+
if g1 > 2*s1
|
247
|
+
ref = ref[g1..-1]
|
248
|
+
repeat = 1
|
249
|
+
l1 = l1 + g1
|
250
|
+
end
|
251
|
+
if g2 > 2*s2
|
252
|
+
ref = ref[0..(-g2 - 1)]
|
253
|
+
repeat = 1
|
254
|
+
l2 = l2 + g2
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
while repeat == 1
|
259
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
260
|
+
aln_test = aln_seq[1]
|
261
|
+
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
262
|
+
gap_begin = $1.size
|
263
|
+
gap_end = $3.size
|
264
|
+
aln_test = $2
|
265
|
+
aln_test =~ /^(\w+)(\-*)\w/
|
266
|
+
s1 = $1.size
|
267
|
+
g1 = $2.size
|
268
|
+
aln_test =~ /\w(\-*)(\w+)$/
|
269
|
+
s2 = $2.size
|
270
|
+
g2 = $1.size
|
271
|
+
ref = aln_seq[0]
|
272
|
+
ref = ref[gap_begin..(-gap_end-1)]
|
273
|
+
l1 = l1 + gap_begin
|
274
|
+
l2 = l2 + gap_end
|
275
|
+
repeat = 0
|
276
|
+
if g1 > 2*s1
|
277
|
+
ref = ref[g1..-1]
|
278
|
+
repeat = 1
|
279
|
+
l1 = l1 + g1
|
280
|
+
end
|
281
|
+
if g2 > 2*s2
|
282
|
+
ref = ref[0..(-g2 - 1)]
|
283
|
+
repeat = 1
|
284
|
+
l2 = l2 + g2
|
285
|
+
end
|
286
|
+
end
|
287
|
+
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
288
|
+
|
289
|
+
|
290
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
291
|
+
aln_test = aln_seq[1]
|
292
|
+
ref = aln_seq[0]
|
293
|
+
|
294
|
+
#refine alignment
|
295
|
+
|
296
|
+
if ref =~ /^(\-+)/
|
297
|
+
l1 = l1 - $1.size
|
298
|
+
elsif ref =~ /(\-+)$/
|
299
|
+
l2 = l2 + $1.size
|
300
|
+
end
|
301
|
+
|
302
|
+
if (ori_ref_l - l2 - 1) >= l1
|
303
|
+
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
304
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
|
305
|
+
aln_test = aln_seq[1]
|
306
|
+
ref = aln_seq[0]
|
307
|
+
|
308
|
+
ref_size = ref.size
|
309
|
+
sim_count = 0
|
310
|
+
(0..(ref_size-1)).each do |n|
|
311
|
+
ref_base = ref[n]
|
312
|
+
test_base = aln_test[n]
|
313
|
+
sim_count += 1 if ref_base == test_base
|
314
|
+
end
|
315
|
+
similarity = (sim_count/ref_size.to_f*100).round(1)
|
316
|
+
|
317
|
+
loc_p1 = l1 + 1
|
318
|
+
loc_p2 = ori_ref_l - l2
|
319
|
+
if seq.size != (loc_p2 - loc_p1 + 1)
|
320
|
+
indel = true
|
321
|
+
elsif aln_test.include?("-")
|
322
|
+
indel = true
|
323
|
+
else
|
324
|
+
indel = false
|
325
|
+
end
|
326
|
+
return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
|
327
|
+
else
|
328
|
+
return [0,0,0,0,0,0,0]
|
329
|
+
end
|
330
|
+
rescue => e
|
331
|
+
puts "Unexpected error occured."
|
332
|
+
puts "Exception Class: #{ e.class.name }"
|
333
|
+
puts "Exception Message: #{ e.message }"
|
334
|
+
puts "Exception Backtrace: #{ e.backtrace[0] }"
|
335
|
+
puts "ViralSeq.sequence_locator returns nil"
|
336
|
+
return nil
|
329
337
|
end
|
330
|
-
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
end
|
335
|
-
|
336
|
-
#
|
337
|
-
#
|
338
|
-
#
|
339
|
-
#
|
340
|
-
# "
|
341
|
-
#
|
342
|
-
#
|
343
|
-
#
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
pool = ["A","C","T","G"]
|
368
|
-
pool.delete(nt)
|
369
|
-
s = error_rate * 10000
|
370
|
-
r = rand(10000)
|
371
|
-
if r < s
|
372
|
-
nt = pool.sample
|
338
|
+
end # end of locator
|
339
|
+
|
340
|
+
# Given start and end positions on the reference genome, return a sub-sequence of the target sequence in that range
|
341
|
+
# @param p1 [Integer] start position number on the reference genome
|
342
|
+
# @param p2 [Integer] end position number on the reference genome
|
343
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
344
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
345
|
+
# @return [ViralSeq::Sequence, nil] a new ViralSeq::Sequence object covering the input range on the reference genome, or nil
|
346
|
+
# if either the start or end position is beyond the range of the target sequence.
|
347
|
+
# @example trim a sequence to fit in the range of [2333, 2433] on the HXB2 nt reference
|
348
|
+
# seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
|
349
|
+
# s = ViralSeq::Sequence.new('my_seq', seq)
|
350
|
+
# s.sequence_clip(2333, 2433, :HXB2).dna
|
351
|
+
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
352
|
+
|
353
|
+
# Clip this sequence to the reference-genome range [p1, p2].
# Returns a new ViralSeq::Sequence carrying the same name, or nil when the
# requested range is not contained in the range reported by #locator.
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  location = locator(ref_option, path_to_muscle)
  ref_start = location[0]
  ref_end = location[1]

  # Evaluate both bounds up front, then bail out if either one is outside.
  starts_inside = p1 >= ref_start
  ends_inside = p2 <= ref_end
  return nil unless starts_inside && ends_inside

  # presumably the gapped (aligned) target and reference strings produced by
  # the locator alignment -- confirm against #locator
  aligned_seq = location[4]
  aligned_ref = location[5]

  # Count alignment columns from the left until the reference position
  # reaches p1; gap columns ("-") advance the column but not the position.
  left_offset = 0
  aligned_ref.each_char do |ref_base|
    break if ref_start == p1
    left_offset += 1
    ref_start += 1 unless ref_base == "-"
  end

  # Same walk from the right. The counter starts at 1 so that it can be
  # negated and used directly as an end index (-1 is the last column).
  right_offset = 1
  aligned_ref.reverse.each_char do |ref_base|
    break if ref_end == p2
    right_offset += 1
    ref_end -= 1 unless ref_base == "-"
  end

  # Slice the aligned target and strip alignment gaps.
  clipped = aligned_seq[left_offset..(-right_offset)].tr("-", "")
  ViralSeq::Sequence.new(name, clipped)
end
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
377
|
+
|
378
|
+
# start of private functions
|
379
|
+
private
|
380
|
+
|
381
|
+
# generate amino acid abbreviations from 3 bases, ambiguity will return "#"
|
382
|
+
# Translate one codon into a single-letter amino acid code.
#
# IUPAC ambiguity codes are accepted only where every codon they stand for
# encodes the same residue (e.g. "TTY" => "F", "YTR" => "L", "MGR" => "R").
# Any other input yields "#".
#
# @param bases [String] three-character codon, upper case
# @return [String] amino acid letter, "*" for a stop codon, or "#" when the
#   codon is ambiguous or unrecognized
def amino_acid(bases)
  # NOTE: the former `/^[ATW][CGS][CTY]$/ => "S"` branch was removed. It was
  # only reachable for codons such as "ASC" (ACC Thr / AGC Ser), "WCC"
  # (ACC Thr / TCC Ser) or "TSC" (TCC Ser / TGC Cys), which are ambiguous
  # and must fall through to "#".
  #
  # `.` in the third position covers four-fold degenerate codons; it accepts
  # any character there, exactly as before.
  case bases
  when /^TT[TCY]$/ then "F"
  when /^TT[AGR]$/, /^CT.$/, /^[TCY]T[AGR]$/ then "L"
  when /^AT[TCAHYWM]$/ then "I"
  when "ATG" then "M"
  when /^GT.$/ then "V"
  when /^TC.$/, /^AG[TCY]$/ then "S"
  when /^CC.$/ then "P"
  when /^AC.$/ then "T"
  when /^GC.$/ then "A"
  when /^TA[TCY]$/ then "Y"
  when /^TA[AGR]$/, /^T[GR]A$/ then "*"
  when /^CA[TCY]$/ then "H"
  when /^CA[AGR]$/ then "Q"
  when /^AA[TCY]$/ then "N"
  when /^AA[AGR]$/ then "K"
  when /^GA[TCY]$/ then "D"
  when /^GA[AGR]$/ then "E"
  when /^TG[TCY]$/ then "C"
  when "TGG" then "W"
  when /^CG.$/, /^[AM]G[AGR]$/ then "R"
  when /^GG.$/ then "G"
  else "#"
  end
end # end of amino_acid
|
442
|
+
|
443
|
+
# keep ambiguities, return all possible amino acids.
|
444
|
+
|
445
|
+
# Translate a codon that may contain ambiguity codes into every amino acid
# it could encode.
#
# Each of the three characters is expanded with String#to_list (defined
# elsewhere; presumably it maps an IUPAC code to an Array of the bases it
# represents -- confirm). Every combination is translated, and the distinct
# results are concatenated in first-seen order.
#
# @param bases [String] three-character codon
# @return [String] distinct amino acid letters joined together; "*" marks a
#   stop codon and "-" a combination matching no codon pattern. Empty when
#   any position expands to nothing.
def amino_acid_2(bases)
  first_bases = bases[0].to_list
  second_bases = bases[1].to_list
  third_bases = bases[2].to_list

  # Cartesian product, last position varying fastest (same order as the
  # nested loops this replaces).
  codons = first_bases.product(second_bases, third_bases).map(&:join)

  residues = codons.map do |codon|
    case codon
    when /^TT[TCY]$/ then "F"
    when /^TT[AGR]$/, /^CT.$/ then "L"
    when /^AT[TCAHYWM]$/ then "I"
    when "ATG" then "M"
    when /^GT.$/ then "V"
    when /^TC.$/, /^AG[TCY]$/ then "S"
    when /^CC.$/ then "P"
    when /^AC.$/ then "T"
    when /^GC.$/ then "A"
    when /^TA[TCY]$/ then "Y"
    when /^TA[AGR]$/, /^T[GR]A$/ then "*"
    when /^CA[TCY]$/ then "H"
    when /^CA[AGR]$/ then "Q"
    when /^AA[TCY]$/ then "N"
    when /^AA[AGR]$/ then "K"
    when /^GA[TCY]$/ then "D"
    when /^GA[AGR]$/ then "E"
    when /^TG[TCY]$/ then "C"
    when "TGG" then "W"
    when /^CG.$/, /^[AM]G[AGR]$/ then "R"
    when /^GG.$/ then "G"
    # The former `[ATW][CGS][CTY] => "S"` pattern is gone: it matched mixed
    # Ser/Thr/Cys codons and cannot be correct for any input that reaches it.
    else "-"
    end
  end

  residues.uniq.join
end # end of #amino_acid_2
|
530
|
+
|
531
|
+
# sdrm position hash
|
532
|
+
# Build the SDRM position table for one target region.
#
# @param options [Symbol] one of :hcv_ns5a, :nrti, :nnrti, :hiv_pr, :hiv_in
# @return [Hash{Integer => Array}] position => [String, Array<String>];
#   presumably the reference residue followed by the listed substitutions
#   at that position -- confirm with callers. A new Hash is built per call.
# @raise [RuntimeError] when +options+ is not one of the supported symbols
def sdrm_hash(options)
  case options
  when :hcv_ns5a
    {
      28 => ['M', ['T']],
      30 => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
      31 => ['L', ['M', 'V', 'F']],
      32 => ['P', ['L']],
      44 => ['K', ['R']],
      58 => ['H', ['D', 'P', 'S']],
      64 => ['T', ['A', 'S']],
      77 => ['P', ['A', 'S']],
      78 => ['R', ['K']],
      79 => ['T', ['A']],
      83 => ['T', ['M']],
      85 => ['S', ['N', 'H', 'Y']],
      92 => ['A', ['P', 'T', 'K', 'E']],
      93 => ['Y', ['C', 'F', 'H', 'N']],
      107 => ['K', ['T', 'S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }
  when :nrti
    {
      41 => ['M', ['L']],
      65 => ['K', ['R']],
      67 => ['D', ['N', 'G', 'E']],
      69 => ['T', ['D']],
      70 => ['K', ['R', 'E']],
      74 => ['L', ['V', 'I']],
      75 => ['V', ['M', 'T', 'A', 'S']],
      77 => ['F', ['L']],
      115 => ['Y', ['F']],
      116 => ['F', ['Y']],
      151 => ['Q', ['M']],
      184 => ['M', ['V', 'I']],
      210 => ['L', ['W']],
      215 => ['T', ['Y', 'F', 'I', 'C', 'D', 'V', 'E']],
      219 => ['K', ['Q', 'E', 'N', 'R']]
    }
  when :nnrti
    {
      100 => ['L', ['I']],
      101 => ['K', ['E', 'P']],
      103 => ['K', ['N', 'S']],
      106 => ['V', ['M', 'A']],
      179 => ['V', ['F', 'D']],
      181 => ['Y', ['C', 'I', 'V']],
      188 => ['Y', ['L', 'H', 'C']],
      190 => ['G', ['A', 'S', 'E']],
      225 => ['P', ['H']],
      230 => ['M', ['L']]
    }
  when :hiv_pr
    {
      23 => ['L', ['I']],
      24 => ['L', ['I']],
      30 => ['D', ['N']],
      32 => ['V', ['I']],
      46 => ['M', ['I', 'L']],
      47 => ['I', ['V', 'A']],
      48 => ['G', ['V', 'M']],
      50 => ['I', ['V', 'L']],
      53 => ['F', ['L']],
      54 => ['I', ['V', 'L', 'M', 'T', 'A', 'S']],
      73 => ['G', ['S', 'T', 'C', 'A']],
      76 => ['L', ['V']],
      82 => ['V', ['A', 'T', 'S', 'F', 'L', 'C', 'M']],
      83 => ['N', ['D']],
      84 => ['I', ['V', 'A', 'C']],
      88 => ['N', ['D', 'S']],
      90 => ['L', ['M']]
    }
  when :hiv_in
    {
      66 => ['T', ['A', 'I', 'K']],
      74 => ['L', ['M']],
      92 => ['E', ['Q']],
      95 => ['Q', ['K']],
      97 => ['T', ['A']],
      121 => ['F', ['Y']],
      140 => ['G', ['A', 'S', 'C']],
      143 => ['Y', ['C', 'H', 'R']],
      147 => ['S', ['G']],
      148 => ['Q', ['H', 'K', 'R']],
      155 => ['N', ['S', 'H']]
    }
  else
    raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
  end
end
|
390
|
-
|
391
|
-
|
392
|
-
end
|
614
|
+
end # end of ViralSeq::Sequence
|
615
|
+
end # end of ViralSeq
|