viral_seq 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,54 +0,0 @@
1
- # viral_seq/hcv_dr
2
- # HCV resistant mutation interpretation
3
- # ViralSeq::hcv_ns5a
4
-
5
- # ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
6
- # # amino_acid_sequence_array is Array object of the amino acid sequence.
7
- # # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
8
- # # start_aa_position is the starting aa number of the input sequence as Integer
9
-
10
module ViralSeq
  # Lookup table for HCV NS5A resistance interpretation:
  # amino acid position => [wild-type amino acid, [mutations that are reported]].
  HCV_NS5A_SDRM = {
    28  => ['M', ['T']],
    30  => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
    31  => ['L', ['M', 'V', 'F']],
    32  => ['P', ['L']],
    44  => ['K', ['R']],
    58  => ['H', ['D', 'P', 'S']],
    64  => ['T', ['A', 'S']],
    77  => ['P', ['A', 'S']],
    78  => ['R', ['K']],
    79  => ['T', ['A']],
    83  => ['T', ['M']],
    85  => ['S', ['N', 'H', 'Y']],
    92  => ['A', ['P', 'T', 'K', 'E']],
    93  => ['Y', ['C', 'F', 'H', 'N']],
    107 => ['K', ['T', 'S']],
    121 => ['I', ['V']],
    135 => ['T', ['A']]
  }.freeze
  private_constant :HCV_NS5A_SDRM

  # Reports the listed NS5A mutations found in an amino acid sequence.
  #
  # aa_array - Array of amino acid Strings, one element per position. An element
  #            is either a single letter ("T") or a "/"-separated mixture ("M/T").
  # start_aa - Integer position number of the first element of aa_array (default 1).
  #
  # Returns a Hash of position => [wild_type_aa, observed_aa] for every position
  # of the table where the observed amino acid (or at least one member of a
  # mixture) is a listed mutation. Positions outside the table, wild-type
  # residues and unlisted substitutions are not reported.
  def self.hcv_ns5a(aa_array, start_aa = 1)
    out_hash = {}
    aa_array.each_with_index do |test_aa, index|
      position = start_aa + index
      wt_aa, mutations = HCV_NS5A_SDRM[position]
      next unless wt_aa

      # "M/T".split("/") => ["M", "T"]; a single letter yields a one-element
      # array, so singles and mixtures share one code path.
      observed = test_aa.to_s.split("/")
      # Array#& returns an Array, which is truthy even when empty, so the
      # intersection has to be tested with any? to avoid reporting every mixture.
      out_hash[position] = [wt_aa, test_aa] if (observed & mutations).any?
    end
    out_hash
  end
end
@@ -1,299 +0,0 @@
1
- # viral_seq/locator.rb
2
-
3
- # Including following methods:
4
- # ViralSeq::sequence_locator
5
- # ViralSeq::sequence_clip
6
- # ViralSeq::qc_hiv_seq_check
7
-
8
- # HIV sequence locator function
9
- # resembling HIV Sequence Locator from LANL
10
- # https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
11
- # requires MUSCLE (http://www.drive5.com/muscle) to be installed
12
- # current version only supports nucleotide sequence, not for amino acid sequence.
13
-
14
- # =USAGE1
15
- # # Find the location of a sequence
16
- # ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
17
- # # input_sequence: String of nucleotide sequence
18
- # # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
19
- # # path_to_muscle: path to the muscle executable.
20
- # # Defaults to false, in which case MuscleBio is called to run MUSCLE
21
- # # specify path_to_muscle if other source of muscle needed
22
- # # function returns an array of
23
- # # start_location (Integer)
24
- # # end_location (Integer)
25
- # # percentage_of_similarity_to_reference_sequence (Float)
26
- # # containing_indel? (Boolean)
27
- # # aligned_input_sequence (String)
28
- # # aligned_reference_sequence (String)
29
- # # example code
30
- # sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
31
- # p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
32
- # => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
33
-
34
- # =USAGE2
35
- # ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
36
- # # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
37
- # # return nil if the input sequence is not in the range
38
- # # input_sequence: String of nucleotide sequence
39
- # # start_position and end_position: Integer of the start and end reference number of the sub-sequence
40
- # # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
41
- # # path_to_muscle: path to the muscle executable.
42
- # # Defaults to false, in which case MuscleBio is called to run MUSCLE
43
- # # specify path_to_muscle if other source of muscle needed
44
- # # example code
45
- # seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
46
- # p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
47
- # => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
48
-
49
- # =USAGE3
50
- # ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
51
- # # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
52
- # # and a boolean value for allowing indels,
53
- # # path_to_muscle: path to the muscle executable.
54
- # # Defaults to false, in which case MuscleBio is called to run MUSCLE
55
- # # specify path_to_muscle if other source of muscle needed
56
- # # return a sequence sub-hash that meets the criteria
57
- # # example code
58
- # sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
59
- # filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
60
- # puts sequence_hash.size
61
- # => 6
62
- # puts filtered_sequence_hash.size
63
- # => 4
64
-
65
- module ViralSeq
66
-
67
# Locates `seq` on a reference sequence by aligning it repeatedly with
# ViralSeq.muscle_align and narrowing the reference window between passes.
#
# seq            - the sequence to locate (its #size is used as the input length)
# ref_option     - handed to ViralSeq.check_ref to obtain the reference (default :HXB2)
# path_to_muscle - handed unchanged to every ViralSeq.muscle_align call (default false)
#
# Returns [loc_p1, loc_p2, similarity, indel, aln_test, ref] on success.
# Returns [0,0,0,0,0,0,0] (seven zeros) when the final reference window is empty.
# When a StandardError is raised inside the begin block, diagnostics are printed
# with puts and nil is returned.
# NOTE(review): muscle_align is assumed to return [aligned_reference, aligned_input],
# which is how its result is indexed below - confirm against its definition.
- def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)
68
-
69
- # ViralSeq.check_muscle(path_to_muscle)
70
- ori_ref = ViralSeq.check_ref(ref_option)
71
-
72
- begin
73
- ori_ref_l = ori_ref.size
# l1 / l2 accumulate how many positions are cut from the left / right end of
# ori_ref; the final window is ori_ref[l1..(ori_ref_l - l2 - 1)].
74
- l1 = 0
75
- l2 = 0
76
-
# First pass: align against the whole reference.
77
- aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
78
- aln_test = aln_seq[1]
# $1 = leading gaps, $2 = span from first to last word character, $3 = trailing gaps.
79
- aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
80
- gap_begin = $1.size
81
- gap_end = $3.size
82
- aln_test2 = $2
83
- ref = aln_seq[0]
# Keep only the reference columns that lie under the non-gap span of the input.
84
- ref = ref[gap_begin..(-gap_end-1)]
85
- ref_size = ref.size
# The covered reference span is more than 1.3x the input length: narrow the
# window around the longest uninterrupted ACGT run of the aligned input and
# align again.
86
- if ref_size > 1.3*(seq.size)
87
- l1 = l1 + gap_begin
88
- l2 = l2 + gap_end
89
- max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
# After this match, $` and $' hold the text before and after the longest run.
90
- aln_test2 =~ /#{max_seq}/
91
- before_aln_seq = $`
92
- before_aln = $`.size
93
- post_aln_seq = $'
94
- post_aln = $'.size
# b1 / b2: 1.3x the number of ACGT characters on each side of the longest run;
# used below as the amount of reference kept on that side.
95
- before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
96
- b1 = (1.3 * before_aln_seq_size).to_i
97
- post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
98
- b2 = (1.3 * post_aln_seq_size).to_i
# Trim whichever side (or both) extends further than one input length.
99
- if (before_aln > seq.size) and (post_aln <= seq.size)
100
- ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
101
- l1 = l1 + (before_aln - b1)
102
- elsif (post_aln > seq.size) and (before_aln <= seq.size)
103
- ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
104
- l2 = l2 + post_aln - b2
105
- elsif (post_aln > seq.size) and (before_aln > seq.size)
106
- ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
107
- l1 = l1 + (before_aln - b1)
108
- l2 = l2 + (post_aln - b2)
109
- end
110
-
111
- aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
112
- aln_test = aln_seq[1]
113
- aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
114
- gap_begin = $1.size
115
- gap_end = $3.size
116
- ref = aln_seq[0]
117
- ref = ref[gap_begin..(-gap_end-1)]
118
- end
119
-
# Measure the terminal fragments of the most recent alignment:
# s1 = length of the first run of word characters, g1 = length of the gap run
# that follows it; s2 / g2 are the same quantities measured from the end.
120
- aln_test = aln_seq[1]
121
- aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
122
- gap_begin = $1.size
123
- gap_end = $3.size
124
- aln_test = $2
125
- aln_test =~ /^(\w+)(\-*)\w/
126
- s1 = $1.size
127
- g1 = $2.size
128
- aln_test =~ /\w(\-*)(\w+)$/
129
- s2 = $2.size
130
- g2 = $1.size
131
-
132
- l1 = l1 + gap_begin
133
- l2 = l2 + gap_end
# repeat is set to 1 whenever the reference is trimmed, which triggers the
# re-alignment loop below.
134
- repeat = 0
135
-
# When the aligned input is exactly two fragments around a single gap, only the
# side with the shorter fragment is trimmed, and only if the gap is more than
# twice that fragment's length. Otherwise each end is checked independently.
136
- if g1 == g2 and (s1 + g1 + s2) == ref.size
137
- if s1 > s2 and g2 > 2*s2
138
- ref = ref[0..(-g2-1)]
139
- repeat = 1
140
- l2 = l2 + g2
141
- elsif s1 < s2 and g1 > 2*s1
142
- ref = ref[g1..-1]
143
- repeat = 1
144
- l1 = l1 + g1
145
- end
146
- else
147
- if g1 > 2*s1
148
- ref = ref[g1..-1]
149
- repeat = 1
150
- l1 = l1 + g1
151
- end
152
- if g2 > 2*s2
153
- ref = ref[0..(-g2 - 1)]
154
- repeat = 1
155
- l2 = l2 + g2
156
- end
157
- end
158
-
# Re-align against the trimmed reference until neither end shows a gap longer
# than twice its terminal fragment.
159
- while repeat == 1
160
- aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
161
- aln_test = aln_seq[1]
162
- aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
163
- gap_begin = $1.size
164
- gap_end = $3.size
165
- aln_test = $2
166
- aln_test =~ /^(\w+)(\-*)\w/
167
- s1 = $1.size
168
- g1 = $2.size
169
- aln_test =~ /\w(\-*)(\w+)$/
170
- s2 = $2.size
171
- g2 = $1.size
172
- ref = aln_seq[0]
173
- ref = ref[gap_begin..(-gap_end-1)]
174
- l1 = l1 + gap_begin
175
- l2 = l2 + gap_end
176
- repeat = 0
177
- if g1 > 2*s1
178
- ref = ref[g1..-1]
179
- repeat = 1
180
- l1 = l1 + g1
181
- end
182
- if g2 > 2*s2
183
- ref = ref[0..(-g2 - 1)]
184
- repeat = 1
185
- l2 = l2 + g2
186
- end
187
- end
# Rebuild the window from the untouched reference using the accumulated offsets
# and align once more.
188
- ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
189
-
190
-
191
- aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
192
- aln_test = aln_seq[1]
193
- ref = aln_seq[0]
194
-
195
- #refine alignment
196
-
# Leading gaps in the aligned reference move the left boundary outward (l1 shrinks);
# otherwise trailing gaps increase l2, which moves the right boundary inward.
# NOTE(review): the two cases are not symmetric (left widens, right narrows) -
# confirm that `l2 + $1.size` is intended rather than `l2 - $1.size`.
197
- if ref =~ /^(\-+)/
198
- l1 = l1 - $1.size
199
- elsif ref =~ /(\-+)$/
200
- l2 = l2 + $1.size
201
- end
202
-
# Only continue when the window [l1, ori_ref_l - l2 - 1] is not empty.
203
- if (ori_ref_l - l2 - 1) >= l1
204
- ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
205
- aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
206
- aln_test = aln_seq[1]
207
- ref = aln_seq[0]
208
-
# Similarity: percentage of aligned-reference columns whose character equals the
# character of the aligned input at the same index, rounded to one decimal.
209
- ref_size = ref.size
210
- sim_count = 0
211
- (0..(ref_size-1)).each do |n|
212
- ref_base = ref[n]
213
- test_base = aln_test[n]
214
- sim_count += 1 if ref_base == test_base
215
- end
216
- similarity = (sim_count/ref_size.to_f*100).round(1)
217
-
# 1-based, inclusive coordinates of the window on the original reference.
218
- loc_p1 = l1 + 1
219
- loc_p2 = ori_ref_l - l2
# An indel is flagged when the input length differs from the located span or
# when the aligned input contains a gap character.
220
- if seq.size != (loc_p2 - loc_p1 + 1)
221
- indel = true
222
- elsif aln_test.include?("-")
223
- indel = true
224
- else
225
- indel = false
226
- end
227
- return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
228
- else
# Empty window: sentinel of seven zeros (one element more than the success array).
229
- return [0,0,0,0,0,0,0]
230
- end
# Any StandardError raised above is reported on stdout and turned into a nil return.
231
- rescue => e
232
- puts "Unexpected error occured."
233
- puts "Exception Class: #{ e.class.name }"
234
- puts "Exception Message: #{ e.message }"
235
- puts "Exception Backtrace: #{ e.backtrace[0] }"
236
- puts "ViralSeq.sequence_locator returns nil"
237
- return nil
238
- end
239
- end
240
-
241
- # sequence clip function
242
# Returns the part of `seq` that corresponds to reference positions p1..p2.
#
# seq            - sequence to clip; it is first located with ViralSeq.sequence_locator
# p1, p2         - Integer start and end positions on the reference (inclusive)
# ref_option     - passed through to ViralSeq.sequence_locator (default :HXB2)
# path_to_muscle - passed through to ViralSeq.sequence_locator (default false)
#
# Returns the clipped sub-sequence as a String with alignment gaps ("-") removed.
# Returns nil when p1..p2 is not contained in the located range, or when
# sequence_locator could not produce an alignment (it returned nil or its
# all-zero result).
def self.sequence_clip(seq='', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
  # sequence_locator returns nil after rescuing an error; indexing nil would raise.
  return nil if loc.nil?

  l1 = loc[0]
  l2 = loc[1]
  return nil unless p1 >= l1 && p2 <= l2

  aligned_seq = loc[4]
  aligned_ref = loc[5]
  # The locator's all-zero result carries Integers instead of alignment Strings.
  return nil unless aligned_seq.is_a?(String) && aligned_ref.is_a?(String)

  # Walk the aligned reference from the left until reference position p1 is
  # reached; g1 counts alignment columns, gap columns do not advance l1.
  g1 = 0
  aligned_ref.each_char do |char|
    break if l1 == p1
    g1 += 1
    l1 += 1 unless char == "-"
  end

  # Same walk from the right for p2; g2 starts at 1 so that -g2 is a valid
  # negative end index (-1 is the last column).
  g2 = 1
  aligned_ref.reverse.each_char do |char|
    break if l2 == p2
    g2 += 1
    l2 -= 1 unless char == "-"
  end

  aligned_seq[g1..(-g2)].tr("-", "")
end
266
-
267
- # batch quality check of HIV sequences based on ViralSeq.sequence_locator
268
- # input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
269
- # and allow the sequence to contain indels
270
- # return a hash of filtered sequences
271
-
272
# Filters a sequence hash by where each sequence is located on the reference.
#
# seq_hash       - Hash of sequence name => sequence; it is NOT modified
# start_nt       - accepted start position(s): an Integer, or any object that
#                  responds to include? (e.g. Range or Array)
# end_nt         - accepted end position(s), same forms as start_nt
# indel          - when truthy, sequences with indels are accepted; when falsy,
#                  only sequences whose locator result reports indel == false pass
# ref_option     - passed through to ViralSeq.sequence_locator (default :HXB2)
# path_to_muscle - passed through to ViralSeq.sequence_locator (default false)
#
# Each distinct sequence is located once. Sequences for which the locator
# returns nil are treated as not passing.
#
# Returns a new Hash of name => sequence containing the entries that pass,
# grouped by sequence in order of each sequence's first appearance in seq_hash.
def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Group names by sequence so every distinct sequence is aligned only once and
  # the caller's hash never has to be edited while collecting the result.
  names_by_seq = seq_hash.each_with_object({}) do |(seq_name, seq), grouped|
    (grouped[seq] ||= []) << seq_name
  end

  seq_pass = {}
  names_by_seq.each do |seq, seq_names|
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    next if loc.nil? # locator failed for this sequence
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
    next unless indel || loc[3] == false

    seq_names.each { |seq_name| seq_pass[seq_name] = seq }
  end
  seq_pass
end
298
-
299
- end
@@ -1,103 +0,0 @@
1
- # viral_seq/misc.rb
2
-
3
- # miscellaneous methods
4
- # including
5
- # Hash#copyhash
6
- # Hash#difference
7
- # Hash#uniq_hash
8
- # ViralSeq::tail
9
-
10
# Convenience extensions to the core Hash class.
# NOTE(review): these methods are added to every Hash in the process; they are
# kept here because the names are part of this file's public interface.
class Hash
  # Returns a new plain Hash holding the same key/value pairs as the receiver.
  # The copy is shallow: keys and values are the same objects, only the
  # container differs.
  #
  #   h1 = {1=>'a'}
  #   h3 = h1.copyhash
  #   h1.object_id == h3.object_id  # => false
  def copyhash
    each_with_object({}) { |(key, value), copy| copy[key] = value }
  end

  # Returns a new Hash with every entry of the receiver whose key is not a key
  # of `other`. Values are not compared.
  #
  #   h1 = {"Cat" => 100, "Dog" => 5, "Bird" => 2, "Snake" => 10}
  #   h2 = {"Cat" => 100, "Dog" => 5, "Bison" => 30}
  #   h1.difference(h2)  # => {"Bird" => 2, "Snake" => 10}
  def difference(other)
    reject { |key, _value| other.key?(key) }
  end

  # Inverts the receiver into value => [keys having that value].
  # Values are grouped in a single pass using Hash key equality; result keys
  # appear in order of first occurrence and each key list keeps the receiver's
  # iteration order.
  #
  #   hash = {1=>"A", 2=>"A", 3=>"C", 4=>"C", 5=>"T"}
  #   hash.uniq_hash  # => {"A"=>[1, 2], "C"=>[3, 4], "T"=>[5]}
  def uniq_hash
    each_with_object({}) do |(key, value), grouped|
      (grouped[value] ||= []) << key
    end
  end
end
69
-
70
- # Tail function for file as 'tail' in bash.
71
module ViralSeq
  # Returns the last `n` lines of the file at `path` as one String, like
  # `tail -n` in a shell.
  #
  # path - path of the file to read
  # n    - Integer number of lines wanted
  #
  # The file is scanned backwards in 512-byte chunks, so only the end of a large
  # file is read while searching. A newline that terminates the file belongs to
  # the last line and does not start a new one. When the file has fewer than n
  # lines the whole content is returned; n <= 0 or an empty file gives "".
  # The file is opened with the block form of File.open so the handle is always
  # closed. Errors from File.open (e.g. Errno::ENOENT) propagate to the caller.
  def self.tail(path, n)
    File.open(path, "r") do |file|
      buffer_s = 512
      file.seek(0, IO::SEEK_END)
      offset = file.pos # scanning starts at the end of the file
      return "" if offset.zero? || n <= 0

      # A terminating newline closes the last line, so one additional newline
      # has to be found before n complete lines have been passed.
      file.seek(offset - 1)
      target = file.read(1) == "\n" ? n + 1 : n

      seen = 0
      start = nil # byte offset of the first character to return
      while start.nil? && offset > 0
        to_read = [buffer_s, offset].min
        offset -= to_read
        file.seek(offset)
        chunk = file.read(to_read) || ""

        (chunk.bytesize - 1).downto(0) do |i|
          next unless chunk.getbyte(i) == 10 # "\n"
          seen += 1
          next unless seen == target
          # Output begins right after the deciding newline, even when that
          # newline is the first byte of the chunk.
          start = offset + i + 1
          break
        end
      end

      file.seek(start || 0)
      file.read
    end
  end
end