viral_seq 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,54 +0,0 @@
1
- # viral_seq/hcv_dr
2
- # HCV resistant mutation interpretation
3
- # ViralSeq::hcv_ns5a
4
-
5
- # ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
6
- # # amino_acid_sequence_array is Array object of the amino acid sequence.
7
- # # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
8
- # # start_aa_position is the starting aa number of the input sequence as Integer
9
-
10
module ViralSeq
  # Report HCV NS5A resistance-associated substitutions present in an
  # amino acid array.
  #
  # @param aa_array [Array<String>] one entry per residue; a mixture is written
  #   as residues joined by "/" (for example "M/T")
  # @param start_aa [Integer] NS5A position of the first entry of aa_array
  # @return [Hash{Integer => Array<String>}] position => [wild_type, observed]
  #   for every listed position whose observed residue (or any residue of a
  #   mixture) is one of the listed substitutions
  def self.hcv_ns5a(aa_array,start_aa=1)
    # position => [wild-type residue, substitutions to report]
    sdrm = {
      28  => ['M', ['T']],
      30  => ['L', ['H','K','R','Q','A','S','D']],
      31  => ['L', ['M','V','F']],
      32  => ['P', ['L']],
      44  => ['K', ['R']],
      58  => ['H', ['D','P','S']],
      64  => ['T', ['A','S']],
      77  => ['P', ['A','S']],
      78  => ['R', ['K']],
      79  => ['T', ['A']],
      83  => ['T', ['M']],
      85  => ['S', ['N','H','Y']],
      92  => ['A', ['P','T','K','E']],
      93  => ['Y', ['C','F','H','N']],
      107 => ['K', ['T','S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }

    out_hash = {}
    aa_array.each_with_index do |test_aa, index|
      position = start_aa + index
      next unless sdrm.key?(position)
      next if test_aa.nil?

      wt_aa, substitutions = sdrm[position]
      # Splitting on "/" yields the single residue for a plain call and every
      # component for a mixture. The intersection must be checked with #any?
      # because an empty Array is still truthy in Ruby.
      observed = test_aa.split("/")
      out_hash[position] = [wt_aa, test_aa] if (observed & substitutions).any?
    end
    out_hash
  end
end
@@ -1,299 +0,0 @@
1
- # viral_seq/locator.rb
2
-
3
- # Including following methods:
4
- # ViralSeq::sequence_locator
5
- # ViralSeq::sequence_clip
6
- # ViralSeq::qc_hiv_seq_check
7
-
8
- # HIV sequence locator function
9
- # resembling HIV Sequence Locator from LANL
10
- # https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
11
- # require MUSCLE (http://www.drive5.com/muscle) installed
12
- # current version only supports nucleotide sequence, not for amino acid sequence.
13
-
14
- # =USAGE1
15
- # # Find the location of a sequence
16
- # ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
17
- # # input_sequence: String of nucleotide sequence
18
- # # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
19
- # # path_to_muscle: path to the muscle executable.
20
- # # Default as :false, will call MuscleBio to run Muscle
21
- # # specify path_to_muscle if other source of muscle needed
22
- # # function returns an array of
23
- # # start_location (Integer)
24
- # # end_location (Integer)
25
- # # percentage_of_similarity_to_reference_sequence (Float)
26
- # # containing_indel? (Boolean)
27
- # # aligned_input_sequence (String)
28
- # # aligned_reference_sequence (String)
29
- # # example code
30
- # sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
31
- # p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
32
- # => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
33
-
34
- # =USAGE2
35
- # ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
36
- # # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
37
- # # return nil if the input sequence is not in the range
38
- # # input_sequence: String of nucleotide sequence
39
- # # start_position and end_position: Integer of the start and end reference number of the sub-sequence
40
- # # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
41
- # # path_to_muscle: path to the muscle executable.
42
- # # Default as :false, will call MuscleBio to run Muscle
43
- # # specify path_to_muscle if other source of muscle needed
44
- # # example code
45
- # seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
46
- # p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
47
- # => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
48
-
49
- # =USAGE3
50
- # ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
51
- # # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
52
- # # and a boolean value for allowing indels,
53
- # # path_to_muscle: path to the muscle executable.
54
- # # Default as :false, will call MuscleBio to run Muscle
55
- # # specify path_to_muscle if other source of muscle needed
56
- # # return a sequence sub-hash that meets the criteria
57
- # # example code
58
- # sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
59
- # filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
60
- # puts sequence_hash.size
61
- # => 6
62
- # puts filtered_sequence_hash.size
63
- # => 4
64
-
65
- module ViralSeq
66
-
67
# Locate a query sequence on a reference genome by aligning it repeatedly
# with ViralSeq.muscle_align while narrowing the reference window.
#
# seq            - query sequence; its #size is compared with the aligned
#                  reference window throughout.
# ref_option     - passed to ViralSeq.check_ref to obtain the reference.
# path_to_muscle - forwarded unchanged to every ViralSeq.muscle_align call.
#
# Returns [loc_p1, loc_p2, similarity, indel, aln_test, ref] where
#   loc_p1, loc_p2 - 1-based start and end of the window on the reference
#   similarity     - percentage of identical alignment columns, one decimal
#   indel          - true when the window length differs from seq.size or the
#                    aligned query contains "-"
#   aln_test, ref  - aligned query and aligned reference of the last alignment
# Returns [0,0,0,0,0,0,0] (seven zeros) when the computed window is empty.
# Returns nil, after printing diagnostics with puts, when a StandardError is
# raised inside the begin block.
# NOTE(review): ViralSeq.check_ref is called before the begin block, so an
# error raised there is not rescued here.
def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)

  # ViralSeq.check_muscle(path_to_muscle)
  ori_ref = ViralSeq.check_ref(ref_option)

  begin
    ori_ref_l = ori_ref.size
    # l1 / l2 accumulate how much has been trimmed from the left / right end
    # of the original reference.
    l1 = 0
    l2 = 0

    # First pass: align against the whole reference.
    # aln_seq[0] is used as the aligned reference, aln_seq[1] as the aligned query.
    aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
    aln_test = aln_seq[1]
    # $1 / $3 capture the leading / trailing gap runs of the aligned query,
    # $2 the part between its first and last word character.
    aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
    gap_begin = $1.size
    gap_end = $3.size
    aln_test2 = $2
    ref = aln_seq[0]
    ref = ref[gap_begin..(-gap_end-1)]
    ref_size = ref.size
    # The covered reference is more than 1.3x the query length: re-centre the
    # window around the longest uninterrupted ACGT run of the aligned query.
    if ref_size > 1.3*(seq.size)
      l1 = l1 + gap_begin
      l2 = l2 + gap_end
      max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
      aln_test2 =~ /#{max_seq}/
      # $` and $' are the text before and after the match of max_seq.
      before_aln_seq = $`
      before_aln = $`.size
      post_aln_seq = $'
      post_aln = $'.size
      # b1 / b2: 1.3x the number of ACGT bases on each side of the run; this
      # is how much reference is kept on that side.
      before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
      b1 = (1.3 * before_aln_seq_size).to_i
      post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
      b2 = (1.3 * post_aln_seq_size).to_i
      # Trim only the side(s) whose flank is longer than the query itself.
      if (before_aln > seq.size) and (post_aln <= seq.size)
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
        l1 = l1 + (before_aln - b1)
      elsif (post_aln > seq.size) and (before_aln <= seq.size)
        ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
        l2 = l2 + post_aln - b2
      elsif (post_aln > seq.size) and (before_aln > seq.size)
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
        l1 = l1 + (before_aln - b1)
        l2 = l2 + (post_aln - b2)
      end

      # Re-align against the narrowed window.
      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end-1)]
    end

    aln_test = aln_seq[1]
    aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
    gap_begin = $1.size
    gap_end = $3.size
    aln_test = $2
    # s1 / g1: length of the first run of word characters in the aligned
    # query and of the gap run directly after it.
    aln_test =~ /^(\w+)(\-*)\w/
    s1 = $1.size
    g1 = $2.size
    # s2 / g2: length of the last run of word characters and of the gap run
    # directly before it.
    aln_test =~ /\w(\-*)(\w+)$/
    s2 = $2.size
    g2 = $1.size

    l1 = l1 + gap_begin
    l2 = l2 + gap_end
    # repeat is set to 1 whenever the window is trimmed and must be re-aligned.
    repeat = 0

    # Equal gap sizes plus s1 + g1 + s2 spanning the whole window means the
    # query is two runs around a single gap; drop the shorter run's side when
    # the gap is more than twice that run's length.
    if g1 == g2 and (s1 + g1 + s2) == ref.size
      if s1 > s2 and g2 > 2*s2
        ref = ref[0..(-g2-1)]
        repeat = 1
        l2 = l2 + g2
      elsif s1 < s2 and g1 > 2*s1
        ref = ref[g1..-1]
        repeat = 1
        l1 = l1 + g1
      end
    else
      # Otherwise test each end independently with the same 2x rule.
      if g1 > 2*s1
        ref = ref[g1..-1]
        repeat = 1
        l1 = l1 + g1
      end
      if g2 > 2*s2
        ref = ref[0..(-g2 - 1)]
        repeat = 1
        l2 = l2 + g2
      end
    end

    # Keep re-aligning and trimming until neither end triggers the 2x rule.
    while repeat == 1
      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      aln_test = $2
      aln_test =~ /^(\w+)(\-*)\w/
      s1 = $1.size
      g1 = $2.size
      aln_test =~ /\w(\-*)(\w+)$/
      s2 = $2.size
      g2 = $1.size
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end-1)]
      l1 = l1 + gap_begin
      l2 = l2 + gap_end
      repeat = 0
      if g1 > 2*s1
        ref = ref[g1..-1]
        repeat = 1
        l1 = l1 + g1
      end
      if g2 > 2*s2
        ref = ref[0..(-g2 - 1)]
        repeat = 1
        l2 = l2 + g2
      end
    end
    # Rebuild the window from the untouched reference using the accumulated
    # trims, then align once more.
    ref = ori_ref[l1..(ori_ref_l - l2 - 1)]


    aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
    aln_test = aln_seq[1]
    ref = aln_seq[0]

    #refine alignment

    # Gaps at an end of the aligned reference adjust that end's trim.
    # NOTE(review): leading gaps subtract from l1 while trailing gaps add to
    # l2, and only one of the two branches can run - confirm both are intended.
    if ref =~ /^(\-+)/
      l1 = l1 - $1.size
    elsif ref =~ /(\-+)$/
      l2 = l2 + $1.size
    end

    # Proceed only when the window still contains at least one base.
    if (ori_ref_l - l2 - 1) >= l1
      ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      ref = aln_seq[0]

      # Percent identity over all columns of the final alignment.
      ref_size = ref.size
      sim_count = 0
      (0..(ref_size-1)).each do |n|
        ref_base = ref[n]
        test_base = aln_test[n]
        sim_count += 1 if ref_base == test_base
      end
      similarity = (sim_count/ref_size.to_f*100).round(1)

      # Convert the 0-based trims into 1-based inclusive coordinates.
      loc_p1 = l1 + 1
      loc_p2 = ori_ref_l - l2
      if seq.size != (loc_p2 - loc_p1 + 1)
        indel = true
      elsif aln_test.include?("-")
        indel = true
      else
        indel = false
      end
      return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
    else
      # NOTE(review): seven elements here versus six on the success path.
      return [0,0,0,0,0,0,0]
    end
  rescue => e
    # Any StandardError from the steps above (for example a failed regex
    # match leaving $1 nil) is reported on stdout and turned into nil.
    puts "Unexpected error occured."
    puts "Exception Class: #{ e.class.name }"
    puts "Exception Message: #{ e.message }"
    puts "Exception Backtrace: #{ e.backtrace[0] }"
    puts "ViralSeq.sequence_locator returns nil"
    return nil
  end
end
240
-
241
- # sequence clip function
242
# Return the part of +seq+ that covers reference positions p1..p2.
#
# seq            - query sequence, located with ViralSeq.sequence_locator.
# p1, p2         - start and end reference positions of the wanted sub-sequence.
# ref_option     - forwarded to ViralSeq.sequence_locator.
# path_to_muscle - forwarded to ViralSeq.sequence_locator.
#
# Returns the clipped sequence with alignment gaps ("-") removed, or nil when
# the sequence could not be located or does not span p1..p2.
def self.sequence_clip(seq='', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
  # sequence_locator returns nil after a rescued error and an all-zero array
  # when no window was found; neither carries the aligned strings needed below.
  return nil unless loc && loc[4].is_a?(String) && loc[5].is_a?(String)

  l1 = loc[0]
  l2 = loc[1]
  return nil unless p1 >= l1 && p2 <= l2

  aligned_seq = loc[4]
  aligned_ref = loc[5]

  # Walk the aligned reference from the left until reference position p1 is
  # reached; gap columns advance the alignment index but not the position.
  g1 = 0
  aligned_ref.each_char do |char|
    break if l1 == p1
    g1 += 1
    l1 += 1 unless char == "-"
  end

  # Same walk from the right towards p2. g2 starts at 1 because it is used
  # as a negative (from-the-end) index.
  g2 = 1
  aligned_ref.reverse.each_char do |char|
    break if l2 == p2
    g2 += 1
    l2 -= 1 unless char == "-"
  end

  aligned_seq[g1..(-g2)].tr("-", "")
end
266
-
267
- # batch quality check of HIV sequences based on ViralSeq.sequence_locator
268
- # input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
269
- # and allow the sequence to contain indels
270
- # return a hash of filtered sequences
271
-
272
# Filter a sequence hash down to the entries that ViralSeq.sequence_locator
# places at the requested reference coordinates.
#
# seq_hash       - Hash of sequence name => sequence. It is not modified.
# start_nt       - accepted start position(s): an Integer, or any object
#                  responding to #include? (Range, Array).
# end_nt         - accepted end position(s), same forms as start_nt.
# indel          - when false, sequences flagged as containing indels
#                  (element 3 of the locator result) are rejected.
# ref_option     - forwarded to ViralSeq.sequence_locator.
# path_to_muscle - forwarded to ViralSeq.sequence_locator.
#
# Returns a new Hash of name => sequence holding the passing entries, grouped
# by distinct sequence in order of first appearance.
def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Group names by sequence so each distinct sequence is located only once.
  names_by_seq = {}
  seq_hash.each do |seq_name, seq|
    (names_by_seq[seq] ||= []) << seq_name
  end

  seq_pass = {}
  names_by_seq.each do |seq, seq_names|
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    # nil means the locator rescued an error; treat it as a failed check.
    next if loc.nil?
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
    next unless indel || loc[3] == false

    # Copy into a fresh hash; the caller's seq_hash is left untouched.
    seq_names.each { |seq_name| seq_pass[seq_name] = seq_hash[seq_name] }
  end
  seq_pass
end
298
-
299
- end
@@ -1,103 +0,0 @@
1
- # viral_seq/misc.rb
2
-
3
- # miscellaneous methods
4
- # including
5
- # Hash#copyhash
6
- # Hash#difference
7
- # Hash#uniq_hash
8
- # ViralSeq::tail
9
-
10
class Hash

  # Hash#copyhash
  # Build a new Hash holding the same key/value pairs (a shallow copy).
  # Unlike plain assignment the result is a different object:
  #   h1 = {1=>'a'}
  #   h2 = h1
  #   h3 = h1.copyhash
  #   h1.object_id == h2.object_id  # => true
  #   h1.object_id == h3.object_id  # => false
  def copyhash
    each_with_object({}) do |(key, value), copy|
      copy.store(key, value)
    end
  end

  # Hash#difference
  # Return a new Hash without the pairs whose key also exists in +other+.
  # Only keys are compared; values are ignored.
  #   h1 = {"Cat" => 100, "Dog" => 5, "Bird" => 2, "Snake" => 10}
  #   h2 = {"Cat" => 100, "Dog" => 5, "Bison" => 30}
  #   h1.difference(h2)  # => {"Bird" => 2, "Snake" => 10}
  def difference(other)
    reject { |key, _value| other.key?(key) }
  end

  # Hash#uniq_hash
  # Invert the hash: each distinct value becomes a key that maps to the Array
  # of original keys carrying that value. Values appear in order of first
  # occurrence and keys keep their original order.
  #   hash = {1=>"A", 2=>"A", 3=>"C", 4=>"C", 5=>"T"}
  #   hash.uniq_hash  # => {"A"=>[1, 2], "C"=>[3, 4], "T"=>[5]}
  def uniq_hash
    # Single pass; the receiver is not rescanned for every distinct value.
    each_with_object({}) do |(key, value), out_hash|
      (out_hash[value] ||= []) << key
    end
  end
end
69
-
70
- # Tail function for file as 'tail' in bash.
71
# Reopening the module (rather than `def ViralSeq.tail`) keeps this file
# loadable even when ViralSeq has not been defined yet.
module ViralSeq
  # Return the last +n+ lines of a file, like `tail -n` in a shell.
  #
  # path - path of the file to read.
  # n    - number of trailing lines wanted.
  #
  # The file is scanned backwards in 512-byte chunks counting "\n" bytes, so
  # large files are not read in full. Returns the text after the (n+1)-th
  # newline from the end, or the whole file when it has fewer newlines.
  def self.tail(path, n)
    # Block form guarantees the handle is closed, also when reading raises.
    File.open(path, "r") do |file|
      buffer_s = 512
      line_count = 0
      file.seek(0, IO::SEEK_END)

      offset = file.pos # we start at the end
      found = false

      while !found && offset > 0
        to_read = [offset, buffer_s].min

        file.seek(offset - to_read)
        data = file.read(to_read)

        # read(length) yields a binary string, so each_char steps byte by byte
        # and offset stays a valid byte position.
        data.reverse.each_char do |c|
          if c == "\n"
            line_count += 1
            if line_count > n
              # Stop with offset pointing just past this newline, regardless
              # of where the newline sits inside the chunk.
              found = true
              break
            end
          end
          offset -= 1
        end
      end

      file.seek(offset)
      file.read
    end
  end
end