viral_seq 0.3.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
data/lib/viral_seq/tcs_core.rb
DELETED
@@ -1,556 +0,0 @@
|
|
1
|
-
# viral_seq/tcs_core
|
2
|
-
# core functions for TCS and DR pipeline
|
3
|
-
# functions to manipulate sequences including:
|
4
|
-
# ViralSeq::calculate_pid_cut_off
|
5
|
-
# ViralSeq::consensus
|
6
|
-
# ViralSeq::generate_primer_id_pool
|
7
|
-
# ViralSeq::similar_pid?
|
8
|
-
# ViralSeq::filter_similar_pid
|
9
|
-
# ViralSeq::collapse_sequence_by_x_nt_difference
|
10
|
-
# ViralSeq::compare_two_seq
|
11
|
-
# ViralSeq::gap_strip
|
12
|
-
# ViralSeq::gap_strip_ends
|
13
|
-
# ViralSeq::paired_join1
|
14
|
-
# ViralSeq::paired_join2
|
15
|
-
|
16
|
-
# ViralSeq.calculate_pid_cut_off(PID_abundance, estimated_error_rate)
|
17
|
-
# # A function to calculate the cut-off for offspring primer IDs.
|
18
|
-
# # see reference at Zhou et al. JVI 2016.
|
19
|
-
# # https://www.ncbi.nlm.nih.gov/pubmed/26041299
|
20
|
-
# # PID_abundance is the abundance of a certain PID
|
21
|
-
# # estimated_error_rate is the estimated platform error rate, 0.02 (2%) as default
|
22
|
-
# # the model supports error rate from 0.003 to 0.03.
|
23
|
-
# # return an abundance cut-off (Integer) for offspring Primer IDs.
|
24
|
-
|
25
|
-
# ViralSeq.consensus(seq_array, majority_cutoff)
|
26
|
-
# # Generate a consensus sequence from a given sequence array.
|
27
|
-
# # where seq_array is an Array of input sequences (aligned) [seq1, seq2, seq3, ...]
|
28
|
-
# # majority_cutoff is a Float of majority cut-off. default as simply majority (0.5)
|
29
|
-
# =USAGE
|
30
|
-
# a_consensus_sequence = ViralSeq.consensus(seq_array, majority_cutoff)
|
31
|
-
|
32
|
-
# ViralSeq.generate_primer_id_pool(n)
|
33
|
-
# # generate all Primer ID combinations given the length of Primer ID
|
34
|
-
# # n is the length of the Primer ID (Integer). default value of n is 8.
|
35
|
-
# =USAGE
|
36
|
-
# primer_id_pool = ViralSeq.generate_primer_id_pool(10) # 10 is the length of Primer ID
|
37
|
-
# puts primer_id_pool.size #should be 4^10
|
38
|
-
# => 1048576
|
39
|
-
|
40
|
-
# ViralSeq.similar_pid?(pid1, pid2, base_difference)
|
41
|
-
# # compare two primer ID sequences.
|
42
|
-
# # If they differ by no more than base_difference bases, return boolean value "TRUE",
|
43
|
-
# # else, return boolean value "FALSE"
|
44
|
-
# # where pid1 and pid2 are two Primer IDs for comparison
|
45
|
-
# # base_difference is an Integer for difference bases that allowed
|
46
|
-
# =USAGE
|
47
|
-
# # example
|
48
|
-
# ViralSeq.similar_pid?("AAGGCTACGA", "AAGGATACGA", 1)
|
49
|
-
# => true
|
50
|
-
|
51
|
-
# ViralSeq.filter_similar_pid(sequence_fasta_file, cut_off)
|
52
|
-
# # compare PID with sequences which have identical sequences.
|
53
|
-
# # PIDs differ by 1 base will be recognized.
|
54
|
-
# # if PID1 is x times (cut-off) more abundant than PID2, PID2 will be discarded
|
55
|
-
# # where sequence_fasta_file is the sequence file in fasta format
|
56
|
-
# # each sequence tag starting with ">" and the Primer ID sequence
|
57
|
-
# # followed by the number of Primer ID appeared in the raw sequence
|
58
|
-
# # the information sections in the tags are separated by underscore "_"
|
59
|
-
# # example sequence tag: >AGGCGTAGA_32_sample1_RT
|
60
|
-
# # cut_off is the fold cut-off to remove the potential residual offspring Primer IDs
|
61
|
-
# # default value for cut_off is 10
|
62
|
-
# # return a new sequence hash. {sequence_name => sequence, ...}
|
63
|
-
|
64
|
-
# ViralSeq.collapse_sequence_by_x_nt_difference(sequence_array, cutoff)
|
65
|
-
# # collapse sequences with x number of nt differences.
|
66
|
-
# # input an Array object of sequences, make sure sequences are aligned.
|
67
|
-
# # return a Hash object of the collapsed sequences {sequence => frequency, ...}
|
68
|
-
# # The returned frequency is NOT the frequency of the collapsed sequences.
|
69
|
-
|
70
|
-
# ViralSeq.compare_two_seq(seq1, seq2)
|
71
|
-
# # compare two sequences as String object, return the number of differences as integer
|
72
|
-
# # sequences will NOT align
|
73
|
-
# # can use ViralSeq.muscle_align(seq1, seq2) to get the aligned sequences
|
74
|
-
# =USAGE
|
75
|
-
# # example
|
76
|
-
# seq1 = 'AAGGCGTAGGAC'
|
77
|
-
# seq2 = 'AAGCTTAGGACG'
|
78
|
-
# puts ViralSeq.compare_two_seq(seq1, seq2)
|
79
|
-
# => 8
|
80
|
-
# aligned_seqs = ViralSeq.muscle_align(seq1,seq2)
|
81
|
-
# puts ViralSeq.compare_two_seq(aligned_seqs.values[0], aligned_seqs.values[1])
|
82
|
-
# => 4
|
83
|
-
|
84
|
-
# ViralSeq.gap_strip(sequence_hash)
|
85
|
-
# # strip positions with gaps in the sequence alignment as Hash object {:name => sequence, ...}
|
86
|
-
# =USAGE
|
87
|
-
# # example
|
88
|
-
# sequence_hash = {'>seq1' => 'AACCGGTT',
|
89
|
-
# '>seq2' => 'A-CCGGTT',
|
90
|
-
# '>seq3' => 'AAC-GGTT',
|
91
|
-
# '>seq4' => 'AACCG-TT',
|
92
|
-
# '>seq5' => 'AACCGGT-'}
|
93
|
-
# ViralSeq.gap_strip(sequence_hash)
|
94
|
-
# => {">seq1"=>"ACGT", ">seq2"=>"ACGT", ">seq3"=>"ACGT", ">seq4"=>"ACGT", ">seq5"=>"ACGT"}
|
95
|
-
|
96
|
-
# ViralSeq.gap_strip_ends(sequence_hash)
|
97
|
-
# # similar to ViralSeq.gap_strip , but only strip the gaps at both ends of the alignment
|
98
|
-
# =USAGE
|
99
|
-
# # example
|
100
|
-
# sequence_hash = {'>seq1' => 'AACCGGTT',
|
101
|
-
# '>seq2' => 'A-CCGGTT',
|
102
|
-
# '>seq3' => 'AAC-GGTT',
|
103
|
-
# '>seq4' => 'AACCG-TT',
|
104
|
-
# '>seq5' => 'AACCGGT-'}
|
105
|
-
# ViralSeq.gap_strip_ends(sequence_hash)
|
106
|
-
# => {">seq1"=>"AACCGGT", ">seq2"=>"A-CCGGT", ">seq3"=>"AAC-GGT", ">seq4"=>"AACCG-T", ">seq5"=>"AACCGGT"}
|
107
|
-
|
108
|
-
# ViralSeq.paired_join1(sequence_pair_hash, overlap, difference_cut_off)
|
109
|
-
# # paired-end join function for a KNOWN overlap size
|
110
|
-
# # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
|
111
|
-
# # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
|
112
|
-
# # overlap is an integer that indicate how many bases are overlapped.
|
113
|
-
# # overlap value at 0 means no overlap. R1 and R2 will be simply put together.
|
114
|
-
# # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
|
115
|
-
# # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
|
116
|
-
# =USAGE
|
117
|
-
# # example
|
118
|
-
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
119
|
-
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
120
|
-
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
121
|
-
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
122
|
-
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
123
|
-
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
124
|
-
# ViralSeq.paired_join1(paired_seqs, 100, 0.0).keys
|
125
|
-
# => [">pair1"]
|
126
|
-
# ViralSeq.paired_join1(paired_seqs, 100, 0.01).keys
|
127
|
-
# => [">pair1", ">pair2"]
|
128
|
-
# ViralSeq.paired_join1(paired_seqs, 100, 0.02).keys
|
129
|
-
# => [">pair1", ">pair2", ">pair3"]
|
130
|
-
|
131
|
-
# ViralSeq.paired_join2(seq_pair_hash, model, diff)
|
132
|
-
# # paired-end join function for an UNKNOWN overlap size
|
133
|
-
# # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
|
134
|
-
# # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
|
135
|
-
# # model has two options, 1 or 2 as Integer
|
136
|
-
# # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
137
|
-
# # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
138
|
-
# # minimal overlap by model 2 set to 4 positions
|
139
|
-
# # if the overlap is smaller than 4 bases, the model treats the pair as having no overlap.
|
140
|
-
# # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
|
141
|
-
# # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
|
142
|
-
# =USAGE
|
143
|
-
# # example 1
|
144
|
-
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
145
|
-
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
146
|
-
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
147
|
-
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
148
|
-
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
149
|
-
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
150
|
-
# ViralSeq.paired_join2(paired_seqs, 1).keys
|
151
|
-
# => [">pair1"]
|
152
|
-
# ViralSeq.paired_join2(paired_seqs, 1, 0.01).keys
|
153
|
-
# => [">pair1", ">pair2"]
|
154
|
-
#
|
155
|
-
# # example 2
|
156
|
-
# paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
|
157
|
-
# ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
|
158
|
-
# ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"]
|
159
|
-
# }
|
160
|
-
# ViralSeq.paired_join2(paired_seq2, 1)
|
161
|
-
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
162
|
-
# ViralSeq.paired_join2(paired_seq2, 2)
|
163
|
-
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
164
|
-
|
165
|
-
|
166
|
-
module ViralSeq
|
167
|
-
|
168
|
-
# calculate cut-off for offspring primer IDs.
|
169
|
-
# Calculate the abundance cut-off for offspring Primer IDs
# (model described in Zhou et al., JVI 2016).
#
# @param m [Numeric] abundance of the Primer ID
# @param error_rate [Numeric] estimated platform error rate, accepted range 0..0.03
# @return [Integer] the cut-off; 2 whenever m <= 10 or the fitted value rounds below 3
# @raise [ArgumentError] if m > 10 and error_rate is outside 0..0.03
def self.calculate_pid_cut_off(m, error_rate = 0.02)
  # Low-abundance PIDs get the fixed cut-off before error_rate is even looked at.
  return 2 if m <= 10

  # Polynomial fits, one per error-rate band; coefficients are kept verbatim.
  n =
    case error_rate
    when 0...0.0075
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    when 0.0075...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    when 0.015..0.03
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        # above 8500 the model switches to a linear approximation
        0.0079 * m + 9.4869
      end
    else
      raise ArgumentError, 'Error_rate has to be between 0 and 0.03'
    end

  n = n.round
  n < 3 ? 2 : n
end
|
192
|
-
|
193
|
-
# create one consensus sequence from a sequence array with an optional majority cut-off for mixed bases.
|
194
|
-
# example:
|
195
|
-
# position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
|
196
|
-
# Build one consensus sequence from an array of (aligned) sequences.
#
# At every position, each base whose fraction among all sequences is at least
# +cutoff+ is collected, and the collected bases are turned into a single
# character by ViralSeq.call_consensus_base.
#
# @param seq_array [Array<String>] input sequences; the first one defines the length
# @param cutoff [Float] minimum fraction a base needs at a position to be called
# @return [String] the consensus sequence ("" for an empty seq_array)
def self.consensus(seq_array, cutoff = 0.5)
  # Nothing to summarise; previously this crashed on seq_array[0].size.
  return "" if seq_array.empty?

  total = seq_array.size.to_f
  called = (0...seq_array[0].size).map do |position|
    column = seq_array.map { |seq| seq[position] }
    base_count = ViralSeq.count(column)
    max_base_list = []
    base_count.each do |base, occurrences|
      max_base_list << base if occurrences / total >= cutoff
    end
    ViralSeq.call_consensus_base(max_base_list)
  end
  called.join
end
|
217
|
-
|
218
|
-
# call consensus nucleotide, used by ViralSeq.consensus
|
219
|
-
# Turn the list of bases called at one position into a single character,
# used by ViralSeq.consensus.
#
# * one element    -> that element, unchanged
# * two or three   -> the matching ambiguity code below, otherwise "N"
# * anything else  -> "N"
#
# The input array is left untouched (a sorted copy is used for matching).
#
# @param base_array [Array<String>] bases called at one position
# @return [String] a base, an ambiguity code, or "N"
def self.call_consensus_base(base_array)
  return base_array[0] if base_array.size == 1
  return "N" unless [2, 3].include?(base_array.size)

  # sort (not sort!) so the caller's array is not reordered
  case base_array.sort
  when %w[A T]   then "W"
  when %w[C G]   then "S"
  when %w[A C]   then "M"
  when %w[G T]   then "K"
  when %w[A G]   then "R"
  when %w[C T]   then "Y"
  when %w[C G T] then "B"
  when %w[A G T] then "D"
  when %w[A C T] then "H"
  when %w[A C G] then "V"
  else "N"
  end
end
|
257
|
-
|
258
|
-
# generate all Primer ID combinations given the length of Primer ID
|
259
|
-
# Generate every Primer ID of a given length over the alphabet A, T, C, G.
#
# The pool is grown one base at a time, so IDs come out ordered by the
# alphabet order A, T, C, G with the first base most significant.
#
# @param l [Integer] Primer ID length
# @return [Array<String>] all 4**l Primer IDs; for l <= 0 the single empty ID [""]
def self.generate_primer_id_pool(l=8)
  nt = ['A','T','C','G']
  # Start from the empty ID so that l = 0 yields one (empty) ID rather than
  # the four 1-base IDs.
  (1..l).inject(['']) do |pid_pool, _round|
    pid_pool.product(nt).map(&:join)
  end
end
|
270
|
-
|
271
|
-
# compare two primer ID sequences.
|
272
|
-
# If they differ in no more than x bases, return boolean value "TRUE",
|
273
|
-
# else, return boolean value "FALSE"
|
274
|
-
# Decide whether two Primer IDs are "similar".
#
# @param pid1 [String] first Primer ID
# @param pid2 [String] second Primer ID
# @param x [Integer] number of differing positions tolerated
# @return [Boolean] false when the lengths differ; otherwise true when at
#   least (length - x) positions are identical
def self.similar_pid?(pid1="",pid2="", x=0)
  required_matches = pid1.size - x
  return false unless pid1.size == pid2.size

  matches = pid1.size.times.count { |i| pid1[i] == pid2[i] }
  matches >= required_matches
end
|
293
|
-
|
294
|
-
# compare PID with sequences which have identical sequences.
|
295
|
-
# PIDs differ by 1 base will be recognized.
|
296
|
-
# if PID1 is x times more abundant than PID2, PID2 will be discarded
|
297
|
-
# Remove likely offspring Primer IDs from a FASTA file of tagged sequences.
#
# Sequence names are split on "_": the first field (minus its leading
# character) is read as the Primer ID and the second field as its count.
# Among records with identical sequences, every pair of Primer IDs that
# ViralSeq.similar_pid? accepts with a tolerance of 1 is compared; when one
# count is at least +cutoff+ times the other, the less abundant ID is flagged.
# Every record whose Primer ID was flagged is dropped from the result.
#
# @param sequence_file [String] path passed to ViralSeq.fasta_to_hash
# @param cutoff [Numeric] fold difference in counts needed to flag a Primer ID
# @return [Hash] {sequence_name => sequence} without the flagged records
def self.filter_similar_pid(sequence_file = "", cutoff = 10)
  seq = ViralSeq.fasta_to_hash(sequence_file)

  # Single pass: collect [pid, count] pairs per distinct sequence.
  pids_by_sequence = {}
  seq.each do |name, s|
    fields = name[1..-1].split("_")
    (pids_by_sequence[s] ||= []) << [fields[0], fields[1]]
  end

  dup_pid = []
  pids_by_sequence.each_value do |tagged|
    next if tagged.size == 1
    pid_hash = tagged.to_h
    pid_hash.keys.combination(2) do |pid1, pid2|
      next unless ViralSeq.similar_pid?(pid1, pid2, 1)
      n1 = pid_hash[pid1].to_i
      n2 = pid_hash[pid2].to_i
      if n1 >= cutoff * n2
        dup_pid << pid2
      elsif n2 >= cutoff * n1
        dup_pid << pid1
      end
    end
  end

  seq.reject { |name, _s| dup_pid.include?(name.split("_")[0][1..-1]) }
end
|
356
|
-
|
357
|
-
# collapse sequences with x number of nt differences. make sure sequences are aligned.
|
358
|
-
# The returned frequency is NOT the frequency of the collapsed sequences.
|
359
|
-
# Collapse sequences that are within +cutoff+ differences of each other.
#
# Every pair of distinct sequences is compared with ViralSeq.compare_two_seq;
# when a pair differs by at most +cutoff+ positions, the less frequent one
# (the second of the pair on a tie) is dropped. Frequencies of the surviving
# sequences are their own original counts, not the merged totals.
#
# @param seq_array [Array<String>] sequences (expected to be aligned)
# @param cutoff [Numeric] maximum number of differences to collapse on
# @return [Hash] {sequence => frequency} of the retained sequences
def self.collapse_sequence_by_x_nt_difference(seq_array,cutoff)
  seq_freq = ViralSeq.count(seq_array)
  return seq_freq if seq_freq.size == 1

  dropped = []
  seq_freq.keys.combination(2).each do |first_seq, second_seq|
    next unless ViralSeq.compare_two_seq(first_seq, second_seq) <= cutoff
    dropped << (seq_freq[first_seq] >= seq_freq[second_seq] ? second_seq : first_seq)
  end

  seq_freq.each_with_object({}) do |(sequence, frequency), kept|
    kept[sequence] = frequency unless dropped.include?(sequence)
  end
end
|
387
|
-
|
388
|
-
|
389
|
-
# compare two sequences position by position and return the number of differing positions; the sequences are NOT aligned first
|
390
|
-
|
391
|
-
# Count the positions at which two sequences differ.
#
# Only the positions of +seq1+ are walked: extra characters at the end of a
# longer +seq2+ are ignored, and positions missing from a shorter +seq2+
# count as differences. No alignment is performed.
#
# @param seq1 [String] reference sequence (defines how many positions are checked)
# @param seq2 [String] sequence compared against seq1
# @return [Integer] number of differing positions
def self.compare_two_seq(seq1 = "", seq2 = "")
  (0...seq1.size).count { |position| seq1[position] != seq2[position] }
end
|
401
|
-
|
402
|
-
# gap strip from a sequence alignment
|
403
|
-
|
404
|
-
# Remove every alignment column in which at least one sequence has a gap ("-").
#
# The number of columns is taken from the first sequence. Sequences are
# expected to share that length; a position that does not exist in a shorter
# sequence is simply skipped for that sequence.
#
# @param sequence_alignment [Hash] {name => aligned_sequence}
# @return [Hash] {name => sequence restricted to the gap-free columns};
#   {} for an empty alignment
def self.gap_strip(sequence_alignment)
  # Nothing to strip; previously this crashed on values[0].size.
  return {} if sequence_alignment.empty?

  sequences = sequence_alignment.values
  gap_free_columns = (0...sequences[0].size).reject do |position|
    sequences.any? { |s| s[position] == "-" }
  end

  sequence_alignment.each_with_object({}) do |(name, s), stripped|
    stripped[name] = gap_free_columns.map { |position| s[position] }.join
  end
end
|
426
|
-
|
427
|
-
# gap strip from a sequence alignment, only strip the gaps at the ends of the alignment
|
428
|
-
|
429
|
-
# Trim gap-containing columns from both ends of an alignment only.
#
# Columns are scanned from the left until the first column without any gap
# ("-"), and likewise from the right; those leading and trailing columns are
# cut from every sequence. Gaps in the interior are kept. The number of
# columns is taken from the first sequence.
#
# @param sequence_alignment [Hash] {name => aligned_sequence}
# @return [Hash] {name => trimmed_sequence}; {} for an empty alignment
def self.gap_strip_ends(sequence_alignment)
  # Nothing to trim; previously this crashed on values[0].size.
  return {} if sequence_alignment.empty?

  sequences = sequence_alignment.values
  # true for each column in which at least one sequence carries a gap
  gapped = (0...sequences[0].size).map do |position|
    sequences.any? { |s| s[position] == "-" }
  end
  leading = gapped.take_while { |has_gap| has_gap }.size
  trailing = gapped.reverse.take_while { |has_gap| has_gap }.size

  sequence_alignment.each_with_object({}) do |(name, s), trimmed|
    trimmed[name] = s[leading..(-trailing - 1)]
  end
end
|
464
|
-
|
465
|
-
# input paired-end sequence hash format seq_name => [r1_seq, r2_seq]
|
466
|
-
# overlap is pre-determined
|
467
|
-
# Join paired-end reads when the overlap size is known in advance.
#
# With an overlap of 0 the two reads are simply concatenated. Otherwise the
# last +overlap+ bases of R1 are compared (ViralSeq.compare_two_seq) with the
# first +overlap+ bases of R2; the pair is joined when the number of
# mismatches is at most overlap * diff, and left out of the result otherwise.
#
# @param seq_pair_hash [Hash] {seq_name => [r1_seq, r2_seq]}
# @param overlap [Integer] number of overlapping bases
# @param diff [Integer, Float] maximum mismatch rate allowed in the overlap
# @return [Hash] {seq_name => joined_sequence} for the pairs that passed
# @raise [ArgumentError] if overlap is not an Integer or diff is neither
#   Integer nor Float
def self.paired_join1(seq_pair_hash, overlap, diff = 0.0)
  unless overlap.is_a?(Integer)
    raise ArgumentError, ":overlap has to be Integer, input #{overlap} invalid."
  end
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid."
  end

  seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
    r1_seq, r2_seq = seq_pair
    if overlap.zero?
      joined[seq_name] = r1_seq + r2_seq
      next
    end

    mismatches = ViralSeq.compare_two_seq(r1_seq[-overlap..-1], r2_seq[0, overlap])
    joined[seq_name] = r1_seq + r2_seq[overlap..-1] if mismatches <= (overlap * diff)
  end
end
|
484
|
-
|
485
|
-
|
486
|
-
# overlap is not predetermined
|
487
|
-
# model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
488
|
-
# model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
489
|
-
# Join paired-end reads when the overlap size is not known in advance.
#
# model 1: one overlap size for all pairs, obtained from
#          ViralSeq.determine_overlap_pid_pair and handed to ViralSeq.paired_join1.
# model 2: an overlap size per pair - the largest overlap from
#          ViralSeq.overlap_matrix whose mismatch count is at most
#          overlap * diff; a pair with no acceptable overlap is concatenated.
#
# Any ArgumentError raised along the way (invalid diff, invalid model, or one
# coming from the helpers) is printed with puts and nil is returned.
#
# @param seq_pair_hash [Hash] {seq_name => [r1_seq, r2_seq]}
# @param model [Integer] 1 or 2
# @param diff [Integer, Float] maximum mismatch rate allowed in the overlap
# @return [Hash, nil] {seq_name => joined_sequence}, or nil after an ArgumentError
def self.paired_join2(seq_pair_hash, model = 1, diff = 0.0)
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid."
  end

  if model == 1
    shared_overlap = ViralSeq.determine_overlap_pid_pair(seq_pair_hash, diff)
    ViralSeq.paired_join1(seq_pair_hash, shared_overlap, diff)
  elsif model == 2
    seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
      r1_seq = seq_pair[0]
      r2_seq = seq_pair[1]
      acceptable = []
      ViralSeq.overlap_matrix(r1_seq, r2_seq).each do |size, diff_nt|
        acceptable << size if diff_nt <= size * diff
      end
      joined[seq_name] =
        if acceptable.empty?
          r1_seq + r2_seq
        else
          r1_seq + r2_seq[acceptable.max..-1]
        end
    end
  else
    raise ArgumentError, "Error::Wrong Overlap Model Argument. Given '#{model}', expected '1' or '2'."
  end
rescue ArgumentError => e
  puts e
  nil
end
|
519
|
-
|
520
|
-
# determine overlap size from a paired sequence Hash object
|
521
|
-
# Determine a single overlap size for a whole set of read pairs.
#
# For each pair, the largest overlap from ViralSeq.overlap_matrix whose
# mismatch count is at most overlap * diff is taken (0 when none qualifies).
# The most frequent of these per-pair values is returned; when several values
# are equally frequent, the largest of them wins.
#
# @param seq_pair_hash [Hash] {seq_name => [r1_seq, r2_seq]}
# @param diff [Numeric] maximum mismatch rate allowed in the overlap
# @return [Integer, nil] the chosen overlap size; nil for an empty seq_pair_hash
def self.determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
  per_pair_overlap = seq_pair_hash.map do |_seq_name, seq_pair|
    acceptable = []
    ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1]).each do |size, diff_nt|
      acceptable << size if diff_nt <= size * diff
    end
    acceptable.empty? ? 0 : acceptable.max
  end

  overlap_counts = ViralSeq.count(per_pair_overlap)
  highest_count = overlap_counts.values.max
  most_common = []
  overlap_counts.each { |size, occurrences| most_common << size if occurrences == highest_count }
  most_common.max
end
|
542
|
-
|
543
|
-
# input a pair of sequences as String, return a Hash object of overlapping Hash object
|
544
|
-
# {:overlap_size => number_of_different_positions, ...}
|
545
|
-
# {minimal overlap set to 4. }
|
546
|
-
# Tabulate, for every candidate overlap size, how many positions differ
# between the end of +sequence1+ and the start of +sequence2+.
#
# Candidate sizes run from 4 up to the length of the shorter sequence. An
# overlap cannot be longer than either read, and asking for more bases than
# sequence1 has makes the slice nil, which previously crashed the comparison
# whenever sequence2 was the longer read.
#
# @param sequence1 [String] first read (its tail is compared)
# @param sequence2 [String] second read (its head is compared)
# @return [Hash] {overlap_size => number_of_different_positions}; empty when
#   the shorter sequence has fewer than 4 bases
def self.overlap_matrix(sequence1, sequence2)
  min_overlap = 4
  max_overlap = [sequence1.size, sequence2.size].min
  (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
    matrix_hash[overlap] = ViralSeq.compare_two_seq(sequence1[-overlap..-1], sequence2[0, overlap])
  end
end
|
555
|
-
|
556
|
-
end
|