viral_seq 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
data/lib/viral_seq/tcs_core.rb
DELETED
@@ -1,556 +0,0 @@
|
|
1
|
-
# viral_seq/tcs_core
|
2
|
-
# core functions for TCS and DR pipeline
|
3
|
-
# functions to manipulate sequences including:
|
4
|
-
# ViralSeq::calculate_pid_cut_off
|
5
|
-
# ViralSeq::consensus
|
6
|
-
# ViralSeq::generate_primer_id_pool
|
7
|
-
# ViralSeq::similar_pid?
|
8
|
-
# ViralSeq::filter_similar_pid
|
9
|
-
# ViralSeq::collapse_sequence_by_x_nt_difference
|
10
|
-
# ViralSeq::compare_two_seq
|
11
|
-
# ViralSeq::gap_strip
|
12
|
-
# ViralSeq::gap_strip_ends
|
13
|
-
# ViralSeq::paired_join1
|
14
|
-
# ViralSeq::paired_join2
|
15
|
-
|
16
|
-
# ViralSeq.calculate_pid_cut_off(PID_abundance, estimated_error_rate)
|
17
|
-
# # A function to calculate cut-off for offspring primer IDs.
|
18
|
-
# # see reference at Zhou et al. JVI 2016.
|
19
|
-
# # https://www.ncbi.nlm.nih.gov/pubmed/26041299
|
20
|
-
# # PID_abundance is the abundance of a certain PID
|
21
|
-
# # estimated_error_rate is the estimated platform error rate, 0.02 (2%) as default
|
22
|
-
# # the model supports error rate from 0.003 to 0.03.
|
23
|
-
# # return an abundance cut-off (Integer) for offspring Primer IDs.
|
24
|
-
|
25
|
-
# ViralSeq.consensus(seq_array, majority_cutoff)
|
26
|
-
# # Generate a consensus sequence from a given sequence array.
|
27
|
-
# # where seq_array is an Array of input sequences (aligned) [seq1, seq2, seq3, ...]
|
28
|
-
# # majority_cutoff is a Float of majority cut-off. default as simply majority (0.5)
|
29
|
-
# =USAGE
|
30
|
-
# a_consensus_sequence = ViralSeq.consensus(seq_array, majority_cutoff)
|
31
|
-
|
32
|
-
# ViralSeq.generate_primer_id_pool(n)
|
33
|
-
# # generate all Primer ID combinations given the length of Primer ID
|
34
|
-
# # n is the length of the Primer ID (Integer). default value of n is 8.
|
35
|
-
# =USAGE
|
36
|
-
# primer_id_pool = ViralSeq.generate_primer_id_pool(10) # 10 is the length of Primer ID
|
37
|
-
# puts primer_id_pool.size #should be 4^10
|
38
|
-
# => 1048576
|
39
|
-
|
40
|
-
# ViralSeq.similar_pid?(pid1, pid2, base_difference)
|
41
|
-
# # compare two primer ID sequences.
|
42
|
-
# # If they differ by no more than the allowed number of bases, return boolean value "TRUE",
|
43
|
-
# # else, return boolean value "FALSE"
|
44
|
-
# # where pid1 and pid2 are two Primer IDs for comparison
|
45
|
-
# # base_difference is an Integer for difference bases that allowed
|
46
|
-
# =USAGE
|
47
|
-
# # example
|
48
|
-
# ViralSeq.similar_pid?("AAGGCTACGA", "AAGGATACGA", 1)
|
49
|
-
# => true
|
50
|
-
|
51
|
-
# ViralSeq.filter_similar_pid(sequence_fasta_file, cut_off)
|
52
|
-
# # compare PID with sequences which have identical sequences.
|
53
|
-
# # PIDs differ by 1 base will be recognized.
|
54
|
-
# # if PID1 is x times (cut-off) more abundant than PID2, PID2 will be discarded
|
55
|
-
# # where sequence_fasta_file is the sequence file in fasta format
|
56
|
-
# # each sequence tag starting with ">" and the Primer ID sequence
|
57
|
-
# # followed by the number of Primer ID appeared in the raw sequence
|
58
|
-
# # the information sections in the tags are separated by underscore "_"
|
59
|
-
# # example sequence tag: >AGGCGTAGA_32_sample1_RT
|
60
|
-
# # cut_off is the fold cut-off to remove the potential residual offspring Primer IDs
|
61
|
-
# # default value for cut_off is 10
|
62
|
-
# # return a new sequence hash. {sequence_name => sequence, ...}
|
63
|
-
|
64
|
-
# ViralSeq.collapse_sequence_by_x_nt_difference(sequence_array, cutoff)
|
65
|
-
# # collapse sequences with x number of nt differences.
|
66
|
-
# # input an Array object of sequences, make sure sequences are aligned.
|
67
|
-
# # return a new Hash object of collapsed sequences {sequence => frequency, ...}
|
68
|
-
# # The return frequency is NOT the frequency of the collapsed sequences.
|
69
|
-
|
70
|
-
# ViralSeq.compare_two_seq(seq1, seq2)
|
71
|
-
# # compare two sequences as String object, return the number of differences as integer
|
72
|
-
# # sequences will NOT align
|
73
|
-
# # can use ViralSeq.muscle_align(seq1, seq2) to get the aligned sequences
|
74
|
-
# =USAGE
|
75
|
-
# # example
|
76
|
-
# seq1 = 'AAGGCGTAGGAC'
|
77
|
-
# seq2 = 'AAGCTTAGGACG'
|
78
|
-
# puts ViralSeq.compare_two_seq(seq1, seq2)
|
79
|
-
# => 8
|
80
|
-
# aligned_seqs = ViralSeq.muscle_align(seq1,seq2)
|
81
|
-
# puts ViralSeq.compare_two_seq(aligned_seqs.values[0], aligned_seqs.values[1])
|
82
|
-
# => 4
|
83
|
-
|
84
|
-
# ViralSeq.gap_strip(sequence_hash)
|
85
|
-
# # strip positions with gaps in the sequence alignment as Hash object {:name => sequence, ...}
|
86
|
-
# =USAGE
|
87
|
-
# # example
|
88
|
-
# sequence_hash = {'>seq1' => 'AACCGGTT',
|
89
|
-
# '>seq2' => 'A-CCGGTT',
|
90
|
-
# '>seq3' => 'AAC-GGTT',
|
91
|
-
# '>seq4' => 'AACCG-TT',
|
92
|
-
# '>seq5' => 'AACCGGT-'}
|
93
|
-
# ViralSeq.gap_strip(sequence_hash)
|
94
|
-
# => {">seq1"=>"ACGT", ">seq2"=>"ACGT", ">seq3"=>"ACGT", ">seq4"=>"ACGT", ">seq5"=>"ACGT"}
|
95
|
-
|
96
|
-
# ViralSeq.gap_strip_ends(sequence_hash)
|
97
|
-
# # similar to ViralSeq.gap_strip , but only strip the gaps at both ends of the alignment
|
98
|
-
# =USAGE
|
99
|
-
# # example
|
100
|
-
# sequence_hash = {'>seq1' => 'AACCGGTT',
|
101
|
-
# '>seq2' => 'A-CCGGTT',
|
102
|
-
# '>seq3' => 'AAC-GGTT',
|
103
|
-
# '>seq4' => 'AACCG-TT',
|
104
|
-
# '>seq5' => 'AACCGGT-'}
|
105
|
-
# ViralSeq.gap_strip_ends(sequence_hash)
|
106
|
-
# => {">seq1"=>"AACCGGT", ">seq2"=>"A-CCGGT", ">seq3"=>"AAC-GGT", ">seq4"=>"AACCG-T", ">seq5"=>"AACCGGT"}
|
107
|
-
|
108
|
-
# ViralSeq.paired_join1(sequence_pair_hash, overlap, difference_cut_off)
|
109
|
-
# # pair-end join function for KNOWN overlap size
|
110
|
-
# # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
|
111
|
-
# # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
|
112
|
-
# # overlap is an integer that indicate how many bases are overlapped.
|
113
|
-
# # overlap value at 0 means no overlap. R1 and R2 will be simply put together.
|
114
|
-
# # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
|
115
|
-
# # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
|
116
|
-
# =USAGE
|
117
|
-
# # example
|
118
|
-
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
119
|
-
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
120
|
-
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
121
|
-
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
122
|
-
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
123
|
-
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
124
|
-
# ViralSeq.paired_join1(paired_seqs, 100, 0.0).keys
|
125
|
-
# => [">pair1"]
|
126
|
-
# ViralSeq.paired_join1(paired_seqs, 100, 0.01).keys
|
127
|
-
# => [">pair1", ">pair2"]
|
128
|
-
# ViralSeq.paired_join1(paired_seqs, 100, 0.02).keys
|
129
|
-
# => [">pair1", ">pair2", ">pair3"]
|
130
|
-
|
131
|
-
# ViralSeq.paired_join2(seq_pair_hash, model, diff)
|
132
|
-
# # pair-end join function for UNKNOWN overlap
|
133
|
-
# # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
|
134
|
-
# # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
|
135
|
-
# # model has two options, 1 or 2 as Integer
|
136
|
-
# # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
137
|
-
# # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
138
|
-
# # minimal overlap by model 2 set to 4 positions
|
139
|
-
# # if the sequence overlap is smaller than 4 bases, the model will consider it as no overlap.
|
140
|
-
# # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
|
141
|
-
# # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
|
142
|
-
# =USAGE
|
143
|
-
# # example 1
|
144
|
-
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
145
|
-
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
146
|
-
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
147
|
-
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
148
|
-
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
149
|
-
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
150
|
-
# ViralSeq.paired_join2(paired_seqs, 1).keys
|
151
|
-
# => [">pair1"]
|
152
|
-
# ViralSeq.paired_join2(paired_seqs, 1, 0.01).keys
|
153
|
-
# => [">pair1", ">pair2"]
|
154
|
-
#
|
155
|
-
# # example 2
|
156
|
-
# paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
|
157
|
-
# ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
|
158
|
-
# ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"]
|
159
|
-
# }
|
160
|
-
# ViralSeq.paired_join2(paired_seq2, 1)
|
161
|
-
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
162
|
-
# ViralSeq.paired_join2(paired_seq2, 2)
|
163
|
-
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
164
|
-
|
165
|
-
|
166
|
-
module ViralSeq
|
167
|
-
|
168
|
-
# calculate cut-off for offspring primer IDs.
|
169
|
-
# Calculate the abundance cut-off for offspring Primer IDs.
#
# m          - abundance of the Primer ID being evaluated (numeric).
# error_rate - estimated platform error rate; must fall within 0..0.03
#              (default 0.02).
#
# Returns an Integer cut-off. The result is always 2 when m <= 10 or when the
# fitted value rounds to less than 3.
# Raises ArgumentError when error_rate is outside the 0..0.03 range.
def self.calculate_pid_cut_off(m, error_rate = 0.02)
  return 2 if m <= 10

  # One fitted polynomial per error-rate band; the highest band switches to a
  # linear fit for m above 8500.
  n =
    case error_rate
    when 0...0.0075
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    when 0.0075...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    when 0.015..0.03
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        0.0079 * m + 9.4869
      end
    else
      raise ArgumentError, 'Error_rate has to be between 0 and 0.03'
    end

  rounded = n.round
  rounded < 3 ? 2 : rounded
end
|
192
|
-
|
193
|
-
# create one consensus sequence from a sequence array with an optional majority cut-off for mixed bases.
|
194
|
-
# example:
|
195
|
-
# position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
|
196
|
-
# Build one consensus sequence from an array of (aligned) sequences.
#
# seq_array - Array of sequence Strings; the length of the first sequence
#             determines how many positions are evaluated.
# cutoff    - minimum fraction of sequences (Float, default 0.5) a base must
#             reach at a position to take part in the consensus call.
#
# For every position the bases are counted with ViralSeq.count, the bases whose
# frequency is >= cutoff are kept, and ViralSeq.call_consensus_base turns that
# list into a single (possibly ambiguous) character.
#
# Returns the consensus String; an empty seq_array yields an empty String
# instead of failing on seq_array[0].
def self.consensus(seq_array, cutoff = 0.5)
  return "" if seq_array.empty?

  seq_length = seq_array[0].size
  seq_size = seq_array.size.to_f
  consensus_seq = "".dup # mutable buffer, appended in place

  seq_length.times do |position|
    column = seq_array.map { |seq| seq[position] }
    majority_bases = ViralSeq.count(column).select { |_base, n| n / seq_size >= cutoff }.keys
    consensus_seq << ViralSeq.call_consensus_base(majority_bases)
  end

  consensus_seq
end
|
217
|
-
|
218
|
-
# call consensus nucleotide, used by ViralSeq.consensus
|
219
|
-
# Translate a list of bases observed at one position into a single character.
#
# base_array - Array of base characters. It is NOT modified (a sorted copy is
#              used for the lookup).
#
# Returns:
#   * the element itself when exactly one base is given;
#   * the matching two- or three-base ambiguity code (W, S, M, K, R, Y, B, D, H, V)
#     when the bases are a known combination of A, C, G, T;
#   * "N" in every other case (no bases, four or more bases, or an
#     unrecognised combination such as one containing "-").
def self.call_consensus_base(base_array)
  return base_array[0] if base_array.size == 1
  return "N" unless base_array.size.between?(2, 3)

  ambiguity_codes = {
    %w[A T] => "W",
    %w[C G] => "S",
    %w[A C] => "M",
    %w[G T] => "K",
    %w[A G] => "R",
    %w[C T] => "Y",
    %w[C G T] => "B",
    %w[A G T] => "D",
    %w[A C T] => "H",
    %w[A C G] => "V"
  }
  ambiguity_codes.fetch(base_array.sort, "N")
end
|
257
|
-
|
258
|
-
# generate all Primer ID combinations given the length of Primer ID
|
259
|
-
# Generate every possible Primer ID of a given length.
#
# l - length of the Primer ID (Integer, default 8).
#
# Returns an Array of 4**l Strings over the alphabet A, T, C, G, ordered with
# the first position varying slowest (AA, AT, AC, AG, TA, ...).
# A length of 0 yields the single empty ID [""]; a negative length yields [].
def self.generate_primer_id_pool(l=8)
  nt = ['A','T','C','G']
  nt.repeated_permutation(l).map(&:join)
end
|
270
|
-
|
271
|
-
# compare two primer ID sequences.
|
272
|
-
# If they differ by no more than x bases, return boolean value "TRUE",
|
273
|
-
# else, return boolean value "FALSE"
|
274
|
-
# Decide whether two Primer IDs are similar.
#
# pid1, pid2 - the two Primer ID sequences to compare.
# x          - number of mismatching positions tolerated (default 0).
#
# Returns false when the two IDs have different sizes; otherwise true when the
# number of identical positions is at least (size of pid1 - x).
def self.similar_pid?(pid1="",pid2="", x=0)
  required_matches = pid1.size - x
  return false unless pid1.size == pid2.size

  identical_positions = (0...pid1.size).count { |idx| pid1[idx] == pid2[idx] }
  identical_positions >= required_matches
end
|
293
|
-
|
294
|
-
# compare PID with sequences which have identical sequences.
|
295
|
-
# PIDs differ by 1 base will be recognized.
|
296
|
-
# if PID1 is x times greater than PID2, PID2 will be discarded
|
297
|
-
# Remove sequences whose Primer ID looks like a low-abundance offspring of
# another Primer ID attached to an identical sequence.
#
# sequence_file - path handed to ViralSeq.fasta_to_hash; the resulting names
#                 are expected to look like ">PID_count_..." (the first
#                 character is dropped, then the name is split on "_").
# cutoff        - fold difference (default 10): when two PIDs that share the
#                 same sequence are similar (ViralSeq.similar_pid? with one
#                 mismatch allowed) and one count is >= cutoff * the other,
#                 the less abundant PID is flagged.
#
# Returns a new Hash {sequence_name => sequence} without the flagged PIDs.
def self.filter_similar_pid(sequence_file = "", cutoff = 10)
  seq = ViralSeq.fasta_to_hash(sequence_file)

  # Group the [pid, count] tags by identical sequence in a single pass.
  pid_tags_by_seq = Hash.new { |groups, sequence| groups[sequence] = [] }
  seq.each do |name, s|
    fields = name[1..-1].split("_")
    pid_tags_by_seq[s] << [fields[0], fields[1]]
  end

  # Hash used as a set so the final membership test is a constant-time lookup.
  dup_pid = {}
  pid_tags_by_seq.each_value do |tags|
    next if tags.size == 1
    pid_hash = Hash[tags]
    pid_hash.keys.combination(2).each do |pid1, pid2|
      next unless ViralSeq.similar_pid?(pid1, pid2, 1)
      n1 = pid_hash[pid1].to_i
      n2 = pid_hash[pid2].to_i
      if n1 >= cutoff * n2
        dup_pid[pid2] = true
      elsif n2 >= cutoff * n1
        dup_pid[pid1] = true
      end
    end
  end

  new_seq = {}
  seq.each do |name, s|
    pid = name.split("_")[0][1..-1]
    new_seq[name] = s unless dup_pid.key?(pid)
  end
  new_seq
end
|
356
|
-
|
357
|
-
# collapse sequences with x number of nt differences. make sure sequences are aligned.
|
358
|
-
# The return frequency is NOT the frequency of the collapsed sequences.
|
359
|
-
# Collapse sequences that differ from a more abundant sequence by at most
# `cutoff` positions.
#
# seq_array - Array of sequences (they should already be aligned).
# cutoff    - maximum number of differences (per ViralSeq.compare_two_seq)
#             for two unique sequences to be treated as the same.
#
# Returns a Hash {sequence => frequency} built from ViralSeq.count, with the
# less frequent member of every close pair removed (the second of the pair on
# ties). Frequencies of removed sequences are not added to the kept ones.
def self.collapse_sequence_by_x_nt_difference(seq_array,cutoff)
  seq_freq = ViralSeq.count(seq_array)
  return seq_freq if seq_freq.size == 1

  dropped = []
  seq_freq.keys.combination(2).each do |pair|
    first_seq, second_seq = pair[0], pair[1]
    next unless ViralSeq.compare_two_seq(first_seq, second_seq) <= cutoff
    dropped << (seq_freq[first_seq] >= seq_freq[second_seq] ? second_seq : first_seq)
  end

  seq_freq.each_with_object({}) do |(sequence, freq), kept|
    kept[sequence] = freq unless dropped.include?(sequence)
  end
end
|
387
|
-
|
388
|
-
|
389
|
-
# compare two sequences position by position and return the number of different positions; the sequences are NOT aligned first
|
390
|
-
|
391
|
-
# Count the positions at which two sequences differ.
#
# Only the positions of seq1 are walked: a position missing from a shorter
# seq2 counts as a difference, while extra positions of a longer seq2 are
# ignored. No alignment is performed.
#
# Returns the number of differing positions as an Integer.
def self.compare_two_seq(seq1 = "", seq2 = "")
  (0...seq1.size).count { |idx| seq1[idx] != seq2[idx] }
end
|
401
|
-
|
402
|
-
# gap strip from a sequence alignment
|
403
|
-
|
404
|
-
# Remove every alignment column that contains a gap ("-") in any sequence.
#
# sequence_alignment - Hash {name => aligned_sequence}. The length of the first
#                      sequence determines how many columns are inspected.
#
# Returns a new Hash with the same names and the gap-free columns only.
# An empty alignment yields an empty Hash.
def self.gap_strip(sequence_alignment)
  return {} if sequence_alignment.empty?

  sequences = sequence_alignment.values
  kept_columns = (0...sequences.first.size).reject do |column|
    sequences.any? { |s| s[column] == "-" }
  end

  sequence_alignment.each_with_object({}) do |(name, s), stripped|
    stripped[name] = kept_columns.map { |column| s[column] }.join
  end
end
|
426
|
-
|
427
|
-
# gap strip from a sequence alignment, only strip the gaps at the ends of the alignment
|
428
|
-
|
429
|
-
# Trim gap-containing columns from both ends of an alignment, leaving interior
# gaps in place.
#
# sequence_alignment - Hash {name => aligned_sequence}. The length of the first
#                      sequence determines how many columns are inspected.
#
# Columns are dropped from the left while they contain "-" in any sequence,
# and likewise from the right; every sequence is then sliced to the remaining
# range.
#
# Returns a new Hash with the same names and the trimmed sequences.
# An empty alignment yields an empty Hash.
def self.gap_strip_ends(sequence_alignment)
  return {} if sequence_alignment.empty?

  sequences = sequence_alignment.values
  width = sequences.first.size
  gapped = ->(column) { sequences.any? { |s| s[column] == "-" } }

  leading = (0...width).take_while(&gapped).size
  trailing = (0...width).reverse_each.take_while(&gapped).size

  sequence_alignment.each_with_object({}) do |(name, s), trimmed|
    trimmed[name] = s[leading..(-trailing - 1)]
  end
end
|
464
|
-
|
465
|
-
# input paired-end sequence hash format seq_name => [r1_seq, r2_seq]
|
466
|
-
# overlap is pre-determined
|
467
|
-
# Join paired-end reads when the overlap size is known in advance.
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}.
# overlap       - number of overlapping bases (non-negative Integer);
#                 0 means the two reads are simply concatenated.
# diff          - maximum mismatch rate tolerated inside the overlap
#                 (Integer or Float, default 0.0).
#
# A pair is joined when the last `overlap` bases of r1 and the first `overlap`
# bases of r2 differ (per ViralSeq.compare_two_seq) in at most overlap * diff
# positions. Pairs that exceed the limit, or whose reads are shorter than the
# requested overlap, are left out of the result.
#
# Returns a Hash {seq_name => joined_sequence}.
# Raises ArgumentError when overlap is not a non-negative Integer or diff is
# not numeric.
def self.paired_join1(seq_pair_hash, overlap, diff = 0.0)
  raise ArgumentError, ":overlap has to be Integer, input #{overlap} invalid." unless overlap.is_a?(Integer)
  raise ArgumentError, ":overlap cannot be negative, input #{overlap} invalid." if overlap < 0
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid."
  end

  joined_seq_hash = {}
  seq_pair_hash.each do |seq_name, seq_pair|
    r1_seq = seq_pair[0]
    r2_seq = seq_pair[1]
    if overlap.zero?
      joined_seq_hash[seq_name] = r1_seq + r2_seq
    elsif r1_seq.size < overlap || r2_seq.size < overlap
      # A read shorter than the overlap cannot contain it; slicing would give nil.
      next
    elsif ViralSeq.compare_two_seq(r1_seq[-overlap..-1], r2_seq[0, overlap]) <= (overlap * diff)
      joined_seq_hash[seq_name] = r1_seq + r2_seq[overlap..-1]
    end
  end
  joined_seq_hash
end
|
484
|
-
|
485
|
-
|
486
|
-
# overlap is not predetermined
|
487
|
-
# model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
488
|
-
# model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
489
|
-
# Join paired-end reads when the overlap size is not known in advance.
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}.
# model         - 1: one overlap size for all pairs, obtained from
#                    ViralSeq.determine_overlap_pid_pair and applied through
#                    ViralSeq.paired_join1;
#                 2: overlap chosen per pair as the largest size from
#                    ViralSeq.overlap_matrix whose mismatches are within
#                    size * diff; pairs with no acceptable overlap are
#                    concatenated.
# diff          - maximum mismatch rate tolerated inside the overlap
#                 (Integer or Float, default 0.0).
#
# Returns a Hash {seq_name => joined_sequence}. Any ArgumentError raised while
# processing (bad diff, bad model, or one raised by a callee) is printed with
# puts and nil is returned instead.
def self.paired_join2(seq_pair_hash, model = 1, diff = 0.0)
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.")
  end

  if model == 1
    shared_overlap = ViralSeq.determine_overlap_pid_pair(seq_pair_hash, diff)
    ViralSeq.paired_join1(seq_pair_hash, shared_overlap, diff)
  elsif model == 2
    seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
      accepted = ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1]).select do |size, mismatches|
        mismatches <= size * diff
      end
      joined[seq_name] =
        if accepted.empty?
          seq_pair[0] + seq_pair[1]
        else
          seq_pair[0] + seq_pair[1][accepted.keys.max..-1]
        end
    end
  else
    raise ArgumentError.new("Error::Wrong Overlap Model Argument. Given \'#{model}\', expected '1' or '2'.")
  end
rescue ArgumentError => e
  puts e
  nil
end
|
519
|
-
|
520
|
-
# determine overlap size from a paired sequence Hash object
|
521
|
-
# Determine one overlap size for a whole set of read pairs.
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}.
# diff          - maximum mismatch rate tolerated inside the overlap
#                 (default 0.0).
#
# For each pair the largest overlap from ViralSeq.overlap_matrix whose
# mismatch count is within overlap * diff is taken (0 when none qualifies).
# The most frequent per-pair value wins; ties go to the larger overlap.
#
# Returns the overlap size as an Integer; 0 when seq_pair_hash is empty, so
# the result is always usable as the overlap argument of paired_join1.
def self.determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
  return 0 if seq_pair_hash.empty?

  overlaps = seq_pair_hash.map do |_seq_name, seq_pair|
    acceptable = ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1]).select do |overlap, diff_nt|
      diff_nt <= overlap * diff
    end
    acceptable.empty? ? 0 : acceptable.keys.max
  end

  count_overlaps = ViralSeq.count(overlaps)
  max_value = count_overlaps.values.max
  count_overlaps.select { |_overlap, counts| counts == max_value }.keys.max
end
|
542
|
-
|
543
|
-
# input a pair of sequences as String, return a Hash object of overlapping Hash object
|
544
|
-
# {:overlap_size => number_of_different_positions, ...}
|
545
|
-
# {minimal overlap set to 4. }
|
546
|
-
# Tabulate the mismatches for every candidate overlap size between two reads.
#
# sequence1 - first read; its tail is compared.
# sequence2 - second read; its head is compared.
#
# Candidate sizes run from 4 (the minimal overlap) up to the length of the
# SHORTER read: an overlap cannot be larger than either read, and slicing a
# read beyond its length would yield nil.
#
# Returns a Hash {overlap_size => number_of_different_positions}, empty when
# either read is shorter than 4 bases.
def self.overlap_matrix(sequence1, sequence2)
  min_overlap = 4
  max_overlap = [sequence1.size, sequence2.size].min
  (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
    matrix_hash[overlap] = ViralSeq.compare_two_seq(sequence1[-overlap..-1], sequence2[0, overlap])
  end
end
|
555
|
-
|
556
|
-
end
|