viral_seq 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +37 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/viral_seq/a3g.rb +172 -0
- data/lib/viral_seq/fasta.rb +154 -0
- data/lib/viral_seq/hcv_dr.rb +54 -0
- data/lib/viral_seq/locator.rb +299 -0
- data/lib/viral_seq/math.rb +401 -0
- data/lib/viral_seq/misc.rb +103 -0
- data/lib/viral_seq/muscle.rb +89 -0
- data/lib/viral_seq/nt_variation.rb +148 -0
- data/lib/viral_seq/poisson_cutoff.rb +68 -0
- data/lib/viral_seq/refseq.rb +45 -0
- data/lib/viral_seq/sdrm_core.rb +652 -0
- data/lib/viral_seq/sequence.rb +392 -0
- data/lib/viral_seq/tcs_core.rb +556 -0
- data/lib/viral_seq/version.rb +6 -0
- data/lib/viral_seq.rb +41 -0
- data/viral_seq.gemspec +37 -0
- metadata +130 -0
@@ -0,0 +1,392 @@
|
|
1
|
+
# lib/sequence.rb
|
2
|
+
# Includes functions for sequence operations
|
3
|
+
# Including methods as:
|
4
|
+
# ViralSeq::AMINO_ACID_LIST
|
5
|
+
# ViralSeq::Sequence
|
6
|
+
# ViralSeq::Sequence#rev_complement
|
7
|
+
# ViralSeq::Sequence#get_aa_sequence
|
8
|
+
# ViralSeq::Sequence#get_aa_array
|
9
|
+
# ViralSeq::Sequence#name
|
10
|
+
# ViralSeq::Sequence#dna_sequence
|
11
|
+
# ViralSeq::Sequence#aa_sequence
|
12
|
+
# ViralSeq::Sequence#aa_array
|
13
|
+
# ViralSeq::amino_acid
|
14
|
+
# ViralSeq::amino_acid_2
|
15
|
+
# ViralSeq::to_list
|
16
|
+
# ViralSeq::uniq_sequence_hash
|
17
|
+
# ViralSeq::stop_codon_seq_hash
|
18
|
+
# String#rc
|
19
|
+
# String#mutation
|
20
|
+
# String#nt_parser
|
21
|
+
|
22
|
+
# ViralSeq::AMINO_ACID_LIST
|
23
|
+
# # Array of all amino acid one letter abbreviations
|
24
|
+
|
25
|
+
# ViralSeq::Sequence
|
26
|
+
# # Sequence class
|
27
|
+
# =USAGE
|
28
|
+
# # create a sequence object
|
29
|
+
# seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
|
30
|
+
#
|
31
|
+
# # print dna sequence
|
32
|
+
# puts seq.dna_sequence
|
33
|
+
#
|
34
|
+
# # reverse complement of the DNA sequence, returned as a String
|
35
|
+
# seq.rev_complement
|
36
|
+
#
|
37
|
+
# # change @dna_sequence to reverse complement DNA sequence
|
38
|
+
# seq.rev_complement!
|
39
|
+
#
|
40
|
+
# # generate amino acid sequences. either return string or array.
|
41
|
+
# # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
|
42
|
+
# # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
|
43
|
+
# seq.get_aa_sequence
|
44
|
+
# # or
|
45
|
+
# seq.get_aa_array
|
46
|
+
#
|
47
|
+
# # print amino acid sequence
|
48
|
+
# puts seq.aa_sequence
|
49
|
+
|
50
|
+
# ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
|
51
|
+
# # collapse sequence hash to unique sequence hash.
|
52
|
+
# # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
|
53
|
+
# # master_sequence_tag is the master tag for unique sequences
|
54
|
+
# # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
|
55
|
+
# =USAGE
|
56
|
+
# sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
|
57
|
+
# '>seq4' => 'CCCC', '>seq5' => 'CCCC',
|
58
|
+
# '>seq6' => 'TTTT' }
|
59
|
+
# uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
|
60
|
+
# => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
|
61
|
+
|
62
|
+
module ViralSeq
|
63
|
+
|
64
|
+
# array for all amino acid one letter abbreviations
|
65
|
+
# One-letter codes of the 20 standard amino acids, followed by "*" for a stop codon.
AMINO_ACID_LIST = %w[A C D E F G H I K L M N P Q R S T V W Y *]
|
66
|
+
|
67
|
+
# sequence class
|
68
|
+
|
69
|
+
# A named nucleotide sequence together with its most recent translations.
#
# name         - sequence tag (String)
# dna_sequence - nucleotide sequence, upcased on construction
# aa_sequence  - String set by the last call to #get_aa_sequence
# aa_array     - Array set by the last call to #get_aa_array
class Sequence
  attr_accessor :name, :dna_sequence, :aa_sequence, :aa_array

  def initialize(name = ">sequence", dna_sequence = "")
    @name = name
    @dna_sequence = dna_sequence.upcase
    @aa_sequence = ""
    @aa_array = []
  end

  # Return the reverse complement of the DNA sequence as a new String.
  # Only A/T/C/G are complemented; other characters are kept as they are.
  def rev_complement
    @dna_sequence.reverse.upcase.tr('ATCG', 'TAGC')
  end

  # Replace @dna_sequence with its reverse complement and return it.
  def rev_complement!
    @dna_sequence = rev_complement
  end

  # Translate the DNA sequence into a String of one-letter amino acids.
  # initial_position is the 0-based offset of the first codon (0, 1 or 2 for
  # the three reading frames). Trailing bases that do not fill a codon are
  # ignored. The result is stored in @aa_sequence and returned.
  def get_aa_sequence(initial_position = 0)
    # The translator is a module-level function of ViralSeq, so it has to be
    # called with an explicit receiver from inside this class.
    @aa_sequence = codons_from(@dna_sequence, initial_position)
                   .map { |codon| ViralSeq.amino_acid(codon) }
                   .join
  end

  # Translate the DNA sequence into an Array with one entry per codon, keeping
  # ambiguity calls (each entry is whatever ViralSeq.amino_acid_2 returns).
  # Gaps ("-") are treated as "N". The result is stored in @aa_array and returned.
  def get_aa_array(initial_position = 0)
    @aa_array = codons_from(@dna_sequence.tr('-', 'N'), initial_position)
                .map { |codon| ViralSeq.amino_acid_2(codon) }
  end

  # Length of the DNA sequence.
  def dna_length
    @dna_sequence.length
  end

  # Length of the last translated amino acid sequence.
  def aa_length
    @aa_sequence.length
  end

  private

  # Split +sequence+ into complete 3-base codons starting at +initial_position+.
  # An offset past the end of the sequence yields no codons instead of failing.
  def codons_from(sequence, initial_position)
    frame = sequence[initial_position..-1].to_s
    frame.each_char.each_slice(3).select { |triplet| triplet.size == 3 }.map(&:join)
  end
end
|
119
|
+
|
120
|
+
# generate amino acid abbreviations from 3 bases, ambiguity will return "#"
|
121
|
+
# Translate one codon (a 3-character upper-case String) into a one-letter
# amino acid code. "*" is a stop codon. A codon containing IUPAC ambiguity
# codes is translated only when every nucleotide it can stand for encodes the
# same residue; otherwise "#" is returned.
#
# Rules are tried top to bottom and the first match wins.
def self.amino_acid(bases)
  codon_rules = [
    [/^TT[TCY]$/,       "F"],
    [/^TT[AGR]$/,       "L"],
    [/^CT.$/,           "L"],
    [/^AT[TCAHYWM]$/,   "I"],
    ["ATG",             "M"],
    [/^GT.$/,           "V"],
    [/^TC.$/,           "S"],
    [/^CC.$/,           "P"],
    [/^AC.$/,           "T"],
    [/^GC.$/,           "A"],
    [/^TA[TCY]$/,       "Y"],
    [/^TA[AGR]$/,       "*"],
    [/^T[GR]A$/,        "*"],
    [/^CA[TCY]$/,       "H"],
    [/^CA[AGR]$/,       "Q"],
    [/^AA[TCY]$/,       "N"],
    [/^AA[AGR]$/,       "K"],
    [/^GA[TCY]$/,       "D"],
    [/^GA[AGR]$/,       "E"],
    [/^TG[TCY]$/,       "C"],
    ["TGG",             "W"],
    [/^CG.$/,           "R"],
    [/^AG[TCY]$/,       "S"],
    [/^[AM]G[AGR]$/,    "R"],
    [/^GG.$/,           "G"],
    # No rule for [ATW][CGS][CTY]: the only codons that could reach it mix
    # serine with threonine (ACY) or cysteine (TGY), e.g. "ASC" = ACC/AGC,
    # so they are ambiguous and must fall through to "#".
    [/^[TCY]T[AGR]$/,   "L"]
  ]
  _pattern, residue = codon_rules.find { |pattern, _aa| pattern === bases }
  residue || "#"
end
|
181
|
+
|
182
|
+
# keep ambiguities, return all possible amino acids.
|
183
|
+
|
184
|
+
# Translate one codon while keeping ambiguities: each position is expanded
# with ViralSeq.to_list, every resulting codon is translated, and the distinct
# amino acids are joined with "/" in order of first appearance.
# A position that ViralSeq.to_list cannot expand produces no codons, so the
# result is then an empty String.
def self.amino_acid_2(bases)
  translation_rules = [
    [/^TT[TCY]$/,            "F"],
    [/^TT[AGR]$/,            "L"],
    [/^CT.$/,                "L"],
    [/^AT[TCAHYWM]$/,        "I"],
    ["ATG",                  "M"],
    [/^GT.$/,                "V"],
    [/^TC.$/,                "S"],
    [/^CC.$/,                "P"],
    [/^AC.$/,                "T"],
    [/^GC.$/,                "A"],
    [/^TA[TCY]$/,            "Y"],
    [/^TA[AGR]$/,            "*"],
    [/^T[GR]A$/,             "*"],
    [/^CA[TCY]$/,            "H"],
    [/^CA[AGR]$/,            "Q"],
    [/^AA[TCY]$/,            "N"],
    [/^AA[AGR]$/,            "K"],
    [/^GA[TCY]$/,            "D"],
    [/^GA[AGR]$/,            "E"],
    [/^TG[TCY]$/,            "C"],
    ["TGG",                  "W"],
    [/^CG.$/,                "R"],
    [/^AG[TCY]$/,            "S"],
    [/^[AM]G[AGR]$/,         "R"],
    [/^GG.$/,                "G"],
    [/^[ATW][CGS][CTY]$/,    "S"],
    [/^[TCY]T[AGR]$/,        "L"]
  ]

  first, second, third = (0..2).map { |index| ViralSeq.to_list(bases[index]) }

  # First position varies slowest, third fastest.
  residues = first.product(second, third).map do |triplet|
    codon = triplet.join
    hit = translation_rules.find { |pattern, _aa| pattern === codon }
    hit ? hit.last : "-"
  end

  residues.uniq.join('/')
end
|
269
|
+
|
270
|
+
# parse ambiguity bases, aka %w{W S M K R Y B D H V N}
|
271
|
+
|
272
|
+
# Expand one nucleotide code into the list of bases it can stand for.
#
# base - a single upper-case character: A, T, C, G or an IUPAC ambiguity code
#        (W S M K R Y B D H V N).
#
# Returns an Array of one-letter Strings, or an empty Array for anything else
# (gaps, lower-case letters, nil, multi-character strings).
def self.to_list(base = "")
  expansions = {
    "A" => ["A"],
    "T" => ["T"],
    "C" => ["C"],
    "G" => ["G"],
    "W" => ["A", "T"],
    "S" => ["C", "G"],
    "M" => ["A", "C"],
    "K" => ["G", "T"], # keto: G or T (not G/C, which is "S")
    "R" => ["A", "G"],
    "Y" => ["C", "T"],
    "B" => ["C", "G", "T"],
    "D" => ["A", "G", "T"],
    "H" => ["A", "C", "T"],
    "V" => ["A", "C", "G"],
    "N" => ["A", "T", "C", "G"]
  }
  expansions.fetch(base, [])
end
|
302
|
+
|
303
|
+
# ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
|
304
|
+
# collapse sequence hash to unique sequence hash.
|
305
|
+
# input_sequence_hash is a sequence hash {:name => :sequence, ...}
|
306
|
+
# master_sequence_tag is the master tag for unique sequences
|
307
|
+
# sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
|
308
|
+
|
309
|
+
# Collapse a sequence hash {name => sequence} into its unique sequences.
# Each unique sequence is renamed ">" + sequence_name + "_" + running number
# (starting at 1) + "_" + the count reported by ViralSeq.count.
# Returns a new Hash {new_name => sequence}.
def self.uniq_sequence_hash(seq = {}, sequence_name = "sequence")
  collapsed = {}
  ViralSeq.count(seq.values).each_with_index do |(sequence, copies), index|
    tag = ">" + sequence_name + "_" + (index + 1).to_s + "_" + copies.to_s
    collapsed[tag] = sequence
  end
  collapsed
end
|
320
|
+
|
321
|
+
# input a sequence hash, return a sequence hash with stop codons.
|
322
|
+
def self.stop_codon_seq_hash(seq_hash, rf = 0)
|
323
|
+
out_seq_hash = {}
|
324
|
+
seq_hash.each do |k,v|
|
325
|
+
sequence = Sequence.new(k,v)
|
326
|
+
sequence.get_aa_array(rf)
|
327
|
+
if sequence.aa_array.include?("*")
|
328
|
+
out_seq_hash[k] = v
|
329
|
+
end
|
330
|
+
end
|
331
|
+
return out_seq_hash
|
332
|
+
end
|
333
|
+
|
334
|
+
end
|
335
|
+
|
336
|
+
# functions added to Class::String for direct operation on sequence if it is a String object
|
337
|
+
# String.rc
|
338
|
+
# # reverse complement
|
339
|
+
# # example
|
340
|
+
# "ACAGA".rc
|
341
|
+
# => "TCTGT"
|
342
|
+
#
|
343
|
+
# String.mutation(error_rate)
|
344
|
+
# # mutate a nt sequence (String class) randomly
|
345
|
+
# # must define error rate, default value 0.01, aka 1%
|
346
|
+
# =USAGE
|
347
|
+
# # example
|
348
|
+
# seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
|
349
|
+
# seq.mutation(0.05)
|
350
|
+
# => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
|
351
|
+
#
|
352
|
+
# String.nt_parser
|
353
|
+
# # parse the nucleotide sequences as a String object and return a Regexp object for possible matches
|
354
|
+
# =USAGE
|
355
|
+
# "ATRWCG".nt_parser
|
356
|
+
# => /AT[A|G][A|T]CG/
|
357
|
+
|
358
|
+
# Sequence helpers added directly to String.
class String
  # Reverse complement of a nucleotide String.
  # Only upper-case A/C/T/G are complemented; other characters are kept.
  def rc
    reverse.tr("ACTG", "TGAC")
  end

  # Return a copy in which each position is independently replaced, with
  # probability error_rate, by a randomly chosen different base from A/C/T/G.
  # The rate is resolved in steps of 1/10000.
  def mutation(error_rate = 0.01)
    threshold = error_rate * 10000
    each_char.map do |nt|
      if rand(10000) < threshold
        (["A", "C", "T", "G"] - [nt]).sample
      else
        nt
      end
    end.join
  end

  # Build a Regexp that matches every sequence this String can stand for.
  # Ambiguity codes become character classes, e.g.
  #   "ATRWCG".nt_parser # => /AT[A|G][A|T]CG/
  # Characters that ViralSeq.to_list cannot expand (gaps, lower case, ...) are
  # matched literally; previously they produced an empty character class and
  # Regexp.new raised a RegexpError.
  # NOTE(review): the "|" inside each class is kept to preserve the documented
  # Regexp source, but it also lets a literal "|" match at ambiguous positions.
  def nt_parser
    pattern = each_char.map do |base|
      options = ViralSeq.to_list(base)
      case options.size
      when 0 then Regexp.escape(base)
      when 1 then options[0]
      else "[" + options.join("|") + "]"
      end
    end.join
    Regexp.new(pattern)
  end
end
|
@@ -0,0 +1,556 @@
|
|
1
|
+
# viral_seq/tcs_core
|
2
|
+
# core functions for TCS and DR pipeline
|
3
|
+
# functions to manipulate sequences including:
|
4
|
+
# ViralSeq::calculate_pid_cut_off
|
5
|
+
# ViralSeq::consensus
|
6
|
+
# ViralSeq::generate_primer_id_pool
|
7
|
+
# ViralSeq::similar_pid?
|
8
|
+
# ViralSeq::filter_similar_pid
|
9
|
+
# ViralSeq::collapse_sequence_by_x_nt_difference
|
10
|
+
# ViralSeq::compare_two_seq
|
11
|
+
# ViralSeq::gap_strip
|
12
|
+
# ViralSeq::gap_strip_ends
|
13
|
+
# ViralSeq::paired_join1
|
14
|
+
# ViralSeq::paired_join2
|
15
|
+
|
16
|
+
# ViralSeq.calculate_pid_cut_off(PID_abundance, estimated_error_rate)
|
17
|
+
# # A function to calculate the cut-off for offspring primer IDs.
|
18
|
+
# # see reference at Zhou et al. JVI 2016.
|
19
|
+
# # https://www.ncbi.nlm.nih.gov/pubmed/26041299
|
20
|
+
# # PID_abundance is the abundance of a certain PID
|
21
|
+
# # estimated_error_rate is the estimated platform error rate, 0.02 (2%) as default
|
22
|
+
# # the model supports error rate from 0.003 to 0.03.
|
23
|
+
# # return an abundance cut-off (Integer) for offspring Primer IDs.
|
24
|
+
|
25
|
+
# ViralSeq.consensus(seq_array, majority_cutoff)
|
26
|
+
# # Generate a consensus sequence from a given sequence array.
|
27
|
+
# # where seq_array is an Array of input sequences (aligned) [seq1, seq2, seq3, ...]
|
28
|
+
# # majority_cutoff is a Float of majority cut-off. default as simply majority (0.5)
|
29
|
+
# =USAGE
|
30
|
+
# a_consensus_sequence = ViralSeq.consensus(seq_array, majority_cutoff)
|
31
|
+
|
32
|
+
# ViralSeq.generate_primer_id_pool(n)
|
33
|
+
# # generate all Primer ID combinations given the length of Primer ID
|
34
|
+
# # n is the length of the Primer ID (Integer). default value of n is 8.
|
35
|
+
# =USAGE
|
36
|
+
# primer_id_pool = ViralSeq.generate_primer_id_pool(10) # 10 is the length of Primer ID
|
37
|
+
# puts primer_id_pool.size #should be 4^10
|
38
|
+
# => 1048576
|
39
|
+
|
40
|
+
# ViralSeq.similar_pid?(pid1, pid2, base_difference)
|
41
|
+
# # compare two primer ID sequences.
|
42
|
+
# # If they differ by no more than base_difference bases, return boolean value true,
|
43
|
+
# # else, return boolean value "FALSE"
|
44
|
+
# # where pid1 and pid2 are two Primer IDs for comparison
|
45
|
+
# # base_difference is an Integer for difference bases that allowed
|
46
|
+
# =USAGE
|
47
|
+
# # example
|
48
|
+
# ViralSeq.similar_pid?("AAGGCTACGA", "AAGGATACGA", 1)
|
49
|
+
# => true
|
50
|
+
|
51
|
+
# ViralSeq.filter_similar_pid(sequence_fasta_file, cut_off)
|
52
|
+
# # compare PID with sequences which have identical sequences.
|
53
|
+
# # PIDs differ by 1 base will be recognized.
|
54
|
+
# # if PID1 is x times (cut-off) more abundant than PID2, PID2 will be discarded
|
55
|
+
# # where sequence_fasta_file is the sequence file in fasta format
|
56
|
+
# # each sequence tag starting with ">" and the Primer ID sequence
|
57
|
+
# # followed by the number of Primer ID appeared in the raw sequence
|
58
|
+
# # the information sections in the tags are separated by underscore "_"
|
59
|
+
# # example sequence tag: >AGGCGTAGA_32_sample1_RT
|
60
|
+
# # cut_off is the fold cut-off to remove the potential residual offspring Primer IDs
|
61
|
+
# # default value for cut_off is 10
|
62
|
+
# # return a new sequence hash. {sequence_name => sequence, ...}
|
63
|
+
|
64
|
+
# ViralSeq.collapse_sequence_by_x_nt_difference(sequence_array, cutoff)
|
65
|
+
# # collapse sequences with x number of nt differences.
|
66
|
+
# # input an Array object of sequences, make sure sequences are aligned.
|
67
|
+
# # return a new Array object of collapsed sequences
|
68
|
+
# # The returned frequency is NOT the frequency of the collapsed sequences.
|
69
|
+
|
70
|
+
# ViralSeq.compare_two_seq(seq1, seq2)
|
71
|
+
# # compare two sequences as String object, return the number of differences as integer
|
72
|
+
# # sequences will NOT align
|
73
|
+
# # can use ViralSeq.muscle_align(seq1, seq2) to get the aligned sequences
|
74
|
+
# =USAGE
|
75
|
+
# # example
|
76
|
+
# seq1 = 'AAGGCGTAGGAC'
|
77
|
+
# seq2 = 'AAGCTTAGGACG'
|
78
|
+
# puts ViralSeq.compare_two_seq(seq1, seq2)
|
79
|
+
# => 8
|
80
|
+
# aligned_seqs = ViralSeq.muscle_align(seq1,seq2)
|
81
|
+
# puts ViralSeq.compare_two_seq(aligned_seqs.values[0], aligned_seqs.values[1])
|
82
|
+
# => 4
|
83
|
+
|
84
|
+
# ViralSeq.gap_strip(sequence_hash)
|
85
|
+
# # strip positions with gaps in the sequence alignment as Hash object {:name => sequence, ...}
|
86
|
+
# =USAGE
|
87
|
+
# # example
|
88
|
+
# sequence_hash = {'>seq1' => 'AACCGGTT',
|
89
|
+
# '>seq2' => 'A-CCGGTT',
|
90
|
+
# '>seq3' => 'AAC-GGTT',
|
91
|
+
# '>seq4' => 'AACCG-TT',
|
92
|
+
# '>seq5' => 'AACCGGT-'}
|
93
|
+
# ViralSeq.gap_strip(sequence_hash)
|
94
|
+
# => {">seq1"=>"ACGT", ">seq2"=>"ACGT", ">seq3"=>"ACGT", ">seq4"=>"ACGT", ">seq5"=>"ACGT"}
|
95
|
+
|
96
|
+
# ViralSeq.gap_strip_ends(sequence_hash)
|
97
|
+
# # similar to ViralSeq.gap_strip , but only strip the gaps at both ends of the alignment
|
98
|
+
# =USAGE
|
99
|
+
# # example
|
100
|
+
# sequence_hash = {'>seq1' => 'AACCGGTT',
|
101
|
+
# '>seq2' => 'A-CCGGTT',
|
102
|
+
# '>seq3' => 'AAC-GGTT',
|
103
|
+
# '>seq4' => 'AACCG-TT',
|
104
|
+
# '>seq5' => 'AACCGGT-'}
|
105
|
+
# ViralSeq.gap_strip_ends(sequence_hash)
|
106
|
+
# => {">seq1"=>"AACCGGT", ">seq2"=>"A-CCGGT", ">seq3"=>"AAC-GGT", ">seq4"=>"AACCG-T", ">seq5"=>"AACCGGT"}
|
107
|
+
|
108
|
+
# ViralSeq.paired_join1(sequence_pair_hash, overlap, difference_cut_off)
|
109
|
+
# # paired-end join function for a KNOWN overlap size
|
110
|
+
# # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
|
111
|
+
# # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
|
112
|
+
# # overlap is an integer that indicate how many bases are overlapped.
|
113
|
+
# # overlap value at 0 means no overlap. R1 and R2 will be simply put together.
|
114
|
+
# # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
|
115
|
+
# # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
|
116
|
+
# =USAGE
|
117
|
+
# # example
|
118
|
+
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
119
|
+
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
120
|
+
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
121
|
+
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
122
|
+
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
123
|
+
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
124
|
+
# ViralSeq.paired_join1(paired_seqs, 100, 0.0).keys
|
125
|
+
# => [">pair1"]
|
126
|
+
# ViralSeq.paired_join1(paired_seqs, 100, 0.01).keys
|
127
|
+
# => [">pair1", ">pair2"]
|
128
|
+
# ViralSeq.paired_join1(paired_seqs, 100, 0.02).keys
|
129
|
+
# => [">pair1", ">pair2", ">pair3"]
|
130
|
+
|
131
|
+
# ViralSeq.paired_join2(seq_pair_hash, model, diff)
|
132
|
+
# # paired-end join function for an UNKNOWN overlap size
|
133
|
+
# # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
|
134
|
+
# # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
|
135
|
+
# # model has two options, 1 or 2 as Integer
|
136
|
+
# # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
137
|
+
# # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
138
|
+
# # minimal overlap by model 2 set to 4 positions
|
139
|
+
# # if the sequence overlap is smaller than 4 bases, the model treats the pair as having no overlap.
|
140
|
+
# # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
|
141
|
+
# # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
|
142
|
+
# =USAGE
|
143
|
+
# # example 1
|
144
|
+
# paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
145
|
+
# "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
146
|
+
# ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
147
|
+
# "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
|
148
|
+
# ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
149
|
+
# "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
|
150
|
+
# ViralSeq.paired_join2(paired_seqs, 1).keys
|
151
|
+
# => [">pair1"]
|
152
|
+
# ViralSeq.paired_join2(paired_seqs, 1, 0.01).keys
|
153
|
+
# => [">pair1", ">pair2"]
|
154
|
+
#
|
155
|
+
# # example 2
|
156
|
+
# paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
|
157
|
+
# ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
|
158
|
+
# ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"]
|
159
|
+
# }
|
160
|
+
# ViralSeq.paired_join2(paired_seq2, 1)
|
161
|
+
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
162
|
+
# ViralSeq.paired_join2(paired_seq2, 2)
|
163
|
+
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
164
|
+
|
165
|
+
|
166
|
+
module ViralSeq
|
167
|
+
|
168
|
+
# calculate cut-off for offspring primer IDs.
|
169
|
+
# Abundance cut-off for offspring Primer IDs.
#
# m          - abundance of the Primer ID in question
# error_rate - estimated platform error rate, 0 up to and including 0.03
#
# Returns an Integer, never lower than 2 (always 2 when m <= 10).
# Raises ArgumentError when error_rate lies outside the supported range.
def self.calculate_pid_cut_off(m, error_rate = 0.02)
  return 2 if m <= 10

  # One fitted polynomial per error-rate band.
  estimate =
    case error_rate
    when 0...0.0075
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    when 0.0075...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    when 0.015..0.03
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        0.0079 * m + 9.4869
      end
    else
      raise ArgumentError, 'Error_rate has be between 0 to 0.03'
    end

  rounded = estimate.round
  rounded < 3 ? 2 : rounded
end
|
192
|
+
|
193
|
+
# create one consensus sequence from a sequence array with an optional majority cut-off for mixed bases.
|
194
|
+
# example:
|
195
|
+
# position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
|
196
|
+
# Build one consensus sequence from an Array of aligned sequences.
#
# seq_array - Array of Strings; the first one defines the consensus length
# cutoff    - minimum fraction of sequences a base needs at a position to be
#             included in the call (default 0.5)
#
# At every position the bases meeting the cut-off are passed to
# ViralSeq.call_consensus_base, which returns the character to write.
def self.consensus(seq_array, cutoff = 0.5)
  total = seq_array.size.to_f
  seq_array[0].size.times.map do |position|
    column = seq_array.map { |sequence| sequence[position] }
    called = ViralSeq.count(column).each_with_object([]) do |(base, occurrences), kept|
      kept << base if occurrences / total >= cutoff
    end
    ViralSeq.call_consensus_base(called)
  end.join
end
|
217
|
+
|
218
|
+
# call consensus nucleotide, used by ViralSeq.consensus
|
219
|
+
# Turn the bases called at one alignment position into a single character.
#
# base_array - Array of one-letter Strings
#
# Returns the base itself when there is exactly one, the IUPAC ambiguity code
# for a recognised pair or triple of A/C/G/T, and "N" in every other case
# (no bases, four bases, or an unrecognised combination).
# The argument is not modified (a sorted copy is used for the lookup).
def self.call_consensus_base(base_array)
  return base_array[0] if base_array.size == 1

  iupac_codes = {
    ["A", "T"] => "W",
    ["C", "G"] => "S",
    ["A", "C"] => "M",
    ["G", "T"] => "K",
    ["A", "G"] => "R",
    ["C", "T"] => "Y",
    ["C", "G", "T"] => "B",
    ["A", "G", "T"] => "D",
    ["A", "C", "T"] => "H",
    ["A", "C", "G"] => "V"
  }
  iupac_codes.fetch(base_array.sort, "N")
end
|
257
|
+
|
258
|
+
# generate all Primer ID combinations given the length of Primer ID
|
259
|
+
# Generate every Primer ID of length l over the alphabet A, T, C, G.
# Returns an Array of Strings; the pool is extended by one base per round,
# starting from the four single bases.
def self.generate_primer_id_pool(l=8)
  alphabet = ['A', 'T', 'C', 'G']
  (l - 1).times.reduce(alphabet) do |pool, _round|
    pool.product(alphabet).map { |prefix, base| prefix + base }
  end
end
|
270
|
+
|
271
|
+
# compare two primer ID sequences.
|
272
|
+
# If they differ in no more than x bases, return boolean value true,
|
273
|
+
# else, return boolean value "FALSE"
|
274
|
+
# Tell whether two Primer IDs of equal length differ in at most x positions.
# Primer IDs of different lengths are never considered similar.
def self.similar_pid?(pid1="", pid2="", x=0)
  return false if pid1.size != pid2.size

  identical = (0...pid1.size).count { |index| pid1[index] == pid2[index] }
  identical >= pid1.size - x
end
|
293
|
+
|
294
|
+
# compare PID with sequences which have identical sequences.
|
295
|
+
# PIDs differ by 1 base will be recognized.
|
296
|
+
# if PID1 is x times more abundant than PID2, PID2 will be discarded
|
297
|
+
# Remove likely offspring Primer IDs from a FASTA file of consensus sequences.
#
# sequence_file - path handed to ViralSeq.fasta_to_hash; each tag is expected
#                 to look like ">PID_count_..." (fields separated by "_")
# cutoff        - fold difference in count above which the rarer of two
#                 similar Primer IDs is dropped (default 10)
#
# Among entries that share an identical sequence, Primer IDs that
# ViralSeq.similar_pid? reports as differing by at most one base are compared.
# Returns a new Hash {name => sequence} without the dropped Primer IDs.
def self.filter_similar_pid(sequence_file = "", cutoff = 10)
  seq = ViralSeq.fasta_to_hash(sequence_file)
  offspring = []

  seq.group_by { |_tag, nucleotides| nucleotides }.each_value do |members|
    next if members.size == 1

    # PID => count (as written in the tag); a repeated PID keeps its last count.
    abundance = members.each_with_object({}) do |(tag, _nucleotides), table|
      fields = tag[1..-1].split("_")
      table[fields[0]] = fields[1]
    end

    abundance.keys.combination(2).each do |pid1, pid2|
      next unless ViralSeq.similar_pid?(pid1, pid2, 1)

      n1 = abundance[pid1].to_i
      n2 = abundance[pid2].to_i
      if n1 >= cutoff * n2
        offspring << pid2
      elsif n2 >= cutoff * n1
        offspring << pid1
      end
    end
  end

  seq.each_with_object({}) do |(tag, nucleotides), kept|
    pid = tag.split("_")[0][1..-1]
    kept[tag] = nucleotides unless offspring.include?(pid)
  end
end
|
356
|
+
|
357
|
+
# collapse sequences with x number of nt differences. make sure sequences are aligned.
|
358
|
+
# The returned frequency is NOT the frequency of the collapsed sequences.
|
359
|
+
# Collapse aligned sequences that differ from a more frequent sequence by at
# most cutoff positions.
#
# seq_array - Array of aligned sequences (Strings)
# cutoff    - maximum number of differences (per ViralSeq.compare_two_seq)
#             for two sequences to be treated as the same
#
# Returns a Hash {sequence => count} of the surviving sequences. Counts are
# the original ones from ViralSeq.count; counts of removed sequences are not
# added to the survivor. On equal counts the second sequence of the pair is
# the one removed.
def self.collapse_sequence_by_x_nt_difference(seq_array,cutoff)
  seq_freq = ViralSeq.count(seq_array)
  return seq_freq if seq_freq.size == 1

  absorbed = []
  seq_freq.keys.combination(2).each do |seq1, seq2|
    if ViralSeq.compare_two_seq(seq1, seq2) <= cutoff
      absorbed << (seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1)
    end
  end

  seq_freq.each_with_object({}) do |(sequence, freq), survivors|
    survivors[sequence] = freq unless absorbed.include?(sequence)
  end
end
|
387
|
+
|
388
|
+
|
389
|
+
# compare two sequences, return the number of different positions, NO NEED alignment
|
390
|
+
|
391
|
+
# Count the positions at which two sequences differ, comparing index by index
# over the length of seq1. The sequences are not aligned here. Positions where
# seq2 has already ended count as differences; extra bases in seq2 are ignored.
def self.compare_two_seq(seq1 = "", seq2 = "")
  (0...seq1.size).count { |position| !(seq1[position] == seq2[position]) }
end
|
401
|
+
|
402
|
+
# gap strip from a sequence alignment
|
403
|
+
|
404
|
+
# Remove every alignment column in which at least one sequence has a gap ("-").
#
# sequence_alignment - Hash {name => aligned sequence}; the first sequence
#                      defines the number of columns
#
# Returns a new Hash {name => sequence without the gapped columns}.
def self.gap_strip(sequence_alignment)
  rows = sequence_alignment.values
  gap_free_columns = (0...rows[0].size).reject do |column|
    rows.any? { |row| row[column] == "-" }
  end

  sequence_alignment.each_with_object({}) do |(tag, row), stripped|
    stripped[tag] = gap_free_columns.reduce("") { |kept, column| kept + row[column] }
  end
end
|
426
|
+
|
427
|
+
# gap strip from a sequence alignment, only strip the gaps at the ends of the alignment
|
428
|
+
|
429
|
+
# Trim only the leading and trailing alignment columns that contain a gap
# ("-") in at least one sequence; gaps inside the alignment are kept.
#
# sequence_alignment - Hash {name => aligned sequence}; the first sequence
#                      defines the number of columns
#
# Returns a new Hash {name => trimmed sequence}.
def self.gap_strip_ends(sequence_alignment)
  rows = sequence_alignment.values
  gapped = (0...rows[0].size).map do |column|
    rows.any? { |row| row[column] == "-" }
  end

  leading = gapped.take_while { |has_gap| has_gap }.size
  trailing = gapped.reverse.take_while { |has_gap| has_gap }.size

  sequence_alignment.each_with_object({}) do |(tag, row), trimmed|
    trimmed[tag] = row[leading..(-trailing - 1)]
  end
end
|
464
|
+
|
465
|
+
# input paired-end sequence hash format seq_name => [r1_seq, r2_seq]
|
466
|
+
# overlap is pre-determined
|
467
|
+
# Join paired-end reads when the overlap size is known.
#
# seq_pair_hash - Hash {name => [r1_seq, r2_seq]}
# overlap       - Integer number of overlapping bases; 0 concatenates R1 and R2
# diff          - maximum mismatch rate allowed in the overlap (default 0.0)
#
# Returns a Hash {name => joined sequence}; pairs whose overlap has more than
# overlap * diff mismatches (per ViralSeq.compare_two_seq) are left out.
# Raises ArgumentError when overlap is not an Integer or diff is not numeric.
def self.paired_join1(seq_pair_hash, overlap, diff = 0.0)
  unless overlap.is_a?(Integer)
    raise ArgumentError, ":overlap has to be Integer, input #{overlap} invalid."
  end
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid."
  end

  seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
    forward = seq_pair[0]
    reverse = seq_pair[1]
    if overlap.zero?
      joined[seq_name] = forward + reverse
    elsif ViralSeq.compare_two_seq(forward[-overlap..-1], reverse[0, overlap]) <= (overlap * diff)
      # R2 contributes only the part after the overlap.
      joined[seq_name] = forward + reverse[overlap..-1]
    end
  end
end
|
484
|
+
|
485
|
+
|
486
|
+
# overlap is not predetermined
|
487
|
+
# model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
|
488
|
+
# model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
|
489
|
+
# Join paired-end reads when the overlap size is not known in advance.
#
# seq_pair_hash - Hash {name => [r1_seq, r2_seq]}
# model         - 1: one overlap size for all pairs, taken from
#                    ViralSeq.determine_overlap_pid_pair, then joined with
#                    ViralSeq.paired_join1
#                 2: overlap chosen per pair as the largest size from
#                    ViralSeq.overlap_matrix within the mismatch allowance;
#                    pairs with no acceptable overlap are concatenated
# diff          - maximum mismatch rate allowed in the overlap (default 0.0)
#
# Returns a Hash {name => joined sequence}. Any ArgumentError raised on the
# way (bad diff, bad model, or from the helpers) is printed and nil is returned.
def self.paired_join2(seq_pair_hash, model = 1, diff = 0.0)
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid."
  end

  case model
  when 1
    shared_overlap = ViralSeq.determine_overlap_pid_pair(seq_pair_hash, diff)
    ViralSeq.paired_join1(seq_pair_hash, shared_overlap, diff)
  when 2
    seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
      acceptable = ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1])
                           .select { |size, mismatches| mismatches <= size * diff }
                           .keys
      joined[seq_name] =
        if acceptable.empty?
          seq_pair[0] + seq_pair[1]
        else
          seq_pair[0] + seq_pair[1][acceptable.max..-1]
        end
    end
  else
    raise ArgumentError, "Error::Wrong Overlap Model Argument. Given \'#{model}\', expected '1' or '2'."
  end
rescue ArgumentError => e
  puts e
  nil
end
|
519
|
+
|
520
|
+
# determine overlap size from a paired sequence Hash object
|
521
|
+
# Determine a single overlap size for a whole set of read pairs.
#
# seq_pair_hash - Hash {name => [r1_seq, r2_seq]}
# diff          - maximum mismatch rate allowed in the overlap (default 0.0)
#
# For each pair the largest overlap from ViralSeq.overlap_matrix within the
# mismatch allowance is taken (0 when there is none). Returns the overlap
# size found most often; on a tie the larger size wins. Returns nil for an
# empty input.
def self.determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
  best_per_pair = seq_pair_hash.map do |_seq_name, seq_pair|
    acceptable = ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1])
                         .select { |size, mismatches| mismatches <= size * diff }
                         .keys
    acceptable.empty? ? 0 : acceptable.max
  end

  votes = ViralSeq.count(best_per_pair)
  most_votes = votes.values.max
  votes.select { |_size, tally| tally == most_votes }.keys.max
end
|
542
|
+
|
543
|
+
# input a pair of sequences as String, return a Hash object of overlapping Hash object
|
544
|
+
# {:overlap_size => number_of_different_positions, ...}
|
545
|
+
# {minimal overlap set to 4. }
|
546
|
+
# Compare the end of sequence1 with the start of sequence2 for every candidate
# overlap size.
#
# sequence1, sequence2 - Strings (R1 and R2 of one read pair)
#
# Returns a Hash {overlap_size => number of differing positions}, as counted
# by ViralSeq.compare_two_seq, for sizes from 4 up to the length of the
# shorter sequence. The Hash is empty when either sequence is shorter than 4.
def self.overlap_matrix(sequence1, sequence2)
  min_overlap = 4
  # An overlap cannot be longer than the shorter read. Going up to the longer
  # one made sequence1[-overlap..-1] nil whenever sequence2 was the longer read.
  max_overlap = [sequence1.size, sequence2.size].min
  (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
    matrix_hash[overlap] = ViralSeq.compare_two_seq(sequence1[-overlap..-1], sequence2[0, overlap])
  end
end
|
555
|
+
|
556
|
+
end
|