viral_seq 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,556 +0,0 @@
1
- # viral_seq/tcs_core
2
- # core functions for TCS and DR pipeline
3
- # functions to manipulate sequences including:
4
- # ViralSeq::calculate_pid_cut_off
5
- # ViralSeq::consensus
6
- # ViralSeq::generate_primer_id_pool
7
- # ViralSeq::similar_pid?
8
- # ViralSeq::filter_similar_pid
9
- # ViralSeq::collapse_sequence_by_x_nt_difference
10
- # ViralSeq::compare_two_seq
11
- # ViralSeq::gap_strip
12
- # ViralSeq::gap_strip_ends
13
- # ViralSeq::paired_join1
14
- # ViralSeq::paired_join2
15
-
16
- # ViralSeq.calculate_pid_cut_off(PID_abundance, estimated_error_rate)
17
- # # A function to calculate cut-off for offspring primer IDs.
18
- # # see reference at Zhou et al. JVI 2016.
19
- # # https://www.ncbi.nlm.nih.gov/pubmed/26041299
20
- # # PID_abundance is the abundance of a certain PID
21
- # # estimated_error_rate is the estimated platform error rate, 0.02 (2%) as default
22
- # # the model supports error rate from 0.003 to 0.03.
23
- # # return an abundance cut-off (Integer) for offspring Primer IDs.
24
-
25
- # ViralSeq.consensus(seq_array, majority_cutoff)
26
- # # Generate a consensus sequence from a given sequence array.
27
- # # where seq_array is an Array of input sequences (aligned) [seq1, seq2, seq3, ...]
28
- # # majority_cutoff is a Float of majority cut-off. default as simply majority (0.5)
29
- # =USAGE
30
- # a_consensus_sequence = ViralSeq.consensus(seq_array, majority_cutoff)
31
-
32
- # ViralSeq.generate_primer_id_pool(n)
33
- # # generate all Primer ID combinations given the length of Primer ID
34
- # # n is the length of the Primer ID (Integer). default value of n is 8.
35
- # =USAGE
36
- # primer_id_pool = ViralSeq.generate_primer_id_pool(10) # 10 is the length of Primer ID
37
- # puts primer_id_pool.size #should be 4^10
38
- # => 1048576
39
-
40
- # ViralSeq.similar_pid?(pid1, pid2, base_difference)
41
- # # compare two primer ID sequences.
42
- # # If they have the same length and differ by no more than base_difference bases, return boolean value "TRUE",
42
- # # else, return boolean value "FALSE"
44
- # # where pid1 and pid2 are two Primer IDs for comparison
45
- # # base_difference is an Integer for difference bases that allowed
46
- # =USAGE
47
- # # example
48
- # ViralSeq.similar_pid?("AAGGCTACGA", "AAGGATACGA", 1)
49
- # => true
50
-
51
- # ViralSeq.filter_similar_pid(sequence_fasta_file, cut_off)
52
- # # compare PID with sequences which have identical sequences.
53
- # # PIDs differ by 1 base will be recognized.
54
- # # if PID1 is x times (cut-off) greater than PID2, PID2 will be discarded
55
- # # where sequence_fasta_file is the sequence file in fasta format
56
- # # each sequence tag starting with ">" and the Primer ID sequence
57
- # # followed by the number of Primer ID appeared in the raw sequence
58
- # # the information sections in the tags are separated by underscore "_"
59
- # # example sequence tag: >AGGCGTAGA_32_sample1_RT
60
- # # cut_off is the fold cut-off to remove the potential residual offspring Primer IDs
61
- # # default value for cut_off is 10
62
- # # return a new sequence hash. {sequence_name => sequence, ...}
63
-
64
- # ViralSeq.collapse_sequence_by_x_nt_difference(sequence_array, cutoff)
65
- # # collapse sequences with x number of nt differences.
66
- # # input an Array object of sequences, make sure sequences are aligned.
67
- # # return a new Hash object {sequence => frequency, ...} of the sequences kept after collapsing
68
- # # The return frequency is NOT the frequency of the collapsed sequences.
69
-
70
- # ViralSeq.compare_two_seq(seq1, seq2)
71
- # # compare two sequences as String object, return the number of differences as integer
72
- # # sequences will NOT align
73
- # # can use ViralSeq.muscle_align(seq1, seq2) to get the aligned sequences
74
- # =USAGE
75
- # # example
76
- # seq1 = 'AAGGCGTAGGAC'
77
- # seq2 = 'AAGCTTAGGACG'
78
- # puts ViralSeq.compare_two_seq(seq1, seq2)
79
- # => 8
80
- # aligned_seqs = ViralSeq.muscle_align(seq1,seq2)
81
- # puts ViralSeq.compare_two_seq(aligned_seqs.values[0], aligned_seqs.values[1])
82
- # => 4
83
-
84
- # ViralSeq.gap_strip(sequence_hash)
85
- # # strip positions with gaps in the sequence alignment as Hash object {:name => sequence, ...}
86
- # =USAGE
87
- # # example
88
- # sequence_hash = {'>seq1' => 'AACCGGTT',
89
- # '>seq2' => 'A-CCGGTT',
90
- # '>seq3' => 'AAC-GGTT',
91
- # '>seq4' => 'AACCG-TT',
92
- # '>seq5' => 'AACCGGT-'}
93
- # ViralSeq.gap_strip(sequence_hash)
94
- # => {">seq1"=>"ACGT", ">seq2"=>"ACGT", ">seq3"=>"ACGT", ">seq4"=>"ACGT", ">seq5"=>"ACGT"}
95
-
96
- # ViralSeq.gap_strip_ends(sequence_hash)
97
- # # similar to ViralSeq.gap_strip , but only strip the gaps at both ends of the alignment
98
- # =USAGE
99
- # # example
100
- # sequence_hash = {'>seq1' => 'AACCGGTT',
101
- # '>seq2' => 'A-CCGGTT',
102
- # '>seq3' => 'AAC-GGTT',
103
- # '>seq4' => 'AACCG-TT',
104
- # '>seq5' => 'AACCGGT-'}
105
- # ViralSeq.gap_strip_ends(sequence_hash)
106
- # => {">seq1"=>"AACCGGT", ">seq2"=>"A-CCGGT", ">seq3"=>"AAC-GGT", ">seq4"=>"AACCG-T", ">seq5"=>"AACCGGT"}
107
-
108
- # ViralSeq.paired_join1(sequence_pair_hash, overlap, difference_cut_off)
109
- # # pair-end join function for KNOWN overlap size
110
- # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
111
- # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
112
- # # overlap is an integer that indicate how many bases are overlapped.
113
- # # overlap value at 0 means no overlap. R1 and R2 will be simply put together.
114
- # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
115
- # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
116
- # =USAGE
117
- # # example
118
- # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
119
- # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
120
- # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
121
- # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
122
- # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
123
- # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
124
- # ViralSeq.paired_join1(paired_seqs, 100, 0.0).keys
125
- # => [">pair1"]
126
- # ViralSeq.paired_join1(paired_seqs, 100, 0.01).keys
127
- # => [">pair1", ">pair2"]
128
- # ViralSeq.paired_join1(paired_seqs, 100, 0.02).keys
129
- # => [">pair1", ">pair2", ">pair3"]
130
-
131
- # ViralSeq.paired_join2(seq_pair_hash, model, diff)
132
- # # pair-end join function for UNKNOWN overlap
133
- # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
134
- # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
135
- # # model has two options, 1 or 2 as Integer
136
- # # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
137
- # # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
138
- # # minimal overlap by model 2 set to 4 positions
139
- # # if the sequence overlap is smaller than 4 bases, the model will consider it as no overlap.
140
- # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
141
- # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
142
- # =USAGE
143
- # # example 1
144
- # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
145
- # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
146
- # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
147
- # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
148
- # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
149
- # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
150
- # ViralSeq.paired_join2(paired_seqs, 1).keys
151
- # => [">pair1"]
152
- # ViralSeq.paired_join2(paired_seqs, 1, 0.01).keys
153
- # => [">pair1", ">pair2"]
154
- #
155
- # # example 2
156
- # paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
157
- # ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
158
- # ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"]
159
- # }
160
- # ViralSeq.paired_join2(paired_seq2, 1)
161
- # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
162
- # ViralSeq.paired_join2(paired_seq2, 2)
163
- # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
164
-
165
-
166
module ViralSeq

  # Calculate the abundance cut-off for offspring Primer IDs
  # (model referenced in this file's header: Zhou et al.).
  #
  # @param m [Integer] abundance of the Primer ID
  # @param error_rate [Numeric] estimated platform error rate, from 0 to 0.03
  # @return [Integer] abundance cut-off; never smaller than 2
  # @raise [ArgumentError] if error_rate is outside 0..0.03
  def self.calculate_pid_cut_off(m, error_rate = 0.02)
    return 2 if m <= 10

    # Each error-rate band has its own fitted polynomial; the coefficients
    # are kept exactly as originally written so results are unchanged.
    n = case error_rate
        when 0...0.0075
          -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
        when 0.0075...0.015
          1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
        when 0.015..0.03
          if m <= 8500
            -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
          else
            # linear model for abundances above 8500
            0.0079 * m + 9.4869
          end
        else
          raise ArgumentError, 'Error_rate has to be between 0 and 0.03'
        end

    n = n.round
    n < 3 ? 2 : n
  end

  # Create one consensus sequence from an array of aligned sequences.
  # A base is called at a position when its fraction among all sequences is
  # at least +cutoff+; when several bases qualify an ambiguity code is used.
  # Example: a position with 15% "A" and 85% "G" is called "G" with a 20%
  # cut-off and "R" with a 10% cut-off.
  #
  # @param seq_array [Array<String>] aligned sequences; the length of the
  #   first sequence determines the consensus length
  # @param cutoff [Float] majority cut-off (default 0.5)
  # @return [String] consensus sequence; "" for an empty seq_array
  def self.consensus(seq_array, cutoff = 0.5)
    return "" if seq_array.empty?

    seq_size = seq_array.size.to_f
    called = (0...seq_array[0].size).map do |position|
      all_base = seq_array.map { |seq| seq[position] }
      max_base_list = []
      ViralSeq.count(all_base).each do |base, number|
        max_base_list << base if number / seq_size >= cutoff
      end
      ViralSeq.call_consensus_base(max_base_list)
    end
    called.join
  end

  # Call the consensus nucleotide for a list of bases; used by
  # ViralSeq.consensus.
  #
  # @param base_array [Array<String>] bases that passed the majority cut-off
  # @return [String] the base itself for a single base, the matching
  #   ambiguity code for two or three of A/C/G/T, otherwise "N"
  def self.call_consensus_base(base_array)
    case base_array.size
    when 1
      base_array[0]
    when 2
      case base_array.sort
      when ["A", "T"] then "W"
      when ["C", "G"] then "S"
      when ["A", "C"] then "M"
      when ["G", "T"] then "K"
      when ["A", "G"] then "R"
      when ["C", "T"] then "Y"
      else "N"
      end
    when 3
      case base_array.sort
      when ["C", "G", "T"] then "B"
      when ["A", "G", "T"] then "D"
      when ["A", "C", "T"] then "H"
      when ["A", "C", "G"] then "V"
      else "N"
      end
    else
      "N"
    end
  end

  # Generate all Primer ID combinations for a given Primer ID length.
  #
  # @param l [Integer] length of the Primer ID (default 8)
  # @return [Array<String>] every A/T/C/G string of length l (4**l entries);
  #   for l < 1 the four single bases are returned
  def self.generate_primer_id_pool(l=8)
    nt = ['A', 'T', 'C', 'G']
    # each round extends every Primer ID in the pool by one more base
    (l - 1).times.inject(nt) do |pid_pool, _|
      pid_pool.product(nt).map { |v| v.join("") }
    end
  end

  # Compare two Primer ID sequences.
  #
  # @param pid1 [String] first Primer ID
  # @param pid2 [String] second Primer ID
  # @param x [Integer] number of differing bases allowed
  # @return [Boolean] true when both have the same length and differ at no
  #   more than x positions, false otherwise
  def self.similar_pid?(pid1="", pid2="", x=0)
    return false unless pid1.size == pid2.size

    matches = (0...pid1.size).count { |k| pid1[k] == pid2[k] }
    matches >= pid1.size - x
  end

  # Compare Primer IDs of sequences that are identical. Primer IDs that
  # differ by at most 1 base are recognized as similar; if one of them is
  # at least +cutoff+ times more abundant than the other, the less abundant
  # one is discarded.
  #
  # Sequence names are expected to look like ">PID_count_..." (fields
  # separated by "_", as described in the file header).
  #
  # @param sequence_file [String] passed to ViralSeq.fasta_to_hash
  # @param cutoff [Numeric] fold cut-off (default 10)
  # @return [Hash] {sequence_name => sequence} without the discarded PIDs
  def self.filter_similar_pid(sequence_file = "", cutoff = 10)
    seq = ViralSeq.fasta_to_hash(sequence_file)

    # group [pid, count] pairs by identical sequence in a single pass
    uni_seq_pid = Hash.new { |hash, key| hash[key] = [] }
    seq.each do |name, s|
      pid, pid_count = name[1..-1].split("_")
      uni_seq_pid[s] << [pid, pid_count]
    end

    dup_pid = {}
    uni_seq_pid.each_value do |pid_counts|
      next if pid_counts.size == 1
      pid_hash = Hash[pid_counts]

      pid_hash.keys.combination(2) do |pid1, pid2|
        next unless ViralSeq.similar_pid?(pid1, pid2, 1)
        n1 = pid_hash[pid1].to_i
        n2 = pid_hash[pid2].to_i
        if n1 >= cutoff * n2
          dup_pid[pid2] = true
        elsif n2 >= cutoff * n1
          dup_pid[pid1] = true
        end
      end
    end

    seq.each_with_object({}) do |(name, s), new_seq|
      pid = name.split("_")[0][1..-1]
      new_seq[name] = s unless dup_pid.key?(pid)
    end
  end

  # Collapse sequences that differ by no more than +cutoff+ positions;
  # make sure the sequences are aligned. Of two such sequences the less
  # frequent one is dropped (the second one when frequencies are equal).
  # The returned frequency is NOT the frequency of the collapsed sequences:
  # counts of dropped sequences are not added to the kept ones.
  #
  # @param seq_array [Array<String>] aligned sequences
  # @param cutoff [Integer] maximum number of differences to collapse
  # @return [Hash] {sequence => frequency} of the kept sequences
  def self.collapse_sequence_by_x_nt_difference(seq_array,cutoff)
    seq_freq = ViralSeq.count(seq_array)
    return seq_freq if seq_freq.size == 1

    dupli_seq = {}
    seq_freq.keys.combination(2) do |seq1, seq2|
      next unless ViralSeq.compare_two_seq(seq1, seq2) <= cutoff
      loser = seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1
      dupli_seq[loser] = true
    end

    new_seq_freq = {}
    seq_freq.each do |seq, freq|
      new_seq_freq[seq] = freq unless dupli_seq.key?(seq)
    end
    new_seq_freq
  end

  # Compare two sequences position by position without aligning them.
  # Only the positions of seq1 are examined: a seq2 shorter than seq1 counts
  # one difference per missing position, extra positions of a longer seq2
  # are ignored.
  #
  # @param seq1 [String]
  # @param seq2 [String]
  # @return [Integer] number of differing positions
  def self.compare_two_seq(seq1 = "", seq2 = "")
    (0...seq1.size).count { |position| seq1[position] != seq2[position] }
  end

  # Strip every alignment column that contains a gap ("-") in any sequence.
  #
  # @param sequence_alignment [Hash] {name => aligned sequence}
  # @return [Hash] {name => sequence without the gapped columns};
  #   {} for an empty alignment
  def self.gap_strip(sequence_alignment)
    sequences = sequence_alignment.values
    return {} if sequences.empty?

    # columns (by index) in which no sequence has a gap
    kept = (0...sequences[0].size).reject do |p|
      sequences.any? { |s| s[p] == "-" }
    end

    sequence_alignment.each_with_object({}) do |(name, s), new_seq_hash|
      new_seq_hash[name] = kept.map { |p| s[p] }.join
    end
  end

  # Like ViralSeq.gap_strip, but only strips the gapped columns at both
  # ends of the alignment; gaps inside the alignment are kept.
  #
  # @param sequence_alignment [Hash] {name => aligned sequence}
  # @return [Hash] {name => trimmed sequence}; {} for an empty alignment
  def self.gap_strip_ends(sequence_alignment)
    sequences = sequence_alignment.values
    return {} if sequences.empty?

    gapped = (0...sequences[0].size).map do |p|
      sequences.any? { |s| s[p] == "-" }
    end
    n1 = gapped.take_while { |has_gap| has_gap }.size          # leading gapped columns
    n2 = gapped.reverse.take_while { |has_gap| has_gap }.size  # trailing gapped columns

    sequence_alignment.each_with_object({}) do |(name, s), new_seq_hash|
      new_seq_hash[name] = s[n1..(-n2 - 1)]
    end
  end

  # Join paired-end reads when the overlap size is known.
  #
  # @param seq_pair_hash [Hash] {seq_name => [r1_seq, r2_seq]}
  # @param overlap [Integer] number of overlapping bases; 0 means R1 and R2
  #   are simply concatenated
  # @param diff [Float, Integer] maximum mismatch rate allowed within the
  #   overlapping region (default 0.0, i.e. no mismatch allowed)
  # @return [Hash] {seq_name => joined sequence}; pairs that exceed the
  #   mismatch rate, or whose reads are shorter than the overlap, are omitted
  # @raise [ArgumentError] if overlap is not an Integer or diff is not numeric
  def self.paired_join1(seq_pair_hash, overlap, diff = 0.0)
    raise ArgumentError, ":overlap has to be Integer, input #{overlap} invalid." unless overlap.is_a?(Integer)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid." unless diff.is_a?(Integer) || diff.is_a?(Float)

    joined_seq_hash = {}
    seq_pair_hash.each do |seq_name, (r1_seq, r2_seq)|
      if overlap.zero?
        joined_seq_hash[seq_name] = r1_seq + r2_seq
        next
      end

      # a read shorter than the overlap cannot contain the overlapping region
      next if overlap > r1_seq.size || overlap > r2_seq.size

      mismatches = ViralSeq.compare_two_seq(r1_seq[-overlap..-1], r2_seq[0, overlap])
      joined_seq_hash[seq_name] = r1_seq + r2_seq[overlap..-1] if mismatches <= overlap * diff
    end
    joined_seq_hash
  end

  # Join paired-end reads when the overlap size is not known in advance.
  # model 1: one overlap size is determined for all pairs (see
  #          ViralSeq.determine_overlap_pid_pair), then ViralSeq.paired_join1
  #          is applied with it.
  # model 2: the overlap is determined for each pair separately; pairs with
  #          no acceptable overlap are simply concatenated.
  #
  # @param seq_pair_hash [Hash] {seq_name => [r1_seq, r2_seq]}
  # @param model [Integer] 1 or 2
  # @param diff [Float, Integer] maximum mismatch rate allowed within the
  #   overlapping region (default 0.0)
  # @return [Hash, nil] {seq_name => joined sequence}; any ArgumentError
  #   raised while joining (invalid diff or model) is printed with puts and
  #   nil is returned instead
  def self.paired_join2(seq_pair_hash, model = 1, diff = 0.0)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid." unless diff.is_a?(Integer) || diff.is_a?(Float)

    if model == 1
      overlap = ViralSeq.determine_overlap_pid_pair(seq_pair_hash, diff)
      ViralSeq.paired_join1(seq_pair_hash, overlap, diff)
    elsif model == 2
      seq_pair_hash.each_with_object({}) do |(seq_name, (r1_seq, r2_seq)), joined_seq_hash|
        overlap = largest_valid_overlap(r1_seq, r2_seq, diff)
        joined_seq_hash[seq_name] = overlap ? r1_seq + r2_seq[overlap..-1] : r1_seq + r2_seq
      end
    else
      raise ArgumentError, "Error::Wrong Overlap Model Argument. Given '#{model}', expected '1' or '2'."
    end
  rescue ArgumentError => e
    puts e
    nil
  end

  # Determine one overlap size for a whole paired-sequence Hash: the largest
  # acceptable overlap is found for each pair (0 when there is none) and the
  # most frequent value is returned; ties are resolved towards the larger
  # overlap.
  #
  # @param seq_pair_hash [Hash] {seq_name => [r1_seq, r2_seq]}
  # @param diff [Float, Integer] maximum mismatch rate allowed (default 0.0)
  # @return [Integer, nil] overlap size; nil when seq_pair_hash is empty
  def self.determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
    overlaps = seq_pair_hash.map do |_seq_name, (r1_seq, r2_seq)|
      largest_valid_overlap(r1_seq, r2_seq, diff) || 0
    end

    count_overlaps = ViralSeq.count(overlaps)
    max_value = count_overlaps.values.max
    max_overlap_list = []
    count_overlaps.each { |overlap, counts| max_overlap_list << overlap if counts == max_value }
    max_overlap_list.max
  end

  # Build the overlap matrix of a pair of sequences: for every candidate
  # overlap size the number of mismatches between the tail of sequence1 and
  # the head of sequence2. The minimal overlap is 4; the maximal overlap is
  # the length of the shorter sequence, since a longer overlap cannot be
  # sliced out of it.
  #
  # @param sequence1 [String]
  # @param sequence2 [String]
  # @return [Hash] {overlap_size => number_of_different_positions}
  def self.overlap_matrix(sequence1, sequence2)
    min_overlap = 4
    max_overlap = [sequence1.size, sequence2.size].min
    (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
      matrix_hash[overlap] = ViralSeq.compare_two_seq(sequence1[-overlap..-1], sequence2[0, overlap])
    end
  end

  # Largest overlap size of a pair whose mismatch count does not exceed
  # overlap * diff; nil when no candidate overlap qualifies.
  def self.largest_valid_overlap(sequence1, sequence2, diff)
    valid = []
    ViralSeq.overlap_matrix(sequence1, sequence2).each do |overlap, diff_nt|
      valid << overlap if diff_nt <= overlap * diff
    end
    valid.max
  end
  private_class_method :largest_valid_overlap

end