viral_seq 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,556 +0,0 @@
1
- # viral_seq/tcs_core
2
- # core functions for TCS and DR pipeline
3
- # functions to manipulate sequences including:
4
- # ViralSeq::calculate_pid_cut_off
5
- # ViralSeq::consensus
6
- # ViralSeq::generate_primer_id_pool
7
- # ViralSeq::similar_pid?
8
- # ViralSeq::filter_similar_pid
9
- # ViralSeq::collapse_sequence_by_x_nt_difference
10
- # ViralSeq::compare_two_seq
11
- # ViralSeq::gap_strip
12
- # ViralSeq::gap_strip_ends
13
- # ViralSeq::paired_join1
14
- # ViralSeq::paired_join2
15
-
16
- # ViralSeq.calculate_pid_cut_off(PID_abundance, estimated_error_rate)
17
- # # A function to calculate the cut-off for offspring primer IDs.
18
- # # see reference at Zhou et al. JVI 2016.
19
- # # https://www.ncbi.nlm.nih.gov/pubmed/26041299
20
- # # PID_abundance is the abundance of a certain PID
21
- # # estimated_error_rate is the estimated platform error rate, 0.02 (2%) as default
22
- # # the model supports error rate from 0.003 to 0.03.
23
- # # return an abundance cut-off (Integer) for offspring Primer IDs.
24
-
25
- # ViralSeq.consensus(seq_array, majority_cutoff)
26
- # # Generate a consensus sequence from a given sequence array.
27
- # # where seq_array is an Array of input sequences (aligned) [seq1, seq2, seq3, ...]
28
- # # majority_cutoff is a Float of majority cut-off. default as simply majority (0.5)
29
- # =USAGE
30
- # a_consensus_sequence = ViralSeq.consensus(seq_array, majority_cutoff)
31
-
32
- # ViralSeq.generate_primer_id_pool(n)
33
- # # generate all Primer ID combinations given the length of Primer ID
34
- # # n is the length of the Primer ID (Integer). default value of n is 8.
35
- # =USAGE
36
- # primer_id_pool = ViralSeq.generate_primer_id_pool(10) # 10 is the length of Primer ID
37
- # puts primer_id_pool.size #should be 4^10
38
- # => 1048576
39
-
40
- # ViralSeq.similar_pid?(pid1, pid2, base_difference)
41
- # # compare two primer ID sequences.
42
- # # If they have the same length and differ at no more than base_difference positions, return boolean value "TRUE",
43
- # # else, return boolean value "FALSE"
44
- # # where pid1 and pid2 are two Primer IDs for comparison
45
- # # base_difference is an Integer for difference bases that allowed
46
- # =USAGE
47
- # # example
48
- # ViralSeq.similar_pid?("AAGGCTACGA", "AAGGATACGA", 1)
49
- # => true
50
-
51
- # ViralSeq.filter_similar_pid(sequence_fasta_file, cut_off)
52
- # # compare PID with sequences which have identical sequences.
53
- # # PIDs differ by 1 base will be recognized.
54
- # # if PID1 is x times (cut-off) more abundant than PID2, PID2 will be discarded
55
- # # where sequence_fasta_file is the sequence file in fasta format
56
- # # each sequence tag starting with ">" and the Primer ID sequence
57
- # # followed by the number of Primer ID appeared in the raw sequence
58
- # # the information sections in the tags are separated by underscore "_"
59
- # # example sequence tag: >AGGCGTAGA_32_sample1_RT
60
- # # cut_off is the fold cut-off to remove the potential residual offspring Primer IDs
61
- # # default value for cut_off is 10
62
- # # return a new sequence hash. {sequence_name => sequence, ...}
63
-
64
- # ViralSeq.collapse_sequence_by_x_nt_difference(sequence_array, cutoff)
65
- # # collapse sequences with x number of nt differences.
66
- # # input an Array object of sequences, make sure sequences are aligned.
67
- # # return a new Hash object of the collapsed sequences {sequence => frequency, ...}
68
- # # The returned frequency is NOT the frequency of the collapsed sequences.
69
-
70
- # ViralSeq.compare_two_seq(seq1, seq2)
71
- # # compare two sequences as String object, return the number of differences as integer
72
- # # sequences will NOT align
73
- # # can use ViralSeq.muscle_align(seq1, seq2) to get the aligned sequences
74
- # =USAGE
75
- # # example
76
- # seq1 = 'AAGGCGTAGGAC'
77
- # seq2 = 'AAGCTTAGGACG'
78
- # puts ViralSeq.compare_two_seq(seq1, seq2)
79
- # => 8
80
- # aligned_seqs = ViralSeq.muscle_align(seq1,seq2)
81
- # puts ViralSeq.compare_two_seq(aligned_seqs.values[0], aligned_seqs.values[1])
82
- # => 4
83
-
84
- # ViralSeq.gap_strip(sequence_hash)
85
- # # strip positions with gaps in the sequence alignment as Hash object {:name => sequence, ...}
86
- # =USAGE
87
- # # example
88
- # sequence_hash = {'>seq1' => 'AACCGGTT',
89
- # '>seq2' => 'A-CCGGTT',
90
- # '>seq3' => 'AAC-GGTT',
91
- # '>seq4' => 'AACCG-TT',
92
- # '>seq5' => 'AACCGGT-'}
93
- # ViralSeq.gap_strip(sequence_hash)
94
- # => {">seq1"=>"ACGT", ">seq2"=>"ACGT", ">seq3"=>"ACGT", ">seq4"=>"ACGT", ">seq5"=>"ACGT"}
95
-
96
- # ViralSeq.gap_strip_ends(sequence_hash)
97
- # # similar to ViralSeq.gap_strip , but only strip the gaps at both ends of the alignment
98
- # =USAGE
99
- # # example
100
- # sequence_hash = {'>seq1' => 'AACCGGTT',
101
- # '>seq2' => 'A-CCGGTT',
102
- # '>seq3' => 'AAC-GGTT',
103
- # '>seq4' => 'AACCG-TT',
104
- # '>seq5' => 'AACCGGT-'}
105
- # ViralSeq.gap_strip_ends(sequence_hash)
106
- # => {">seq1"=>"AACCGGT", ">seq2"=>"A-CCGGT", ">seq3"=>"AAC-GGT", ">seq4"=>"AACCG-T", ">seq5"=>"AACCGGT"}
107
-
108
- # ViralSeq.paired_join1(sequence_pair_hash, overlap, difference_cut_off)
109
- # # pair-end join function for KNOWN overlap size
110
- # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
111
- # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
112
- # # overlap is an integer that indicate how many bases are overlapped.
113
- # # overlap value at 0 means no overlap. R1 and R2 will be simply put together.
114
- # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
115
- # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
116
- # =USAGE
117
- # # example
118
- # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
119
- # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
120
- # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
121
- # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
122
- # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
123
- # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
124
- # ViralSeq.paired_join1(paired_seqs, 100, 0.0).keys
125
- # => [">pair1"]
126
- # ViralSeq.paired_join1(paired_seqs, 100, 0.01).keys
127
- # => [">pair1", ">pair2"]
128
- # ViralSeq.paired_join1(paired_seqs, 100, 0.02).keys
129
- # => [">pair1", ">pair2", ">pair3"]
130
-
131
- # ViralSeq.paired_join2(seq_pair_hash, model, diff)
132
- # # pair-end join function for UNKNOWN overlap
133
- # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
134
- # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
135
- # # model has two options, 1 or 2 as Integer
136
- # # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
137
- # # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
138
- # # minimal overlap by model 2 set to 4 positions
139
- # # if the sequence overlap is smaller than 4 bases, the model treats the pair as having no overlap.
140
- # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
141
- # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
142
- # =USAGE
143
- # # example 1
144
- # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
145
- # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
146
- # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
147
- # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
148
- # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
149
- # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
150
- # ViralSeq.paired_join2(paired_seqs, 1).keys
151
- # => [">pair1"]
152
- # ViralSeq.paired_join2(paired_seqs, 1, 0.01).keys
153
- # => [">pair1", ">pair2"]
154
- #
155
- # # example 2
156
- # paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
157
- # ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
158
- # ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"]
159
- # }
160
- # ViralSeq.paired_join2(paired_seq2, 1)
161
- # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
162
- # ViralSeq.paired_join2(paired_seq2, 2)
163
- # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
164
-
165
-
166
module ViralSeq

  # Calculate the abundance cut-off for offspring Primer IDs.
  # See Zhou et al. JVI 2016 (https://www.ncbi.nlm.nih.gov/pubmed/26041299).
  #
  # m          - abundance of a given Primer ID.
  # error_rate - estimated platform error rate; must be within 0..0.03
  #              (default 0.02).
  #
  # Returns the cut-off as an Integer; the result is never lower than 2.
  # Raises ArgumentError when error_rate is outside 0..0.03.
  def self.calculate_pid_cut_off(m, error_rate = 0.02)
    return 2 if m <= 10

    n = 0
    case error_rate
    when 0...0.0075
      n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    when 0.0075...0.015
      n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    when 0.015..0.03
      if m <= 8500
        n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        # above 8500 the polynomial is replaced by a linear formula
        n = 0.0079 * m + 9.4869
      end
    else
      raise ArgumentError, 'Error_rate has to be between 0 and 0.03'
    end

    n = n.round
    n < 3 ? 2 : n
  end

  # Build one consensus sequence from an array of aligned sequences.
  #
  # seq_array - Array of sequences (String); the length of the first
  #             sequence defines the number of positions examined.
  # cutoff    - minimum fraction (Float) of sequences that must carry a base
  #             for it to be included in the call (default 0.5).
  #
  # Example: a position with 15% "A" and 85% "G" is called "G" with a 20%
  # cut-off and "R" with a 10% cut-off.
  #
  # Returns the consensus String; an empty array yields "".
  def self.consensus(seq_array, cutoff = 0.5)
    return "" if seq_array.empty?

    seq_size = seq_array.size.to_f
    seq_length = seq_array[0].size
    called = (0...seq_length).map do |position|
      column = seq_array.map { |seq| seq[position] }
      base_count = ViralSeq.count(column)
      max_base_list = base_count.select { |_base, v| v / seq_size >= cutoff }.keys
      ViralSeq.call_consensus_base(max_base_list)
    end
    called.join
  end

  # Translate the list of bases retained at one position into a single
  # (possibly ambiguous) nucleotide code. Used by ViralSeq.consensus.
  #
  # base_array - Array of bases; it is not modified.
  #
  # Returns the single base when there is exactly one, the two- or three-base
  # ambiguity code when the combination is recognised, and "N" otherwise.
  def self.call_consensus_base(base_array)
    return base_array[0] if base_array.size == 1
    return "N" unless base_array.size == 2 || base_array.size == 3

    # sort a copy so the caller's array is left untouched
    case base_array.sort
    when ["A","T"] then "W"
    when ["C","G"] then "S"
    when ["A","C"] then "M"
    when ["G","T"] then "K"
    when ["A","G"] then "R"
    when ["C","T"] then "Y"
    when ["C","G","T"] then "B"
    when ["A","G","T"] then "D"
    when ["A","C","T"] then "H"
    when ["A","C","G"] then "V"
    else "N"
    end
  end

  # Generate all Primer ID combinations for a given Primer ID length.
  #
  # l - length of the Primer ID (Integer, default 8).
  #
  # Returns an Array of Strings (4**l entries for l >= 1).
  def self.generate_primer_id_pool(l=8)
    nt = ['A','T','C','G']
    pid_pool = nt.dup
    (l-1).times do
      pid_pool = pid_pool.product(nt).map { |parts| parts.join("") }
    end
    pid_pool
  end

  # Compare two Primer ID sequences.
  #
  # pid1, pid2 - Primer IDs (String) to compare.
  # x          - number of differing positions tolerated (default 0).
  #
  # Returns true when both Primer IDs have the same length and differ at no
  # more than x positions, false otherwise.
  def self.similar_pid?(pid1="",pid2="", x=0)
    return false unless pid1.size == pid2.size

    mismatches = (0...pid1.size).count { |k| pid1[k] != pid2[k] }
    mismatches <= x
  end

  # Remove likely offspring Primer IDs among entries sharing an identical
  # sequence. Primer IDs attached to the same sequence that differ by at most
  # one base are compared; when one is at least `cutoff` times as abundant as
  # the other, the less abundant one is discarded.
  #
  # sequence_file - path handed to ViralSeq.fasta_to_hash. Names are read as
  #                 one leading character followed by "<PID>_<count>_..."
  #                 (e.g. ">AGGCGTAGA_32_sample1_RT").
  # cutoff        - fold difference required to discard a Primer ID
  #                 (default 10).
  #
  # Returns a new Hash {sequence_name => sequence} without the discarded
  # Primer IDs.
  def self.filter_similar_pid(sequence_file = "", cutoff = 10)
    seq = ViralSeq.fasta_to_hash(sequence_file)

    # sequence => { pid => abundance (as read from the name) }
    pids_by_seq = Hash.new { |hash, key| hash[key] = {} }
    seq.each do |name, s|
      fields = name[1..-1].split("_")
      pids_by_seq[s][fields[0]] = fields[1]
    end

    dup_pid = {}
    pids_by_seq.each_value do |pid_hash|
      next if pid_hash.size == 1
      pid_hash.keys.combination(2) do |pid1, pid2|
        next unless ViralSeq.similar_pid?(pid1,pid2,1)
        n1 = pid_hash[pid1].to_i
        n2 = pid_hash[pid2].to_i
        if n1 >= cutoff * n2
          dup_pid[pid2] = true
        elsif n2 >= cutoff * n1
          dup_pid[pid1] = true
        end
      end
    end

    seq.reject { |name, _s| dup_pid.key?(name.split("_")[0][1..-1]) }
  end

  # Collapse sequences that differ by at most `cutoff` positions. Sequences
  # must already be aligned. For every pair within the cut-off the less
  # frequent sequence is dropped (the second of the pair on a tie).
  #
  # seq_array - Array of sequences (String).
  # cutoff    - maximum number of differences for two sequences to collapse.
  #
  # Returns a Hash {sequence => frequency} of the retained sequences. The
  # frequency is the one counted before collapsing, NOT the frequency of the
  # collapsed group.
  def self.collapse_sequence_by_x_nt_difference(seq_array,cutoff)
    seq_freq = ViralSeq.count(seq_array)
    return seq_freq if seq_freq.size == 1

    dupli_seq = {}
    seq_freq.keys.combination(2) do |seq1, seq2|
      next if ViralSeq.compare_two_seq(seq1,seq2) > cutoff
      loser = seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1
      dupli_seq[loser] = true
    end

    seq_freq.reject { |seq, _freq| dupli_seq.key?(seq) }
  end

  # Count the positions at which two sequences differ. No alignment is
  # performed: the sequences are compared position by position over the
  # length of seq1, so align them beforehand when needed.
  #
  # Returns the number of differing positions as an Integer.
  def self.compare_two_seq(seq1 = "", seq2 = "")
    (0...seq1.size).count { |position| seq1[position] != seq2[position] }
  end

  # Remove every alignment column that contains a gap ("-") in any sequence.
  #
  # sequence_alignment - Hash {name => aligned_sequence}.
  #
  # Returns a new Hash {name => gap_stripped_sequence}; an empty alignment
  # yields {}.
  def self.gap_strip(sequence_alignment)
    return {} if sequence_alignment.empty?

    seqs = sequence_alignment.values
    kept_positions = (0...seqs[0].size).reject do |p|
      seqs.any? { |s| s[p] == "-" }
    end

    new_seq_hash = {}
    sequence_alignment.each do |n,s|
      new_seq_hash[n] = kept_positions.map { |p| s[p] }.join
    end
    new_seq_hash
  end

  # Like ViralSeq.gap_strip, but only removes the runs of gap-containing
  # columns at the two ends of the alignment; internal gaps are kept.
  #
  # sequence_alignment - Hash {name => aligned_sequence}.
  #
  # Returns a new Hash {name => trimmed_sequence}; an empty alignment
  # yields {}.
  def self.gap_strip_ends(sequence_alignment)
    return {} if sequence_alignment.empty?

    seqs = sequence_alignment.values
    seq_size = seqs[0].size
    gapped = lambda { |p| seqs.any? { |s| s[p] == "-" } }

    # number of leading / trailing columns that contain at least one gap
    n1 = (0...seq_size).take_while(&gapped).size
    n2 = (0...seq_size).reverse_each.take_while(&gapped).size

    new_seq_hash = {}
    sequence_alignment.each do |n,s|
      new_seq_hash[n] = s[n1..(- n2 - 1)]
    end
    new_seq_hash
  end

  # Join paired-end reads when the overlap size is known.
  #
  # seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}.
  # overlap       - number of overlapping bases (Integer); 0 simply
  #                 concatenates R1 and R2.
  # diff          - maximum mismatch rate tolerated within the overlap
  #                 (Integer or Float, default 0.0).
  #
  # Pairs whose overlap exceeds the mismatch allowance are left out, as are
  # pairs in which either read is shorter than the requested overlap.
  #
  # Returns a Hash {seq_name => joined_sequence}.
  # Raises ArgumentError on a non-Integer or negative overlap, or a
  # non-numeric diff.
  def self.paired_join1(seq_pair_hash, overlap, diff = 0.0)
    raise ArgumentError, ":overlap has to be Integer, input #{overlap} invalid." unless overlap.is_a?(Integer)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid." unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError, ":overlap cannot be negative, input #{overlap} invalid." if overlap < 0

    joined_seq_hash = {}
    seq_pair_hash.each do |seq_name, (r1_seq, r2_seq)|
      if overlap.zero?
        joined_seq_hash[seq_name] = r1_seq + r2_seq
        next
      end
      # a read shorter than the overlap cannot contain it
      next if overlap > r1_seq.size || overlap > r2_seq.size

      mismatches = ViralSeq.compare_two_seq(r1_seq[-overlap..-1], r2_seq[0,overlap])
      joined_seq_hash[seq_name] = r1_seq + r2_seq[overlap..-1] if mismatches <= (overlap * diff)
    end
    joined_seq_hash
  end

  # Join paired-end reads when the overlap size is not known in advance.
  #
  # seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}.
  # model         - 1: one overlap size is determined for all pairs (see
  #                    ViralSeq.determine_overlap_pid_pair) and applied with
  #                    ViralSeq.paired_join1;
  #                 2: the overlap is determined for each pair separately;
  #                    pairs without an acceptable overlap are concatenated.
  # diff          - maximum mismatch rate tolerated within the overlap
  #                 (Integer or Float, default 0.0).
  #
  # Returns a Hash {seq_name => joined_sequence}. On an invalid argument the
  # error message is printed to standard output and nil is returned.
  def self.paired_join2(seq_pair_hash, model = 1, diff = 0.0)
    raise ArgumentError, ":diff has to be float or integer, input #{diff} invalid." unless diff.is_a?(Integer) || diff.is_a?(Float)

    if model == 1
      overlap = ViralSeq.determine_overlap_pid_pair(seq_pair_hash, diff)
      ViralSeq.paired_join1(seq_pair_hash, overlap, diff)
    elsif model == 2
      joined_seq_hash = {}
      seq_pair_hash.each do |seq_name, (r1_seq, r2_seq)|
        overlap = largest_valid_overlap(r1_seq, r2_seq, diff) || 0
        joined_seq_hash[seq_name] = r1_seq + r2_seq[overlap..-1]
      end
      joined_seq_hash
    else
      raise ArgumentError, "Error::Wrong Overlap Model Argument. Given '#{model}', expected '1' or '2'."
    end
  rescue ArgumentError => e
    puts e
    nil
  end

  # Determine a single overlap size for a whole paired-sequence Hash: the
  # largest acceptable overlap is found for every pair (0 when there is
  # none) and the most frequent value is returned, the largest one on a tie.
  #
  # seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}.
  # diff          - maximum mismatch rate tolerated within the overlap.
  #
  # Returns the overlap size as an Integer; 0 for an empty Hash.
  def self.determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
    return 0 if seq_pair_hash.empty?

    overlaps = seq_pair_hash.values.map do |r1_seq, r2_seq|
      largest_valid_overlap(r1_seq, r2_seq, diff) || 0
    end
    count_overlaps = ViralSeq.count(overlaps)
    max_value = count_overlaps.values.max
    count_overlaps.select { |_overlap, counts| counts == max_value }.keys.max
  end

  # For every candidate overlap size, count the mismatches between the tail
  # of sequence1 and the head of sequence2.
  #
  # Candidate sizes run from 4 up to the length of the shorter sequence; an
  # overlap cannot be longer than either read.
  #
  # Returns a Hash {overlap_size => number_of_different_positions}.
  def self.overlap_matrix(sequence1, sequence2)
    min_overlap = 4
    max_overlap = [sequence1.size, sequence2.size].min
    matrix_hash = {}
    (min_overlap..max_overlap).each do |overlap|
      matrix_hash[overlap] = ViralSeq.compare_two_seq(sequence1[-overlap..-1], sequence2[0, overlap])
    end
    matrix_hash
  end

  # Largest overlap size whose mismatch count stays within overlap * diff,
  # or nil when no candidate overlap qualifies.
  def self.largest_valid_overlap(sequence1, sequence2, diff)
    accepted = ViralSeq.overlap_matrix(sequence1, sequence2).select do |overlap, diff_nt|
      diff_nt <= overlap * diff
    end
    accepted.keys.max
  end
  private_class_method :largest_valid_overlap

end