viral_seq 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,392 @@
1
+ # lib/sequence.rb
2
+ # Includes functions for sequence operations
3
+ # Including methods as:
4
+ # ViralSeq::AMINO_ACID_LIST
5
+ # ViralSeq::Sequence
6
+ # ViralSeq::Sequence#rev_complement
7
+ # ViralSeq::Sequence#get_aa_sequence
8
+ # ViralSeq::Sequence#get_aa_array
9
+ # ViralSeq::Sequence#name
10
+ # ViralSeq::Sequence#dna_sequence
11
+ # ViralSeq::Sequence#aa_sequence
12
+ # ViralSeq::Sequence#aa_array
13
+ # ViralSeq::amino_acid
14
+ # ViralSeq::amino_acid_2
15
+ # ViralSeq::to_list
16
+ # ViralSeq::uniq_sequence_hash
17
+ # ViralSeq::stop_codon_seq_hash
18
+ # String#rc
19
+ # String#mutation
20
+ # String#nt_parser
21
+
22
+ # ViralSeq::AMINO_ACID_LIST
23
+ # # Array of all amino acid one letter abbreviations
24
+
25
+ # ViralSeq::Sequence
26
+ # # Sequence class
27
+ # =USAGE
28
+ # # create a sequence object
29
+ # seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
30
+ #
31
+ # # print dna sequence
32
+ # puts seq.dna_sequence
33
+ #
34
+ # # reverse complement sequence of DNA sequence, return as a string
35
+ # seq.rev_complement
36
+ #
37
+ # # change @dna_sequence to reverse complement DNA sequence
38
+ # seq.rev_complement!
39
+ #
40
+ # # generate amino acid sequences. either return string or array.
41
+ # # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
42
+ # # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
43
+ # seq.get_aa_sequence
44
+ # # or
45
+ # seq.get_aa_array
46
+ #
47
+ # # print amino acid sequence
48
+ # puts seq.aa_sequence
49
+
50
+ # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
51
+ # # collapse sequence hash to unique sequence hash.
52
+ # # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
53
+ # # master_sequence_tag is the master tag for unique sequences
54
+ # # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
55
+ # =USAGE
56
+ # sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
57
+ # '>seq4' => 'CCCC', '>seq5' => 'CCCC',
58
+ # '>seq6' => 'TTTT' }
59
+ # uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
60
+ # => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
61
+
62
+ module ViralSeq
63
+
64
+ # array for all amino acid one letter abbreviations
65
# One-letter codes of the 20 standard amino acids, followed by "*" for a stop codon.
AMINO_ACID_LIST = %w[A C D E F G H I K L M N P Q R S T V W Y *]
66
+
67
+ # sequence class
68
+
69
# A named nucleotide sequence together with its most recent translation.
#
# The DNA sequence is upcased on construction. #aa_sequence and #aa_array
# start out empty and are only filled in by #get_aa_sequence and
# #get_aa_array respectively.
class Sequence
  attr_accessor :name, :dna_sequence, :aa_sequence, :aa_array

  # name         - sequence tag, ">sequence" by default
  # dna_sequence - nucleotide String; stored upcased
  def initialize(name = ">sequence", dna_sequence = "")
    @name = name
    @dna_sequence = dna_sequence.upcase
    @aa_sequence = ""
    @aa_array = []
  end

  # Returns the reverse complement of the DNA sequence as a new String.
  # Only A/T/C/G are complemented; every other character is kept as it is.
  def rev_complement
    @dna_sequence.reverse.upcase.tr('ATCG', 'TAGC')
  end

  # Replaces the stored DNA sequence with its reverse complement and returns it.
  def rev_complement!
    @dna_sequence = rev_complement
  end

  # Translates the DNA sequence codon by codon with ViralSeq.amino_acid,
  # stores the result in @aa_sequence and returns it.
  #
  # initial_position - offset of the first translated base (0, 1 or 2 select
  #                    the 1st, 2nd or 3rd reading frame). An offset past the
  #                    end of the sequence yields an empty translation.
  def get_aa_sequence(initial_position = 0)
    # amino_acid is a singleton method of the ViralSeq module, not of this
    # class, so it has to be called with an explicit receiver.
    @aa_sequence = codons_from(@dna_sequence, initial_position)
                   .map { |codon| ViralSeq.amino_acid(codon) }
                   .join
  end

  # Translates the DNA sequence codon by codon with ViralSeq.amino_acid_2,
  # which keeps ambiguity calls. Gaps ("-") are treated as "N". The result is
  # stored in @aa_array and returned as an Array with one entry per codon.
  def get_aa_array(initial_position = 0)
    @aa_array = codons_from(@dna_sequence.tr('-', 'N'), initial_position)
                .map { |codon| ViralSeq.amino_acid_2(codon) }
  end

  # Number of bases in the DNA sequence.
  def dna_length
    @dna_sequence.length
  end

  # Length of the amino acid String produced by the last #get_aa_sequence call.
  def aa_length
    @aa_sequence.length
  end

  private

  # Splits +sequence+, starting at +initial_position+, into complete
  # three-character codons; a trailing partial codon is dropped.
  def codons_from(sequence, initial_position)
    sequence[initial_position..-1].to_s.scan(/.{3}/m)
  end
end
119
+
120
+ # generate amino acid abbreviations from 3 bases, ambiguity will return "#"
121
# Translates one codon into its one-letter amino acid abbreviation.
#
# bases - a three-character codon String. IUPAC ambiguity codes are accepted
#         where every nucleotide they stand for gives the same amino acid
#         (for example "TTY" => "F", "YTR" => "L", "MGR" => "R").
#
# Returns the amino acid letter, "*" for a stop codon, or "#" when the codon
# is ambiguous between different amino acids or is not recognized.
def self.amino_acid(bases)
  case bases
  when /^TT[TCY]$/ then "F"
  when /^TT[AGR]$/, /^CT.$/, /^[TCY]T[AGR]$/ then "L"
  when /^AT[TCAHYWM]$/ then "I"
  when "ATG" then "M"
  when /^GT.$/ then "V"
  # Serine is encoded by TCN and AGY only. Codons that mix the two families
  # (e.g. "WSC" covers ACC = Thr and TGC = Cys as well) are ambiguous and
  # fall through to "#".
  when /^TC.$/, /^AG[TCY]$/ then "S"
  when /^CC.$/ then "P"
  when /^AC.$/ then "T"
  when /^GC.$/ then "A"
  when /^TA[TCY]$/ then "Y"
  when /^TA[AGR]$/, /^T[GR]A$/ then "*"
  when /^CA[TCY]$/ then "H"
  when /^CA[AGR]$/ then "Q"
  when /^AA[TCY]$/ then "N"
  when /^AA[AGR]$/ then "K"
  when /^GA[TCY]$/ then "D"
  when /^GA[AGR]$/ then "E"
  when /^TG[TCY]$/ then "C"
  when "TGG" then "W"
  when /^CG.$/, /^[AM]G[AGR]$/ then "R"
  when /^GG.$/ then "G"
  else "#"
  end
end
181
+
182
+ # keep ambiguities, return all possible amino acids.
183
+
184
# Translates one codon while keeping ambiguities.
#
# Each of the three bases is expanded with ViralSeq.to_list, every resulting
# combination is translated, and the distinct amino acids are joined with "/"
# in the order they were first produced. A combination that matches no entry
# of the table is reported as "-".
#
# bases - a three-character codon String
#
# Returns a String such as "M" or "N/D".
def self.amino_acid_2(bases)
  # [pattern, amino acid] pairs, checked top to bottom; the first hit wins.
  translation_table = [
    [/^TT[TCY]$/, "F"],
    [/^TT[AGR]$/, "L"],
    [/^CT.$/, "L"],
    [/^AT[TCAHYWM]$/, "I"],
    ["ATG", "M"],
    [/^GT.$/, "V"],
    [/^TC.$/, "S"],
    [/^CC.$/, "P"],
    [/^AC.$/, "T"],
    [/^GC.$/, "A"],
    [/^TA[TCY]$/, "Y"],
    [/^TA[AGR]$/, "*"],
    [/^T[GR]A$/, "*"],
    [/^CA[TCY]$/, "H"],
    [/^CA[AGR]$/, "Q"],
    [/^AA[TCY]$/, "N"],
    [/^AA[AGR]$/, "K"],
    [/^GA[TCY]$/, "D"],
    [/^GA[AGR]$/, "E"],
    [/^TG[TCY]$/, "C"],
    ["TGG", "W"],
    [/^CG.$/, "R"],
    [/^AG[TCY]$/, "S"],
    [/^[AM]G[AGR]$/, "R"],
    [/^GG.$/, "G"],
    [/^[ATW][CGS][CTY]$/, "S"],
    [/^[TCY]T[AGR]$/, "L"]
  ]

  first = ViralSeq.to_list(bases[0])
  second = ViralSeq.to_list(bases[1])
  third = ViralSeq.to_list(bases[2])

  # Cartesian product with the first base varying slowest.
  codons = first.product(second, third).map { |b1, b2, b3| b1 + b2 + b3 }

  calls = codons.map do |codon|
    hit = translation_table.find { |pattern, _aa| pattern === codon }
    hit ? hit[1] : "-"
  end
  calls.uniq.join('/')
end
269
+
270
+ # parse ambiguity bases, aka %w{W S M K R Y B D H V N}
271
+
272
# Expands a single nucleotide code into the list of bases it stands for.
#
# base - a one-character String: A, T, C, G or one of the IUPAC ambiguity
#        codes W S M K R Y B D H V N.
#
# Returns an Array of base Strings: the base itself for A/T/C/G, the possible
# bases for an ambiguity code, and an empty Array for anything else
# (including the default empty String, gaps and multi-character input).
def self.to_list(base = "")
  case base
  when "A", "T", "C", "G" then [base]
  when "W" then ['A', 'T']
  when "S" then ['C', 'G']
  when "M" then ['A', 'C']
  when "K" then ['G', 'T'] # keto: G or T (C/G is "S")
  when "R" then ['A', 'G']
  when "Y" then ['C', 'T']
  when "B" then ['C', 'G', 'T']
  when "D" then ['A', 'G', 'T']
  when "H" then ['A', 'C', 'T']
  when "V" then ['A', 'C', 'G']
  when "N" then ['A', 'T', 'C', 'G']
  else []
  end
end
302
+
303
+ # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
304
+ # collapse sequence hash to unique sequence hash.
305
+ # input_sequence_hash is a sequence hash {:name => :sequence, ...}
306
+ # master_sequence_tag is the master tag for unique sequences
307
+ # sequences will be named as (">" + master_sequence_tag + "_" + Integer + "_" + Counts)
308
+
309
# Collapses a sequence Hash to its unique sequences.
#
# seq           - Hash of {name => sequence}
# sequence_name - master tag used to build the new names
#
# The values are counted with ViralSeq.count. Each entry of that result gets
# the name ">" + sequence_name + "_" + running number (from 1) + "_" + count.
#
# Returns a new Hash of {new_name => sequence}.
def self.uniq_sequence_hash(seq = {}, sequence_name = "sequence")
  collapsed = {}
  ViralSeq.count(seq.values).each_with_index do |(sequence, copies), index|
    tag = ">" + sequence_name + "_" + (index + 1).to_s + "_" + copies.to_s
    collapsed[tag] = sequence
  end
  collapsed
end
320
+
321
+ # input a sequence hash, return a sequence hash with stop codons.
322
# Selects the sequences whose translation contains a stop codon.
#
# seq_hash - Hash of {name => nucleotide sequence}
# rf       - reading frame offset handed to Sequence#get_aa_array
#
# A sequence is kept when its amino acid array has an element equal to "*".
#
# Returns a new Hash with the matching name => sequence pairs.
def self.stop_codon_seq_hash(seq_hash, rf = 0)
  seq_hash.each_with_object({}) do |(tag, nt_seq), with_stop|
    translated = Sequence.new(tag, nt_seq)
    translated.get_aa_array(rf)
    with_stop[tag] = nt_seq if translated.aa_array.include?("*")
  end
end
333
+
334
+ end
335
+
336
# Functions added to the String class for direct operations on a sequence
# that is held as a plain String object.
#
# String#rc
#   reverse complement (only A/C/T/G are complemented)
#     "ACAGA".rc
#     => "TCTGT"
#
# String#mutation(error_rate)
#   mutate a nt sequence (String class) randomly
#   error_rate is the per-base mutation probability, default value 0.01, aka 1%
#     seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
#     seq.mutation(0.05)
#     => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
#
# String#nt_parser
#   parse the nucleotide sequence and return a Regexp object for possible matches
#     "ATRWCG".nt_parser
#     => /AT[AG][AT]CG/
class String
  # Returns the reverse complement of the receiver as a new String.
  def rc
    reverse.tr("ACTG", "TGAC")
  end

  # Returns a copy of the receiver in which every character has been replaced,
  # with probability error_rate, by a random base from A/C/T/G other than itself.
  #
  # error_rate - per-position mutation probability; it is resolved in steps
  #              of 1/10000 because the draw is rand(10000).
  def mutation(error_rate = 0.01)
    threshold = error_rate * 10000
    each_char.map do |nt|
      if rand(10000) < threshold
        (["A", "C", "T", "G"] - [nt]).sample
      else
        nt
      end
    end.join
  end

  # Builds a Regexp that matches every sequence the receiver can stand for.
  #
  # Each character is expanded with ViralSeq.to_list. A single-base expansion
  # is used as it is, several bases become a character class such as "[AG]",
  # and a character without any expansion (e.g. a gap) is matched literally.
  def nt_parser
    pattern = each_char.map do |base|
      candidates = ViralSeq.to_list(base)
      case candidates.size
      when 0 then Regexp.escape(base)
      when 1 then candidates[0]
      else
        # No "|" between the bases: inside a character class it would be a
        # literal pipe character, not an alternation.
        "[" + candidates.join + "]"
      end
    end
    Regexp.new(pattern.join)
  end
end
@@ -0,0 +1,556 @@
1
+ # viral_seq/tcs_core
2
+ # core functions for TCS and DR pipeline
3
+ # functions to manipulate sequences including:
4
+ # ViralSeq::calculate_pid_cut_off
5
+ # ViralSeq::consensus
6
+ # ViralSeq::generate_primer_id_pool
7
+ # ViralSeq::similar_pid?
8
+ # ViralSeq::filter_similar_pid
9
+ # ViralSeq::collapse_sequence_by_x_nt_difference
10
+ # ViralSeq::compare_two_seq
11
+ # ViralSeq::gap_strip
12
+ # ViralSeq::gap_strip_ends
13
+ # ViralSeq::paired_join1
14
+ # ViralSeq::paired_join2
15
+
16
+ # ViralSeq.calculate_pid_cut_off(PID_abundance, estimated_error_rate)
17
+ # # A function to calculate cut-off for offspring primer IDs.
18
+ # # see reference at Zhou et al. JVI 2016.
19
+ # # https://www.ncbi.nlm.nih.gov/pubmed/26041299
20
+ # # PID_abundance is the abundance of a certain PID
21
+ # # estimated_error_rate is the estimated platform error rate, 0.02 (2%) as default
22
+ # # the model supports error rate from 0.003 to 0.03.
23
+ # # return an abundance cut-off (Integer) for offspring Primer IDs.
24
+
25
+ # ViralSeq.consensus(seq_array, majority_cutoff)
26
+ # # Generate a consensus sequence from a given sequence array.
27
+ # # where seq_array is an Array of input sequences (aligned) [seq1, seq2, seq3, ...]
28
+ # # majority_cutoff is a Float of majority cut-off. default as simply majority (0.5)
29
+ # =USAGE
30
+ # a_consensus_sequence = ViralSeq.consensus(seq_array, majority_cutoff)
31
+
32
+ # ViralSeq.generate_primer_id_pool(n)
33
+ # # generate all Primer ID combinations given the length of Primer ID
34
+ # # n is the length of the Primer ID (Integer). default value of n is 8.
35
+ # =USAGE
36
+ # primer_id_pool = ViralSeq.generate_primer_id_pool(10) # 10 is the length of Primer ID
37
+ # puts primer_id_pool.size #should be 4^10
38
+ # => 1048576
39
+
40
+ # ViralSeq.similar_pid?(pid1, pid2, base_difference)
41
+ # # compare two primer ID sequences.
42
+ # # If they have the same length and differ in no more than base_difference bases, return boolean value "TRUE",
43
+ # # else, return boolean value "FALSE"
44
+ # # where pid1 and pid2 are two Primer IDs for comparison
45
+ # # base_difference is an Integer for difference bases that allowed
46
+ # =USAGE
47
+ # # example
48
+ # ViralSeq.similar_pid?("AAGGCTACGA", "AAGGATACGA", 1)
49
+ # => true
50
+
51
+ # ViralSeq.filter_similar_pid(sequence_fasta_file, cut_off)
52
+ # # compare PID with sequences which have identical sequences.
53
+ # # PIDs differ by 1 base will be recognized.
54
+ # # if PID1 is x times (cut-off) more abundant than PID2, PID2 will be discarded
55
+ # # where sequence_fasta_file is the sequence file in fasta format
56
+ # # each sequence tag starting with ">" and the Primer ID sequence
57
+ # # followed by the number of Primer ID appeared in the raw sequence
58
+ # # the information sections in the tags are separated by underscore "_"
59
+ # # example sequence tag: >AGGCGTAGA_32_sample1_RT
60
+ # # cut_off is the fold cut-off to remove the potential residual offspring Primer IDs
61
+ # # default value for cut_off is 10
62
+ # # return a new sequence hash. {sequence_name => sequence, ...}
63
+
64
+ # ViralSeq.collapse_sequence_by_x_nt_difference(sequence_array, cutoff)
65
+ # # collapse sequences with x number of nt differences.
66
+ # # input an Array object of sequences, make sure sequences are aligned.
67
+ # # return a Hash object of the collapsed sequences {sequence => frequency, ...}
68
+ # # The returned frequency is NOT the frequency of the collapsed sequences.
69
+
70
+ # ViralSeq.compare_two_seq(seq1, seq2)
71
+ # # compare two sequences as String object, return the number of differences as integer
72
+ # # sequences will NOT align
73
+ # # can use ViralSeq.muscle_align(seq1, seq2) to get the aligned sequences
74
+ # =USAGE
75
+ # # example
76
+ # seq1 = 'AAGGCGTAGGAC'
77
+ # seq2 = 'AAGCTTAGGACG'
78
+ # puts ViralSeq.compare_two_seq(seq1, seq2)
79
+ # => 8
80
+ # aligned_seqs = ViralSeq.muscle_align(seq1,seq2)
81
+ # puts ViralSeq.compare_two_seq(aligned_seqs.values[0], aligned_seqs.values[1])
82
+ # => 4
83
+
84
+ # ViralSeq.gap_strip(sequence_hash)
85
+ # # strip positions with gaps in the sequence alignment as Hash object {:name => sequence, ...}
86
+ # =USAGE
87
+ # # example
88
+ # sequence_hash = {'>seq1' => 'AACCGGTT',
89
+ # '>seq2' => 'A-CCGGTT',
90
+ # '>seq3' => 'AAC-GGTT',
91
+ # '>seq4' => 'AACCG-TT',
92
+ # '>seq5' => 'AACCGGT-'}
93
+ # ViralSeq.gap_strip(sequence_hash)
94
+ # => {">seq1"=>"ACGT", ">seq2"=>"ACGT", ">seq3"=>"ACGT", ">seq4"=>"ACGT", ">seq5"=>"ACGT"}
95
+
96
+ # ViralSeq.gap_strip_ends(sequence_hash)
97
+ # # similar to ViralSeq.gap_strip , but only strip the gaps at both ends of the alignment
98
+ # =USAGE
99
+ # # example
100
+ # sequence_hash = {'>seq1' => 'AACCGGTT',
101
+ # '>seq2' => 'A-CCGGTT',
102
+ # '>seq3' => 'AAC-GGTT',
103
+ # '>seq4' => 'AACCG-TT',
104
+ # '>seq5' => 'AACCGGT-'}
105
+ # ViralSeq.gap_strip_ends(sequence_hash)
106
+ # => {">seq1"=>"AACCGGT", ">seq2"=>"A-CCGGT", ">seq3"=>"AAC-GGT", ">seq4"=>"AACCG-T", ">seq5"=>"AACCGGT"}
107
+
108
+ # ViralSeq.paired_join1(sequence_pair_hash, overlap, difference_cut_off)
109
+ # # pair-end join function for KNOWN overlap size
110
+ # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
111
+ # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
112
+ # # overlap is an integer that indicate how many bases are overlapped.
113
+ # # overlap value at 0 means no overlap. R1 and R2 will be simply put together.
114
+ # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
115
+ # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
116
+ # =USAGE
117
+ # # example
118
+ # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
119
+ # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
120
+ # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
121
+ # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
122
+ # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
123
+ # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
124
+ # ViralSeq.paired_join1(paired_seqs, 100, 0.0).keys
125
+ # => [">pair1"]
126
+ # ViralSeq.paired_join1(paired_seqs, 100, 0.01).keys
127
+ # => [">pair1", ">pair2"]
128
+ # ViralSeq.paired_join1(paired_seqs, 100, 0.02).keys
129
+ # => [">pair1", ">pair2", ">pair3"]
130
+
131
+ # ViralSeq.paired_join2(seq_pair_hash, model, diff)
132
+ # # pair-end join function for UNKNOWN overlap
133
+ # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
134
+ # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
135
+ # # model has two options, 1 or 2 as Integer
136
+ # # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
137
+ # # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
138
+ # # minimal overlap by model 2 set to 4 positions
139
+ # # if the sequence overlap is smaller than 4 bases, the model will treat the pair as having no overlap.
140
+ # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
141
+ # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
142
+ # =USAGE
143
+ # # example 1
144
+ # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
145
+ # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
146
+ # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
147
+ # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
148
+ # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
149
+ # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
150
+ # ViralSeq.paired_join2(paired_seqs, 1).keys
151
+ # => [">pair1"]
152
+ # ViralSeq.paired_join2(paired_seqs, 1, 0.01).keys
153
+ # => [">pair1", ">pair2"]
154
+ #
155
+ # # example 2
156
+ # paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
157
+ # ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
158
+ # ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"]
159
+ # }
160
+ # ViralSeq.paired_join2(paired_seq2, 1)
161
+ # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
162
+ # ViralSeq.paired_join2(paired_seq2, 2)
163
+ # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
164
+
165
+
166
+ module ViralSeq
167
+
168
+ # calculate cut-off for offspring primer IDs.
169
# Calculates the abundance cut-off for offspring Primer IDs.
#
# m          - abundance of the Primer ID; values of 10 or less always give 2
# error_rate - estimated platform error rate, from 0 to 0.03 (default 0.02);
#              it selects which of the fitted formulas below is evaluated
#
# Returns the rounded cut-off as an Integer; results below 3 are reported as 2.
# Raises ArgumentError when m is above 10 and error_rate is outside 0..0.03.
def self.calculate_pid_cut_off(m, error_rate = 0.02)
  return 2 if m <= 10

  estimate =
    if (0...0.0075).cover?(error_rate)
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    elsif (0.0075...0.015).cover?(error_rate)
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    elsif (0.015..0.03).cover?(error_rate)
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        # linear formula used above an abundance of 8500
        0.0079 * m + 9.4869
      end
    else
      raise ArgumentError.new('Error_rate has be between 0 to 0.03')
    end

  cut_off = estimate.round
  cut_off < 3 ? 2 : cut_off
end
192
+
193
+ # create one consensus sequence from a sequence array with an optional majority cut-off for mixed bases.
194
+ # example:
195
+ # position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
196
# Builds one consensus sequence from an Array of aligned sequences.
#
# seq_array - Array of sequence Strings; the length of the first one decides
#             how many positions are called
# cutoff    - minimal fraction (of all sequences) a base needs at a position
#             to take part in the call, 0.5 by default
#
# For every position the bases are counted with ViralSeq.count, the bases
# reaching the cut-off are collected and ViralSeq.call_consensus_base turns
# them into a single (possibly ambiguous) base.
#
# Returns the consensus String; an empty seq_array gives an empty String.
def self.consensus(seq_array, cutoff = 0.5)
  return "" if seq_array.empty?

  total = seq_array.size.to_f
  calls = (0...seq_array[0].size).map do |position|
    column = seq_array.map { |seq| seq[position] }
    majority = ViralSeq.count(column).each_with_object([]) do |(base, copies), kept|
      kept << base if copies / total >= cutoff
    end
    ViralSeq.call_consensus_base(majority)
  end
  calls.join
end
217
+
218
+ # call consensus nucleotide, used by ViralSeq.consensus
219
# Calls one consensus nucleotide from the bases found at a position; used by
# ViralSeq.consensus.
#
# base_array - Array of bases; it is not modified
#
# Returns the only element when there is exactly one, the IUPAC ambiguity
# code for a set of two or three of A/C/G/T, and "N" for every other input
# (empty Array, four or more elements, or elements that are not A/C/G/T).
def self.call_consensus_base(base_array)
  return base_array[0] if base_array.size == 1

  # Sort a copy (by String value, so a nil entry cannot break the sort)
  # instead of reordering the caller's Array in place.
  case base_array.sort_by(&:to_s)
  when ["A", "T"] then "W"
  when ["C", "G"] then "S"
  when ["A", "C"] then "M"
  when ["G", "T"] then "K"
  when ["A", "G"] then "R"
  when ["C", "T"] then "Y"
  when ["C", "G", "T"] then "B"
  when ["A", "G", "T"] then "D"
  when ["A", "C", "T"] then "H"
  when ["A", "C", "G"] then "V"
  else "N"
  end
end
257
+
258
+ # generate all Primer ID combinations given the length of Primer ID
259
# Generates every possible Primer ID of a given length.
#
# l - length of the Primer ID, 8 by default; a length of 1 or less gives
#     the four single bases
#
# Returns an Array of 4**l Strings, built by appending A, T, C, G (in that
# order) to every shorter Primer ID.
def self.generate_primer_id_pool(l=8)
  bases = ['A', 'T', 'C', 'G']
  (l - 1).times.inject(bases) do |pool, _round|
    pool.product(bases).map { |prefix, base| prefix + base }
  end
end
270
+
271
+ # compare two primer ID sequences.
272
+ # If they have the same length and differ in no more than x bases, return boolean value "TRUE",
273
+ # else, return boolean value "FALSE"
274
# Tells whether two Primer IDs are similar.
#
# pid1, pid2 - the Primer ID sequences to compare
# x          - number of differing positions that is still tolerated
#
# Returns false when the sizes differ; otherwise true when at least
# (size - x) positions hold the same base, false if not.
def self.similar_pid?(pid1="", pid2="", x=0)
  return false unless pid1.size == pid2.size

  identical = (0...pid1.size).count { |index| pid1[index] == pid2[index] }
  identical >= pid1.size - x
end
293
+
294
+ # compare PID with sequences which have identical sequences.
295
+ # PIDs differ by 1 base will be recognized.
296
+ # if PID1 is x times more abundant than PID2, PID2 will be discarded
297
# Removes sequences whose Primer ID looks like an offspring of a far more
# abundant Primer ID attached to the identical sequence.
#
# sequence_file - path handed to ViralSeq.fasta_to_hash. Every sequence name
#                 is expected to look like ">PID_count_..." (fields separated
#                 by "_").
# cutoff        - fold difference in abundance; default 10
#
# Among the Primer IDs that share one sequence, every pair that
# ViralSeq.similar_pid? reports as similar with 1 difference allowed is
# compared: when one count is at least cutoff times the other, the less
# abundant Primer ID is marked as a duplicate.
#
# Returns a new Hash {sequence_name => sequence} without the sequences whose
# Primer ID was marked.
def self.filter_similar_pid(sequence_file = "", cutoff = 10)
  seq = ViralSeq.fasta_to_hash(sequence_file)

  # One pass over the input: sequence => [[pid, count], ...]
  pids_by_sequence = Hash.new { |groups, sequence| groups[sequence] = [] }
  seq.each do |name, s|
    fields = name[1..-1].split("_") # drop the leading ">"
    pids_by_sequence[s] << [fields[0], fields[1]]
  end

  # Hash used as a set for constant-time membership tests below.
  dup_pid = {}
  pids_by_sequence.each_value do |pid_counts|
    next if pid_counts.size == 1

    pid_hash = Hash[pid_counts]
    pid_hash.keys.combination(2) do |pid1, pid2|
      next unless ViralSeq.similar_pid?(pid1, pid2, 1)

      n1 = pid_hash[pid1].to_i
      n2 = pid_hash[pid2].to_i
      if n1 >= cutoff * n2
        dup_pid[pid2] = true
      elsif n2 >= cutoff * n1
        dup_pid[pid1] = true
      end
    end
  end

  seq.reject { |name, _s| dup_pid.key?(name.split("_")[0][1..-1]) }
end
356
+
357
+ # collapse sequences with x number of nt differences. make sure sequences are aligned.
358
+ # The returned frequency is NOT the frequency of the collapsed sequences.
359
# Collapses sequences that differ from a more frequent sequence by at most
# cutoff positions.
#
# seq_array - Array of sequences (they should be aligned)
# cutoff    - largest number of differences, as reported by
#             ViralSeq.compare_two_seq, at which two sequences are merged
#
# Sequences are counted with ViralSeq.count. For every pair of distinct
# sequences within the cut-off the less frequent one is dropped (the second
# of the pair when the frequencies are equal).
#
# Returns a Hash {sequence => frequency} of the remaining sequences; the
# frequencies are the original counts, not those of the collapsed groups.
def self.collapse_sequence_by_x_nt_difference(seq_array,cutoff)
  seq_freq = ViralSeq.count(seq_array)
  return seq_freq if seq_freq.size == 1

  absorbed = []
  seq_freq.keys.combination(2).each do |first, second|
    next unless ViralSeq.compare_two_seq(first, second) <= cutoff

    absorbed << (seq_freq[first] >= seq_freq[second] ? second : first)
  end

  seq_freq.reject { |sequence, _freq| absorbed.include?(sequence) }
end
387
+
388
+
389
+ # compare two sequences position by position, return the number of different positions; the sequences are NOT aligned by this function
390
+
391
# Counts the positions at which two sequences differ.
#
# seq1, seq2 - the sequences to compare; no alignment is performed
#
# Only the positions of seq1 are visited, so a position that seq2 does not
# have counts as a difference while extra positions of seq2 are ignored.
#
# Returns the number of differing positions as an Integer.
def self.compare_two_seq(seq1 = "", seq2 = "")
  (0...seq1.size).count { |position| seq1[position] != seq2[position] }
end
401
+
402
+ # gap strip from a sequence alignment
403
+
404
# Removes every alignment column in which at least one sequence has a gap.
#
# sequence_alignment - Hash {name => aligned sequence}; the length of the
#                      first sequence decides how many columns are examined
#
# Returns a new Hash {name => sequence} made of the gap-free columns only.
def self.gap_strip(sequence_alignment)
  aligned = sequence_alignment.values
  width = aligned[0].size

  # Columns in which no sequence shows "-".
  gap_free = (0...width).reject do |column|
    aligned.any? { |s| s[column] == "-" }
  end

  sequence_alignment.each_with_object({}) do |(name, s), stripped|
    stripped[name] = gap_free.inject("") { |kept, column| kept + s[column] }
  end
end
426
+
427
+ # gap strip from a sequence alignment, only strip the gaps at the ends of the alignment
428
+
429
# Trims gap-containing columns from both ends of an alignment.
#
# sequence_alignment - Hash {name => aligned sequence}; the length of the
#                      first sequence decides how many columns are examined
#
# Columns are dropped from the left until the first column in which no
# sequence has "-", and likewise from the right. Gaps inside the remaining
# region are kept.
#
# Returns a new Hash {name => trimmed sequence}.
def self.gap_strip_ends(sequence_alignment)
  aligned = sequence_alignment.values

  # One flag per column: does any sequence have a gap there?
  gapped = (0...aligned[0].size).map do |column|
    aligned.any? { |s| s[column] == "-" }
  end

  leading = gapped.take_while { |has_gap| has_gap }.size
  trailing = gapped.reverse.take_while { |has_gap| has_gap }.size

  sequence_alignment.each_with_object({}) do |(name, s), trimmed|
    trimmed[name] = s[leading..(-trailing - 1)]
  end
end
464
+
465
+ # input paired-end sequence hash format seq_name => [r1_seq, r2_seq]
466
+ # overlap is pre-determined
467
# Joins paired-end reads when the overlap size is known in advance.
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}
# overlap       - Integer number of overlapping bases; 0 simply concatenates
#                 the two reads
# diff          - Integer or Float, highest tolerated mismatch rate within
#                 the overlap (default 0.0, no mismatch allowed)
#
# A pair is joined when ViralSeq.compare_two_seq reports at most
# overlap * diff differences between the last +overlap+ bases of read 1 and
# the first +overlap+ bases of read 2; other pairs are left out.
#
# Returns a Hash {seq_name => joined sequence}.
# Raises ArgumentError for an overlap that is not an Integer or a diff that
# is neither Integer nor Float.
def self.paired_join1(seq_pair_hash, overlap, diff = 0.0)
  unless overlap.is_a?(Integer)
    raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.")
  end
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.")
  end

  seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
    r1_seq = seq_pair[0]
    r2_seq = seq_pair[1]
    if overlap.zero?
      joined[seq_name] = r1_seq + r2_seq
    elsif ViralSeq.compare_two_seq(r1_seq[-overlap..-1], r2_seq[0, overlap]) <= (overlap * diff)
      # read 1 in full, then the part of read 2 behind the overlap
      joined[seq_name] = r1_seq + r2_seq[overlap..-1]
    end
  end
end
484
+
485
+
486
+ # overlap is not predetermined
487
+ # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
488
+ # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
489
# Joins paired-end reads when the overlap size is not known in advance.
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}
# model         - 1: one overlap size for all pairs, obtained from
#                    ViralSeq.determine_overlap_pid_pair and applied through
#                    ViralSeq.paired_join1
#                 2: the overlap is chosen for every pair on its own: the
#                    largest size from ViralSeq.overlap_matrix whose number
#                    of differences is at most size * diff; pairs without
#                    such a size are simply concatenated
# diff          - Integer or Float, highest tolerated mismatch rate within
#                 the overlap (default 0.0)
#
# Returns a Hash {seq_name => joined sequence}. Any ArgumentError raised on
# the way (wrong diff type, unknown model, errors from the helpers) is
# printed with puts and nil is returned instead.
def self.paired_join2(seq_pair_hash, model = 1, diff = 0.0)
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.")
  end

  if model == 1
    shared_overlap = ViralSeq.determine_overlap_pid_pair(seq_pair_hash, diff)
    ViralSeq.paired_join1(seq_pair_hash, shared_overlap, diff)
  elsif model == 2
    seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
      accepted = []
      ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1]).each do |size, diff_nt|
        accepted << size if diff_nt <= size * diff
      end
      joined[seq_name] =
        if accepted.empty?
          seq_pair[0] + seq_pair[1]
        else
          seq_pair[0] + seq_pair[1][accepted.max..-1]
        end
    end
  else
    raise ArgumentError.new("Error::Wrong Overlap Model Argument. Given \'#{model}\', expected '1' or '2'.")
  end
rescue ArgumentError => e
  puts e
  nil
end
519
+
520
+ # determine overlap size from a paired sequence Hash object
521
# Determines one overlap size for a whole set of read pairs.
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq]}
# diff          - highest tolerated mismatch rate within the overlap
#
# For every pair the largest size from ViralSeq.overlap_matrix whose number
# of differences is at most size * diff is taken (0 when there is none).
# The sizes are counted with ViralSeq.count and the most frequent one wins;
# among equally frequent sizes the largest is returned.
#
# Returns the overlap size, or nil for an empty seq_pair_hash.
def self.determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
  best_per_pair = seq_pair_hash.map do |_seq_name, seq_pair|
    accepted = []
    ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1]).each do |size, diff_nt|
      accepted << size if diff_nt <= size * diff
    end
    accepted.empty? ? 0 : accepted.max
  end

  size_counts = ViralSeq.count(best_per_pair)
  top_count = size_counts.values.max
  most_common = []
  size_counts.each { |size, seen| most_common << size if seen == top_count }
  most_common.max
end
542
+
543
+ # input a pair of sequences as String, return a Hash object of overlapping Hash object
544
+ # {:overlap_size => number_of_different_positions, ...}
545
+ # {minimal overlap set to 4. }
546
# Compares the end of one read with the start of another for every candidate
# overlap size.
#
# sequence1 - first read; its last +overlap+ bases are compared
# sequence2 - second read; its first +overlap+ bases are compared
#
# Overlap sizes run from 4 up to the length of the shorter read, because an
# overlap cannot be longer than either read. Reads shorter than 4 bases give
# an empty Hash.
#
# Returns a Hash {overlap_size => number of differing positions} as counted
# by ViralSeq.compare_two_seq.
def self.overlap_matrix(sequence1, sequence2)
  min_overlap = 4
  max_overlap = [sequence1.size, sequence2.size].min
  (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
    matrix_hash[overlap] = ViralSeq.compare_two_seq(sequence1[-overlap..-1], sequence2[0, overlap])
  end
end
555
+
556
+ end