viral_seq 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,392 @@
1
+ # lib/sequence.rb
2
+ # Includes functions for sequence operations
3
+ # Including methods as:
4
+ # ViralSeq::AMINO_ACID_LIST
5
+ # ViralSeq::Sequence
6
+ # ViralSeq::Sequence#rev_complement
7
+ # ViralSeq::Sequence#get_aa_sequence
8
+ # ViralSeq::Sequence#get_aa_array
9
+ # ViralSeq::Sequence#name
10
+ # ViralSeq::Sequence#dna_sequence
11
+ # ViralSeq::Sequence#aa_sequence
12
+ # ViralSeq::Sequence#aa_array
13
+ # ViralSeq::amino_acid
14
+ # ViralSeq::amino_acid_2
15
+ # ViralSeq::to_list
16
+ # ViralSeq::uniq_sequence_hash
17
+ # ViralSeq::stop_codon_seq_hash
18
+ # String#rc
19
+ # String#mutation
20
+ # String#nt_parser
21
+
22
+ # ViralSeq::AMINO_ACID_LIST
23
+ # # Array of all amino acid one letter abbreviations
24
+
25
+ # ViralSeq::Sequence
26
+ # # Sequence class
27
+ # =USAGE
28
+ # # create a sequence object
29
+ # seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
30
+ #
31
+ # # print dna sequence
32
+ # puts seq.dna_sequence
33
+ #
34
+ # reverse complement sequence of DNA sequence, return as a string
35
+ # seq.rev_complement
36
+ #
37
+ # # change @dna_sequence to reverse complement DNA sequence
38
+ # seq.rev_complement!
39
+ #
40
+ # # generate amino acid sequences. either return string or array.
41
+ # # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
42
+ # # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
43
+ # seq.get_aa_sequence
44
+ # # or
45
+ # seq.get_aa_array
46
+ #
47
+ # # print amino acid sequence
48
+ # puts seq.aa_sequence
49
+
50
+ # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
51
+ # # collapse sequence hash to unique sequence hash.
52
+ # # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
53
+ # # master_sequence_tag is the master tag for unique sequences
54
+ # # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
55
+ # =USAGE
56
+ # sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
57
+ # '>seq4' => 'CCCC', '>seq5' => 'CCCC',
58
+ # '>seq6' => 'TTTT' }
59
+ # uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
60
+ # => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
61
+
62
+ module ViralSeq
63
+
64
+ # array for all amino acid one letter abbreviations
65
+ AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
66
+
67
# Sequence class: a named nucleotide sequence plus its most recent translations.
#
# Attributes (all read/write):
#   name         - sequence tag, defaults to ">sequence"
#   dna_sequence - nucleotide String, upcased when the object is created
#   aa_sequence  - String set by #get_aa_sequence ("" until it is called)
#   aa_array     - Array set by #get_aa_array ([] until it is called)
class Sequence
  attr_accessor :name, :dna_sequence, :aa_sequence, :aa_array

  def initialize(name = ">sequence", dna_sequence = "")
    @name = name
    @dna_sequence = dna_sequence.upcase
    @aa_sequence = ""
    @aa_array = []
  end

  # Return the reverse complement of the DNA sequence as a new String.
  # Only A/T/C/G are complemented; every other character is kept as is.
  def rev_complement
    @dna_sequence.reverse.upcase.tr('ATCG', 'TAGC')
  end

  # Replace @dna_sequence with its reverse complement and return it.
  def rev_complement!
    @dna_sequence = rev_complement
  end

  # Translate the DNA sequence, one character per codon, using
  # ViralSeq.amino_acid. Stores the result in @aa_sequence and returns it.
  # initial_position is the 0-based offset of the first codon
  # (0, 1, 2 for the 1st, 2nd, 3rd reading frame).
  # Trailing bases that do not form a full codon are ignored.
  def get_aa_sequence(initial_position = 0)
    codons = codons_from(@dna_sequence, initial_position)
    # The translation table is a module-level method of ViralSeq, not an
    # instance method of Sequence, so it must be called with its receiver.
    @aa_sequence = codons.map { |codon| ViralSeq.amino_acid(codon) }.join
  end

  # get amino acid calls, return an Array. keep ambiguity calls.
  # Gaps ("-") are turned into "N" before each codon is handed to
  # ViralSeq.amino_acid_2. Stores the result in @aa_array and returns it.
  def get_aa_array(initial_position = 0)
    codons = codons_from(@dna_sequence.tr('-', 'N'), initial_position)
    @aa_array = codons.map { |codon| ViralSeq.amino_acid_2(codon) }
  end

  # Number of characters in the DNA sequence.
  def dna_length
    @dna_sequence.length
  end

  # Number of characters in the last result of #get_aa_sequence.
  def aa_length
    @aa_sequence.length
  end

  private

  # Split nt_sequence, starting at initial_position, into complete 3-base
  # codons. A start position beyond the end of the sequence yields [].
  def codons_from(nt_sequence, initial_position)
    nt_sequence[initial_position..-1].to_s
                                     .each_char
                                     .each_slice(3)
                                     .select { |triplet| triplet.size == 3 }
                                     .map(&:join)
  end
end
119
+
120
# generate amino acid abbreviations from 3 bases, ambiguity will return "#"
#
# bases - a codon as a 3-character String of upper-case IUPAC nucleotide codes.
#
# Returns the one-letter amino acid code ("*" for a stop codon). An ambiguous
# codon is still translated when every nucleotide combination it stands for
# encodes the same amino acid (e.g. "YTR" => "L", "MGR" => "R"); any other
# input returns "#".
#
# Patterns are anchored to the whole string (\A..\z), so only an exact
# 3-character codon can match.
def self.amino_acid(bases)
  case bases
  when /\ATT[TCY]\z/                    then "F"
  when /\A(?:TT[AGR]|CT.|YT[AGR])\z/    then "L"
  when /\AAT[TCAHYWM]\z/                then "I"
  when "ATG"                            then "M"
  when /\AGT.\z/                        then "V"
  # Serine is TCN or AGY. No single ambiguity code spans both families
  # without also covering Thr/Cys codons, so nothing broader is accepted.
  when /\A(?:TC.|AG[TCY])\z/            then "S"
  when /\ACC.\z/                        then "P"
  when /\AAC.\z/                        then "T"
  when /\AGC.\z/                        then "A"
  when /\ATA[TCY]\z/                    then "Y"
  when /\A(?:TA[AGR]|T[GR]A)\z/         then "*"
  when /\ACA[TCY]\z/                    then "H"
  when /\ACA[AGR]\z/                    then "Q"
  when /\AAA[TCY]\z/                    then "N"
  when /\AAA[AGR]\z/                    then "K"
  when /\AGA[TCY]\z/                    then "D"
  when /\AGA[AGR]\z/                    then "E"
  when /\ATG[TCY]\z/                    then "C"
  when "TGG"                            then "W"
  when /\A(?:CG.|[AM]G[AGR])\z/         then "R"
  when /\AGG.\z/                        then "G"
  else "#"
  end
end
181
+
182
# keep ambiguities, return all possible amino acids.
#
# bases - a codon String; each of its first three characters is expanded with
#         ViralSeq.to_list into the nucleotides it may stand for.
#
# Every combination of the expanded bases is translated, and the distinct
# amino acids are returned joined by "/" in order of first appearance
# (e.g. "N/D"). If any position expands to an empty list there is nothing to
# translate and "" is returned.
def self.amino_acid_2(bases)
  first, second, third = (0..2).map { |index| ViralSeq.to_list(bases[index]) }
  codons = first.product(second, third).map { |b1, b2, b3| b1 + b2 + b3 }

  amino_acids = codons.map do |codon|
    case codon
    when /^TT[TCY]$/          then "F"
    when /^TT[AGR]$/, /^CT.$/ then "L"
    when /^AT[TCAHYWM]$/      then "I"
    when "ATG"                then "M"
    when /^GT.$/              then "V"
    when /^TC.$/              then "S"
    when /^CC.$/              then "P"
    when /^AC.$/              then "T"
    when /^GC.$/              then "A"
    when /^TA[TCY]$/          then "Y"
    when /^TA[AGR]$/, /^T[GR]A$/ then "*"
    when /^CA[TCY]$/          then "H"
    when /^CA[AGR]$/          then "Q"
    when /^AA[TCY]$/          then "N"
    when /^AA[AGR]$/          then "K"
    when /^GA[TCY]$/          then "D"
    when /^GA[AGR]$/          then "E"
    when /^TG[TCY]$/          then "C"
    when "TGG"                then "W"
    when /^CG.$/              then "R"
    when /^AG[TCY]$/          then "S"
    when /^[AM]G[AGR]$/       then "R"
    when /^GG.$/              then "G"
    # The two patterns below must stay last: they overlap earlier branches
    # and only apply to codons nothing above has claimed.
    when /^[ATW][CGS][CTY]$/  then "S"
    when /^[TCY]T[AGR]$/      then "L"
    else "-"
    end
  end

  amino_acids.uniq.join('/')
end
269
+
270
# parse ambiguity bases, aka %w{W S M K R Y B D H V N}
#
# base - a single upper-case IUPAC nucleotide code.
#
# Returns a new Array of the unambiguous nucleotides the code stands for:
# the base itself for A/T/C/G, the expansion for an ambiguity code, and []
# for anything else (gaps, lower case, multi-character strings, nil, ...).
def self.to_list(base = "")
  case base
  # Anchored character class: exactly one of the four plain nucleotides.
  when /\A[ATCG]\z/ then [base]
  when "W" then ['A', 'T']
  when "S" then ['C', 'G']
  when "M" then ['A', 'C']
  when "K" then ['G', 'T'] # keto: G or T
  when "R" then ['A', 'G']
  when "Y" then ['C', 'T']
  when "B" then ['C', 'G', 'T']
  when "D" then ['A', 'G', 'T']
  when "H" then ['A', 'C', 'T']
  when "V" then ['A', 'C', 'G']
  when "N" then ['A', 'T', 'C', 'G']
  else []
  end
end
302
+
303
# ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
# collapse sequence hash to unique sequence hash.
# input_sequence_hash is a sequence hash {:name => :sequence, ...}
# master_sequence_tag is the master tag for unique sequences
# sequences will be named as (">" + master_sequence_tag + "_" + Integer + "_" + Counts),
# where Integer is a running number starting at 1 and Counts is the value
# ViralSeq.count reports for that sequence.
# NOTE(review): ViralSeq.count is defined outside this file; it is iterated
# here as (sequence, count) pairs - confirm it returns a Hash of occurrences.
def self.uniq_sequence_hash(seq = {}, sequence_name = "sequence")
  serial = 0
  ViralSeq.count(seq.values).each_with_object({}) do |(sequence, copies), collapsed|
    serial += 1
    collapsed[">" + sequence_name + "_#{serial}_#{copies}"] = sequence
  end
end
320
+
321
# input a sequence hash, return a sequence hash with stop codons.
#
# seq_hash - {name => nucleotide sequence, ...}
# rf       - offset of the first codon passed to Sequence#get_aa_array
#            (0, 1 or 2 for the three reading frames)
#
# Returns a new Hash holding only the entries whose amino acid array
# contains a "*" element.
def self.stop_codon_seq_hash(seq_hash, rf = 0)
  seq_hash.each_with_object({}) do |(name, nt_sequence), with_stop|
    amino_acids = Sequence.new(name, nt_sequence).get_aa_array(rf)
    with_stop[name] = nt_sequence if amino_acids.include?("*")
  end
end
333
+
334
+ end
335
+
336
+ # functions added to Class::String for direct operation on sequence if it is a String object
337
+ # String.rc
338
+ # # reverse complement
339
+ # # example
340
+ # "ACAGA".rc
341
+ # => "TCTGT"
342
+ #
343
+ # String.mutation(error_rate)
344
+ # # mutate a nt sequence (String class) randomly
345
+ # # must define error rate, default value 0.01, aka 1%
346
+ # =USAGE
347
+ # # example
348
+ # seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
349
+ # seq.mutation(0.05)
350
+ # => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
351
+ #
352
+ # String.nt_parser
353
+ # # parse the nucleotide sequences as a String object and return a Regexp object for possible matches
354
+ # =USAGE
355
+ # "ATRWCG".nt_parser
356
+ # => /AT[A|G][A|T]CG/
357
+
358
class String
  # direct function of calling reverse complement on String class
  # Only upper-case A/C/T/G are complemented; other characters are kept.
  def rc
    reverse.tr("ACTG", "TGAC")
  end

  # Return a copy of the nucleotide sequence in which each position has been
  # replaced, with probability error_rate (resolution 1/10000), by a randomly
  # chosen different base out of A/C/T/G.
  def mutation(error_rate = 0.01)
    mutated = ""
    each_char do |nt|
      threshold = error_rate * 10000
      roll = rand(10000)
      alternatives = ["A", "C", "T", "G"] - [nt]
      mutated << (roll < threshold ? alternatives.sample : nt)
    end
    mutated
  end

  # Build a Regexp matching the sequence, turning every ambiguity code into a
  # bracket expression of the bases returned by ViralSeq.to_list,
  # e.g. "ATRWCG" => /AT[A|G][A|T]CG/.
  def nt_parser
    source = each_char.map do |base|
      candidates = ViralSeq.to_list(base)
      if candidates.size == 1
        candidates[0]
      else
        "[" + candidates.join("|") + "]"
      end
    end
    Regexp.new source.join
  end
end
@@ -0,0 +1,556 @@
1
+ # viral_seq/tcs_core
2
+ # core functions for TCS and DR pipeline
3
+ # functions to manipulate sequences including:
4
+ # ViralSeq::calculate_pid_cut_off
5
+ # ViralSeq::consensus
6
+ # ViralSeq::generate_primer_id_pool
7
+ # ViralSeq::similar_pid?
8
+ # ViralSeq::filter_similar_pid
9
+ # ViralSeq::collapse_sequence_by_x_nt_difference
10
+ # ViralSeq::compare_two_seq
11
+ # ViralSeq::gap_strip
12
+ # ViralSeq::gap_strip_ends
13
+ # ViralSeq::paired_join1
14
+ # ViralSeq::paired_join2
15
+
16
+ # ViralSeq.calculate_pid_cut_off(PID_abundance, estimated_error_rate)
17
+ # # A function to calculate cut-off for offspring primer IDs.
18
+ # # see reference at Zhou et al. JVI 2016.
19
+ # # https://www.ncbi.nlm.nih.gov/pubmed/26041299
20
+ # # PID_abundance is the abundance of a certain PID
21
+ # # estimated_error_rate is the estimated platform error rate, 0.02 (2%) as default
22
+ # # the model supports error rate from 0.003 to 0.03.
23
+ # # return an abundance cut-off (Integer) for offspring Primer IDs.
24
+
25
+ # ViralSeq.consensus(seq_array, majority_cutoff)
26
+ # # Generate a consensus sequence from a given sequence array.
27
+ # # where seq_array is an Array of input sequences (aligned) [seq1, seq2, seq3, ...]
28
+ # # majority_cutoff is a Float of majority cut-off. default as simply majority (0.5)
29
+ # =USAGE
30
+ # a_consensus_sequence = ViralSeq.consensus(seq_array, majority_cutoff)
31
+
32
+ # ViralSeq.generate_primer_id_pool(n)
33
+ # # generate all Primer ID combinations given the length of Primer ID
34
+ # # n is the length of the Primer ID (Integer). default value of n is 8.
35
+ # =USAGE
36
+ # primer_id_pool = ViralSeq.generate_primer_id_pool(10) # 10 is the length of Primer ID
37
+ # puts primer_id_pool.size #should be 4^10
38
+ # => 1048576
39
+
40
+ # ViralSeq.similar_pid?(pid1, pid2, base_difference)
41
+ # # compare two primer ID sequences.
42
+ # # If they have the same length and differ in no more than base_difference bases, return boolean value "TRUE",
43
+ # # else, return boolean value "FALSE"
44
+ # # where pid1 and pid2 are two Primer IDs for comparison
45
+ # # base_difference is an Integer for difference bases that allowed
46
+ # =USAGE
47
+ # # example
48
+ # ViralSeq.similar_pid?("AAGGCTACGA", "AAGGATACGA", 1)
49
+ # => true
50
+
51
+ # ViralSeq.filter_similar_pid(sequence_fasta_file, cut_off)
52
+ # # compare PID with sequences which have identical sequences.
53
+ # # PIDs differ by 1 base will be recognized.
54
+ # # if PID1 is x times (cut-off) more abundant than PID2, PID2 will be discarded
55
+ # # where sequence_fasta_file is the sequence file in fasta format
56
+ # # each sequence tag starting with ">" and the Primer ID sequence
57
+ # # followed by the number of Primer ID appeared in the raw sequence
58
+ # # the information sections in the tags are separated by underscore "_"
59
+ # # example sequence tag: >AGGCGTAGA_32_sample1_RT
60
+ # # cut_off is the fold cut-off to remove the potential residual offspring Primer IDs
61
+ # # default value for cut_off is 10
62
+ # # return a new sequence hash. {sequence_name => sequence, ...}
63
+
64
+ # ViralSeq.collapse_sequence_by_x_nt_difference(sequence_array, cutoff)
65
+ # # collapse sequences with x number of nt differences.
66
+ # # input an Array object of sequences, make sure sequences are aligned.
67
+ # # return a Hash object of the collapsed sequences {sequence => count, ...}
68
+ # # The returned frequency is NOT the frequency of the collapsed sequences.
69
+
70
+ # ViralSeq.compare_two_seq(seq1, seq2)
71
+ # # compare two sequences as String object, return the number of differences as integer
72
+ # # sequences will NOT align
73
+ # # can use ViralSeq.muscle_align(seq1, seq2) to get the aligned sequences
74
+ # =USAGE
75
+ # # example
76
+ # seq1 = 'AAGGCGTAGGAC'
77
+ # seq2 = 'AAGCTTAGGACG'
78
+ # puts ViralSeq.compare_two_seq(seq1, seq2)
79
+ # => 8
80
+ # aligned_seqs = ViralSeq.muscle_align(seq1,seq2)
81
+ # puts ViralSeq.compare_two_seq(aligned_seqs.values[0], aligned_seqs.values[1])
82
+ # => 4
83
+
84
+ # ViralSeq.gap_strip(sequence_hash)
85
+ # # strip positions with gaps in the sequence alignment as Hash object {:name => sequence, ...}
86
+ # =USAGE
87
+ # # example
88
+ # sequence_hash = {'>seq1' => 'AACCGGTT',
89
+ # '>seq2' => 'A-CCGGTT',
90
+ # '>seq3' => 'AAC-GGTT',
91
+ # '>seq4' => 'AACCG-TT',
92
+ # '>seq5' => 'AACCGGT-'}
93
+ # ViralSeq.gap_strip(sequence_hash)
94
+ # => {">seq1"=>"ACGT", ">seq2"=>"ACGT", ">seq3"=>"ACGT", ">seq4"=>"ACGT", ">seq5"=>"ACGT"}
95
+
96
+ # ViralSeq.gap_strip_ends(sequence_hash)
97
+ # # similar to ViralSeq.gap_strip , but only strip the gaps at both ends of the alignment
98
+ # =USAGE
99
+ # # example
100
+ # sequence_hash = {'>seq1' => 'AACCGGTT',
101
+ # '>seq2' => 'A-CCGGTT',
102
+ # '>seq3' => 'AAC-GGTT',
103
+ # '>seq4' => 'AACCG-TT',
104
+ # '>seq5' => 'AACCGGT-'}
105
+ # ViralSeq.gap_strip_ends(sequence_hash)
106
+ # => {">seq1"=>"AACCGGT", ">seq2"=>"A-CCGGT", ">seq3"=>"AAC-GGT", ">seq4"=>"AACCG-T", ">seq5"=>"AACCGGT"}
107
+
108
+ # ViralSeq.paired_join1(sequence_pair_hash, overlap, difference_cut_off)
109
+ # # paired-end join function for KNOWN overlap size
110
+ # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
111
+ # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
112
+ # # overlap is an integer that indicate how many bases are overlapped.
113
+ # # overlap value at 0 means no overlap. R1 and R2 will be simply put together.
114
+ # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
115
+ # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
116
+ # =USAGE
117
+ # # example
118
+ # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
119
+ # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
120
+ # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
121
+ # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
122
+ # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
123
+ # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
124
+ # ViralSeq.paired_join1(paired_seqs, 100, 0.0).keys
125
+ # => [">pair1"]
126
+ # ViralSeq.paired_join1(paired_seqs, 100, 0.01).keys
127
+ # => [">pair1", ">pair2"]
128
+ # ViralSeq.paired_join1(paired_seqs, 100, 0.02).keys
129
+ # => [">pair1", ">pair2", ">pair3"]
130
+
131
+ # ViralSeq.paired_join2(seq_pair_hash, model, diff)
132
+ # # paired-end join function for UNKNOWN overlap
133
+ # # sequence_pair_hash is a Hash object for paired sequences {:seq_name => [:r1_seq, :r2_seq], ...}
134
+ # # can use ViralSeq::pair_fasta_to_hash to load paired r1 and r2 sequences into paired sequence hash
135
+ # # model has two options, 1 or 2 as Integer
136
+ # # model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
137
+ # # model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
138
+ # # minimal overlap by model 2 set to 4 positions
139
+ # # if the sequence overlap is smaller than 4 bases, the model will consider it as no overlap.
140
+ # # difference_cut_off is a Float variable for the maximum mismatch rate allowed for the overlapping region
141
+ # # default value for difference_cut_off is 0.0, i.e. no mis-match allowed
142
+ # =USAGE
143
+ # # example 1
144
+ # paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
145
+ # "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
146
+ # ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
147
+ # "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
148
+ # ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
149
+ # "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
150
+ # ViralSeq.paired_join2(paired_seqs, 1).keys
151
+ # => [">pair1"]
152
+ # ViralSeq.paired_join2(paired_seqs, 1, 0.01).keys
153
+ # => [">pair1", ">pair2"]
154
+ #
155
+ # # example 2
156
+ # paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
157
+ # ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
158
+ # ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"]
159
+ # }
160
+ # ViralSeq.paired_join2(paired_seq2, 1)
161
+ # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
162
+ # ViralSeq.paired_join2(paired_seq2, 2)
163
+ # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
164
+
165
+
166
+ module ViralSeq
167
+
168
# calculate cut-off for offspring primer IDs.
#
# m          - abundance of a Primer ID
# error_rate - estimated platform error rate; must lie within 0..0.03
#
# Returns the Integer cut-off. Abundances of 10 or less always give 2 (the
# error rate is not inspected in that case); otherwise a polynomial chosen by
# the error-rate band is evaluated and rounded, and any result below 3 is
# reported as 2.
# Raises ArgumentError when m > 10 and error_rate is outside 0..0.03.
def self.calculate_pid_cut_off(m, error_rate = 0.02)
  return 2 if m <= 10

  estimate =
    case error_rate
    when 0...0.0075
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    when 0.0075...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    when 0.015..0.03
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        # above 8500 the highest error band switches to a linear fit
        0.0079 * m + 9.4869
      end
    else
      raise ArgumentError.new('Error_rate has be between 0 to 0.03')
    end

  cut_off = estimate.round
  cut_off < 3 ? 2 : cut_off
end
192
+
193
# create one consensus sequence from a sequence array with an optional majority cut-off for mixed bases.
# example:
# position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
#
# seq_array - Array of aligned sequences; the length of the first sequence
#             decides how many positions are called
# cutoff    - minimum fraction (of all sequences) a base needs at a position
#             to take part in the call
#
# Returns the consensus String, or "" when seq_array is empty.
# NOTE(review): relies on ViralSeq.count, defined outside this file, returning
# a Hash of base => occurrences - confirm.
def self.consensus(seq_array, cutoff = 0.5)
  # nothing to align against: an empty input has an empty consensus
  return "" if seq_array.empty?

  total = seq_array.size.to_f
  consensus_seq = ""
  (0...seq_array[0].size).each do |position|
    column = seq_array.map { |seq| seq[position] }
    called_bases = ViralSeq.count(column)
                           .select { |_base, occurrences| occurrences / total >= cutoff }
                           .keys
    consensus_seq << ViralSeq.call_consensus_base(called_bases)
  end
  consensus_seq
end
217
+
218
# call consensus nucleotide, used by ViralSeq.consensus
#
# base_array - Array of the bases called at one position (any order);
#              it is not modified.
#
# Returns the single base when there is only one, the IUPAC ambiguity code
# for a set of two or three of A/C/G/T, and "N" in every other case
# (empty list, four or more entries, or a set containing another character).
def self.call_consensus_base(base_array)
  size = base_array.size
  return base_array[0] if size == 1
  return "N" unless size == 2 || size == 3

  # sort a copy so the caller's array keeps its order
  case base_array.sort
  when ["A", "T"]      then "W"
  when ["C", "G"]      then "S"
  when ["A", "C"]      then "M"
  when ["G", "T"]      then "K"
  when ["A", "G"]      then "R"
  when ["C", "T"]      then "Y"
  when ["C", "G", "T"] then "B"
  when ["A", "G", "T"] then "D"
  when ["A", "C", "T"] then "H"
  when ["A", "C", "G"] then "V"
  else "N"
  end
end
257
+
258
# generate all Primer ID combinations given the length of Primer ID
#
# l - length of the Primer ID (default 8)
#
# Returns an Array of 4**l Strings over A/T/C/G; each round below appends one
# more base to every sequence generated so far. For l of 1 or less the four
# single bases are returned.
def self.generate_primer_id_pool(l=8)
  nt = ['A','T','C','G']
  (l-1).times.inject(['A','T','C','G']) do |pid_pool, _round|
    pid_pool.product(nt).map { |parts| parts.join("") }
  end
end
270
+
271
# compare two primer ID sequences.
# Returns true when both have the same length and differ in at most x
# positions, otherwise false. Primer IDs of different length are never similar.
def self.similar_pid?(pid1="",pid2="", x=0)
  required_matches = pid1.size - x
  return false if pid1.size != pid2.size

  matches = (0...pid1.size).count { |position| pid1[position] == pid2[position] }
  matches >= required_matches
end
293
+
294
# compare PID with sequences which have identical sequences.
# PIDs differ by 1 base will be recognized.
# if PID1 is x time greater than PID2, PID2 will be discarded
#
# sequence_file - path handed to ViralSeq.fasta_to_hash
# cutoff        - fold difference in abundance above which the rarer of two
#                 similar Primer IDs is dropped (default 10)
#
# Sequence names are expected to look like ">PID_count_...": the text before
# the first "_" (minus the leading character) is the Primer ID and the second
# "_"-separated field is its abundance.
# NOTE(review): ViralSeq.fasta_to_hash is defined outside this file; it is
# used here as returning {name => sequence} - confirm.
#
# Returns a new Hash {sequence_name => sequence} without the dropped PIDs.
def self.filter_similar_pid(sequence_file = "", cutoff = 10)
  seq = ViralSeq.fasta_to_hash(sequence_file)

  # One pass: collect the [pid, count] pairs of every distinct sequence.
  pids_by_sequence = Hash.new { |groups, sequence| groups[sequence] = [] }
  seq.each do |name, s|
    fields = name[1..-1].split("_")
    pids_by_sequence[s] << [fields[0], fields[1]]
  end

  # Primer IDs to drop; a Hash gives constant-time membership tests.
  dup_pid = {}
  pids_by_sequence.each_value do |pid_counts|
    next if pid_counts.size == 1
    pid_hash = Hash[pid_counts]

    pid_hash.keys.combination(2) do |pid1, pid2|
      next unless ViralSeq.similar_pid?(pid1, pid2, 1)
      n1 = pid_hash[pid1].to_i
      n2 = pid_hash[pid2].to_i
      if n1 >= cutoff * n2
        dup_pid[pid2] = true
      elsif n2 >= cutoff * n1
        dup_pid[pid1] = true
      end
    end
  end

  seq.each_with_object({}) do |(name, s), new_seq|
    pid = name.split("_")[0][1..-1]
    new_seq[name] = s unless dup_pid.key?(pid)
  end
end
356
+
357
# collapse sequences with x number of nt differences. make sure sequences are aligned.
# The return frequency is NOT the frequency of the collapsed sequences.
#
# seq_array - Array of (aligned) sequences
# cutoff    - maximum number of differing positions for two sequences to be
#             treated as the same
#
# For every pair of distinct sequences within the cut-off, the one with the
# lower count is dropped (the second of the pair on a tie). Returns a Hash of
# the surviving sequences with their own original counts.
# NOTE(review): ViralSeq.count is defined outside this file; used here as
# returning a Hash of sequence => occurrences - confirm.
def self.collapse_sequence_by_x_nt_difference(seq_array,cutoff)
  seq_freq = ViralSeq.count(seq_array)
  return seq_freq if seq_freq.size == 1

  absorbed = []
  seq_freq.keys.combination(2) do |seq1, seq2|
    next unless ViralSeq.compare_two_seq(seq1, seq2) <= cutoff
    absorbed << (seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1)
  end

  seq_freq.each_with_object({}) do |(sequence, freq), survivors|
    survivors[sequence] = freq unless absorbed.include?(sequence)
  end
end
387
+
388
+
389
# compare two sequences, return the number of different positions.
# The sequences are NOT aligned here; they are compared index by index.
# Only the positions of seq1 are walked: a seq2 that is shorter counts one
# difference per missing position, while extra positions of a longer seq2
# are ignored.
def self.compare_two_seq(seq1 = "", seq2 = "")
  (0...seq1.size).count { |position| seq1[position] != seq2[position] }
end
401
+
402
# gap strip from a sequence alignment
#
# sequence_alignment - Hash {name => aligned sequence, ...}; the length of the
#                      first sequence decides which columns are inspected
#
# Returns a new Hash with the same names in which every column holding a "-"
# in at least one sequence has been removed. An empty alignment returns {}.
def self.gap_strip(sequence_alignment)
  return {} if sequence_alignment.empty?

  sequences = sequence_alignment.values
  kept_positions = (0...sequences[0].size).reject do |position|
    sequences.any? { |s| s[position] == "-" }
  end

  sequence_alignment.each_with_object({}) do |(name, s), new_seq_hash|
    stripped = ""
    # << raises on nil, so a sequence shorter than the alignment still fails
    # loudly instead of being silently truncated
    kept_positions.each { |position| stripped << s[position] }
    new_seq_hash[name] = stripped
  end
end
426
+
427
# gap strip from a sequence alignment, only strip the gaps at the ends of the alignment
#
# sequence_alignment - Hash {name => aligned sequence, ...}; the length of the
#                      first sequence decides which columns are inspected
#
# Returns a new Hash with the same names, trimmed by the run of leading
# columns and the run of trailing columns that hold a "-" in at least one
# sequence. Gaps inside the alignment are kept. An empty alignment returns {}.
def self.gap_strip_ends(sequence_alignment)
  return {} if sequence_alignment.empty?

  sequences = sequence_alignment.values
  gapped_columns = (0...sequences[0].size).map do |position|
    sequences.any? { |s| s[position] == "-" }
  end
  leading  = gapped_columns.take_while { |gapped| gapped }.size
  trailing = gapped_columns.reverse.take_while { |gapped| gapped }.size

  sequence_alignment.each_with_object({}) do |(name, s), new_seq_hash|
    # a negative end index trims `trailing` characters from the right
    new_seq_hash[name] = s[leading..(-trailing - 1)]
  end
end
464
+
465
# input paired-end sequence hash format seq_name => [r1_seq, r2_seq]
# overlap is pre-determined
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq], ...}
# overlap       - Integer number of bases shared by the end of r1 and the
#                 start of r2; 0 simply concatenates r1 and r2
# diff          - maximum mismatch rate (Integer or Float) tolerated inside
#                 the overlapping region, default 0.0
#
# Returns a new Hash {seq_name => joined_sequence} holding only the pairs
# whose overlapping regions differ in at most overlap * diff positions.
# A pair in which either read is shorter than overlap cannot overlap by that
# many bases and is left out as well.
# Raises ArgumentError if overlap is not a non-negative Integer or diff is
# not an Integer/Float.
def self.paired_join1(seq_pair_hash, overlap, diff = 0.0)
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
  raise ArgumentError.new(":overlap has to be non-negative, input #{overlap} invalid.") if overlap < 0
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)

  joined_seq_hash = {}
  seq_pair_hash.each do |seq_name, seq_pair|
    r1_seq = seq_pair[0]
    r2_seq = seq_pair[1]

    if overlap.zero?
      joined_seq_hash[seq_name] = r1_seq + r2_seq
      next
    end

    # reads too short to contain the overlap cannot be joined
    next if r1_seq.size < overlap || r2_seq.size < overlap

    mismatches = ViralSeq.compare_two_seq(r1_seq[-overlap..-1], r2_seq[0, overlap])
    joined_seq_hash[seq_name] = r1_seq + r2_seq[overlap..-1] if mismatches <= overlap * diff
  end
  joined_seq_hash
end
484
+
485
+
486
# overlap is not predetermined
# model 1: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
# model 2: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq], ...}
# model         - 1 or 2, see above
# diff          - maximum mismatch rate (Integer or Float) tolerated inside
#                 the overlapping region, default 0.0
#
# Returns a Hash {seq_name => joined_sequence}. In model 2 a pair without any
# acceptable overlap is joined by plain concatenation.
# Any ArgumentError raised on the way (invalid diff, invalid model, or one
# coming from the helpers called here) is printed with puts and nil is
# returned instead.
def self.paired_join2(seq_pair_hash, model = 1, diff = 0.0)
  unless diff.is_a?(Integer) || diff.is_a?(Float)
    raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.")
  end

  if model == 1
    shared_overlap = ViralSeq.determine_overlap_pid_pair(seq_pair_hash, diff)
    ViralSeq.paired_join1(seq_pair_hash, shared_overlap, diff)
  elsif model == 2
    seq_pair_hash.each_with_object({}) do |(seq_name, seq_pair), joined_seq_hash|
      acceptable = []
      ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1]).each do |candidate, diff_nt|
        acceptable << candidate if diff_nt <= candidate * diff
      end

      joined_seq_hash[seq_name] =
        if acceptable.empty?
          seq_pair[0] + seq_pair[1]
        else
          # use the largest overlap that satisfies the mismatch limit
          seq_pair[0] + seq_pair[1][acceptable.max..-1]
        end
    end
  else
    raise ArgumentError.new("Error::Wrong Overlap Model Argument. Given \'#{model}\', expected '1' or '2'.")
  end
rescue ArgumentError => e
  puts e
  return nil
end
519
+
520
# determine overlap size from a paired sequence Hash object
#
# seq_pair_hash - Hash {seq_name => [r1_seq, r2_seq], ...}
# diff          - maximum mismatch rate tolerated inside the overlap
#
# For every pair the largest overlap whose number of mismatches (taken from
# ViralSeq.overlap_matrix) is within overlap * diff is recorded, or 0 when
# there is none. Returns the overlap recorded most often; on a tie the
# largest of the tied overlaps. Returns nil for an empty seq_pair_hash.
# NOTE(review): ViralSeq.count is defined outside this file; used here as
# returning a Hash of value => occurrences - confirm.
def self.determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
  best_overlaps = seq_pair_hash.map do |_seq_name, seq_pair|
    acceptable = ViralSeq.overlap_matrix(seq_pair[0], seq_pair[1])
                         .select { |overlap, diff_nt| diff_nt <= overlap * diff }
                         .keys
    acceptable.empty? ? 0 : acceptable.max
  end

  occurrences = ViralSeq.count(best_overlaps)
  most_common = occurrences.values.max
  occurrences.select { |_overlap, counts| counts == most_common }.keys.max
end
542
+
543
# input a pair of sequences as String, return a Hash object of overlapping Hash object
# {:overlap_size => number_of_different_positions, ...}
# {minimal overlap set to 4. }
#
# sequence1 - first read; its last `overlap` bases are compared
# sequence2 - second read; its first `overlap` bases are compared
#
# Every overlap size from 4 up to the length of the shorter read is scored
# with ViralSeq.compare_two_seq. An overlap cannot exceed either read, so
# larger sizes are not considered. Returns {} when either read has fewer
# than 4 bases.
def self.overlap_matrix(sequence1, sequence2)
  min_overlap = 4
  max_overlap = [sequence1.size, sequence2.size].min
  (min_overlap..max_overlap).each_with_object({}) do |overlap, matrix_hash|
    matrix_hash[overlap] = ViralSeq.compare_two_seq(sequence1[-overlap..-1], sequence2[0, overlap])
  end
end
555
+
556
+ end