viral_seq 1.0.4 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -4
- data/README.md +95 -26
- data/bin/locator +40 -35
- data/bin/tcs +519 -0
- data/bin/tcs_json_generator +166 -0
- data/lib/viral_seq.rb +1 -1
- data/lib/viral_seq/hash.rb +1 -1
- data/lib/viral_seq/hivdr.rb +2 -0
- data/lib/viral_seq/muscle.rb +2 -2
- data/lib/viral_seq/seq_hash.rb +214 -36
- data/lib/viral_seq/seq_hash_pair.rb +10 -6
- data/lib/viral_seq/version.rb +2 -1
- data/viral_seq.gemspec +5 -1
- metadata +23 -5
@@ -0,0 +1,166 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# TCS pipeline JSON params generator.
|
4
|
+
|
5
|
+
require 'viral_seq'
|
6
|
+
require 'colorize'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
# Interactively ask which reference genome to use.
#
# Prints a colorized menu, reads the choice from standard input and keeps
# re-prompting until the entry converts (via String#to_i) to 1, 2 or 3.
#
# @return [Symbol] :HXB2, :NL43 or :MAC239
def get_ref
  choices = { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }

  puts "Choose reference genome (1-3):"
  puts "1. HIV-1 HXB2".red.bold
  puts "2. HIV-1 NL4-3".blue.bold
  puts "3. SIV MAC239".magenta.bold
  print "> "
  ref_option = gets.chomp.rstrip
  until choices.key?(ref_option.to_i)
    # Keep the raw text (no to_i here) so the message echoes what was actually typed.
    print "Entered reference genome option #{ref_option.red.bold} not valid (choose 1-3), try again\n> "
    ref_option = gets.chomp.rstrip
  end
  choices[ref_option.to_i]
end
|
29
|
+
|
30
|
+
# Banner, then the two global settings: input directory and platform error rate.
banner_rule = '-' * 58
puts "\n" + banner_rule
puts "| JSON Parameter Generator for #{"TCS #{ViralSeq::TCS_VERSION}".red.bold} by #{"Shuntai Zhou".blue.bold} |"
puts banner_rule + "\n"

# Collected parameters; keys are added in the order they appear in the JSON output.
param = {}

puts 'Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file'
print '> '
param[:raw_sequence_dir] = gets.chomp.rstrip

puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
print '> '
entered_rate = gets.chomp.rstrip.to_f
# String#to_f yields 0.0 for blank or non-numeric input, which selects the default.
param[:platform_error_rate] = entered_rate.zero? ? 0.02 : entered_rate

param[:primer_pairs] = []
|
50
|
+
|
51
|
+
# Collect one primer-pair entry per pass and append it to param[:primer_pairs].
# The loop repeats for as long as the user answers yes to the final prompt.
# NOTE: the yes/no patterns below are unanchored, so any answer containing
# "y" (or "n" for the indel question) matches.
loop do
  data = {}
  puts "Enter the name for the sequenced region: "
  print '> '
  data[:region] = gets.chomp.rstrip

  puts "Enter the #{"cDNA".red.bold} primer sequence: "
  print '> '
  data[:cdna] = gets.chomp.rstrip

  puts "Enter the #{"forward".blue.bold} primer sequence: "
  print '> '
  data[:forward] = gets.chomp.rstrip

  puts "Enter supermajority cut-off (0.5 - 0.9). Default: " + "0.5".blue.bold + " (simple majority)"
  print '> '
  mj = gets.chomp.rstrip.to_f
  # Anything outside 0.5..0.9 (including blank input, which parses to 0.0) falls back to 0.5.
  data[:majority] = (0.5..0.9).include?(mj) ? mj : 0.5

  print "Need end-join? Y/N \n> "
  ej = gets.chomp.rstrip
  if ej =~ /y|yes/i
    data[:end_join] = true

    print "End-join option? Choose from (1-4):\n\n" \
          "1: simple join, no overlap\n" \
          "2: known overlap \n\n" \
          "3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n\n" \
          "4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n\n" \
          "> "
    ej_option = gets.chomp.rstrip
    while ![1,2,3,4].include?(ej_option.to_i)
      puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
      # Keep the answer as a String: converting it to Integer here made the
      # message above raise NoMethodError (Integer#red) on a second invalid entry.
      ej_option = gets.chomp.rstrip
    end
    case ej_option.to_i
    when 1
      data[:end_join_option] = 1
      data[:overlap] = 0
    when 2
      # NOTE(review): menu choice 2 is stored as option 1 plus a user-supplied
      # overlap -- presumably option 1 means "join with a known overlap"; confirm
      # against the TCS pipeline.
      data[:end_join_option] = 1
      print "overlap bases: \n> "
      ol = gets.chomp.rstrip.to_i
      data[:overlap] = ol
    when 3
      data[:end_join_option] = 3
    when 4
      data[:end_join_option] = 4
    end

    # QC and trimming are only offered when end-join is enabled.
    print "Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "
    qc = gets.chomp.rstrip
    if qc =~ /y|yes/i
      data[:TCS_QC] = true

      data[:ref_genome] = get_ref

      print "reference 5'end ref position or posiiton range, 0 if no need to match this end \n> "
      data[:ref_start] = gets.chomp.rstrip.to_i

      print "reference 3'end ref position or posiiton range: 0 if no need to match this end \n> "
      data[:ref_end] = gets.chomp.rstrip.to_i

      print "allow indels? (default as yes) Y/N \n> "
      indel = gets.chomp.rstrip
      if indel =~ /n|no/i
        data[:indel] = false
      else
        data[:indel] = true
      end
    else
      data[:TCS_QC] = false
    end

    print "Need trimming to a reference genome? Y/N \n> "
    trim_option = gets.chomp.rstrip
    if trim_option =~ /y|yes/i
      data[:trim] = true
      data[:trim_ref] = get_ref

      print "reference 5'end ref position \n> "
      data[:trim_ref_start] = gets.chomp.rstrip.to_i

      print "reference 3'end ref position \n> "
      data[:trim_ref_end] = gets.chomp.rstrip.to_i

    else
      data[:trim] = false
    end

  else
    data[:end_join] = false
  end

  param[:primer_pairs] << data
  print "Do you wish to conintue? Y/N \n> "
  continue_sig = gets.chomp.rstrip
  break unless continue_sig =~ /y|yes/i

end
|
155
|
+
|
156
|
+
# Show the generated parameters as pretty-printed JSON, then optionally write them to a file.
puts "\nYour JSON string is:"
json_text = JSON.pretty_generate(param)
puts json_text

print "\nDo you wish to save it as a file? Y/N \n> "
save_option = gets.chomp.rstrip

if save_option =~ /y|yes/i
  print "Path to save JSON file:\n> "
  path = gets.chomp.rstrip
  File.open(path, 'w') do |f|
    f.puts json_text
  end
end
|
data/lib/viral_seq.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (c)
|
1
|
+
# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
|
2
2
|
#
|
3
3
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
4
|
# of this software and associated documentation files (the "Software"), to deal
|
data/lib/viral_seq/hash.rb
CHANGED
data/lib/viral_seq/hivdr.rb
CHANGED
@@ -5,6 +5,8 @@ module ViralSeq
|
|
5
5
|
# functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
|
6
6
|
# works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
|
7
7
|
# PR codon 1-99
|
8
|
+
# RT codon 34-122 (HXB2 2650-2914) and 152-236(3001-3257)
|
9
|
+
# IN codon 53-174 (HXB2 4384-4751)
|
8
10
|
# @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
|
9
11
|
# can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
|
10
12
|
# @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
|
data/lib/viral_seq/muscle.rb
CHANGED
@@ -39,8 +39,8 @@ module ViralSeq
|
|
39
39
|
|
40
40
|
def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
|
41
41
|
temp_dir = Dir.home
|
42
|
-
temp_file = temp_dir
|
43
|
-
temp_aln = temp_dir
|
42
|
+
temp_file = File.join(temp_dir, "_temp_muscle_in")
|
43
|
+
temp_aln = File.join(temp_dir, "_temp_muscle_aln")
|
44
44
|
name = ">test"
|
45
45
|
temp_in = File.open(temp_file,"w")
|
46
46
|
temp_in.puts ">ref"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -130,8 +130,8 @@ module ViralSeq
|
|
130
130
|
end
|
131
131
|
end
|
132
132
|
end
|
133
|
-
sequence_hash = Hash[
|
134
|
-
quality_hash = Hash[
|
133
|
+
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
134
|
+
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
135
135
|
|
136
136
|
seq_hash = ViralSeq::SeqHash.new
|
137
137
|
seq_hash.dna_hash = sequence_hash
|
@@ -166,6 +166,40 @@ module ViralSeq
|
|
166
166
|
alias_method :array, :new_from_array
|
167
167
|
end
|
168
168
|
|
169
|
+
# Number of nucleotide sequences held by this SeqHash object.
# @return [Integer] number of entries in the nt sequence hash (#dna_hash)
def size
  dna_hash.length
end
|
175
|
+
|
176
|
+
# Combine this SeqHash with another one into a new SeqHash.
# The nt, aa and qc hashes are merged with Hash#merge, so when a sequence name
# exists in both objects the entry from sh2 is kept. Titles are joined with
# "_with_" and file names with ",".
# @param sh2 [ViralSeq::SeqHash] another SeqHash
# @return [ViralSeq::SeqHash] combined SeqHash
def +(sh2)
  ViralSeq::SeqHash.new.tap do |combined|
    combined.dna_hash = dna_hash.merge(sh2.dna_hash)
    combined.aa_hash = aa_hash.merge(sh2.aa_hash)
    combined.qc_hash = qc_hash.merge(sh2.qc_hash)
    combined.title = title + "_with_" + sh2.title
    combined.file = file + "," + sh2.file
  end
end
|
189
|
+
|
190
|
+
# Write the nt sequences to a FASTA format file.
# Each entry of #dna_hash is written as two lines: the key (sequence name)
# followed by the value (sequence). An existing file is overwritten.
# @param file [String] path to the FASTA output file
# @return [NilClass]
def write_nt_fa(file)
  File.open(file, 'w') do |f|
    dna_hash.each do |name, seq|
      f.puts name
      f.puts seq
    end
  end
  # Return nil as documented; without this the value of the File.open block
  # (the dna hash itself) would be returned.
  nil
end
|
202
|
+
|
169
203
|
# generate sequences in relaxed sequencial phylip format from a ViralSeq::SeqHash object
|
170
204
|
# @return [String] relaxed sequencial phylip format in a String object
|
171
205
|
# @example convert fasta format to relaxed sequencial phylip format
|
@@ -215,10 +249,12 @@ module ViralSeq
|
|
215
249
|
def translate(codon_position = 0)
|
216
250
|
seqs = self.dna_hash
|
217
251
|
@aa_hash = {}
|
218
|
-
seqs.each do |
|
219
|
-
s = ViralSeq::Sequence.new(name, seq)
|
252
|
+
seqs.uniq_hash.each do |seq, array_of_name|
|
253
|
+
s = ViralSeq::Sequence.new('name', seq)
|
220
254
|
s.translate(codon_position)
|
221
|
-
|
255
|
+
array_of_name.each do |name|
|
256
|
+
@aa_hash[name] = s.aa_string
|
257
|
+
end
|
222
258
|
end
|
223
259
|
return nil
|
224
260
|
end # end of #translate
|
@@ -277,36 +313,40 @@ module ViralSeq
|
|
277
313
|
|
278
314
|
# screen for sequences with stop codons.
|
279
315
|
# @param (see #translate)
|
280
|
-
# @return [
|
316
|
+
# @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
|
281
317
|
#
|
282
|
-
# #
|
283
|
-
# #
|
318
|
+
# # :with_stop_codon : ViralSeq::SeqHash object with stop codons
|
319
|
+
# # :without_stop_codon: ViralSeq::SeqHash object without stop codons
|
284
320
|
# @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
|
285
321
|
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
286
322
|
# my_seqhash.dna_hash
|
287
323
|
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
288
|
-
# stop_codon_seqhash = my_seqhash.stop_codon[
|
324
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
|
289
325
|
# stop_codon_seqhash.dna_hash
|
290
326
|
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
291
327
|
# stop_codon_seqhash.aa_hash
|
292
328
|
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
293
329
|
# stop_codon_seqhash.title
|
294
330
|
# => "my_fasta_file_stop"
|
295
|
-
# filtered_seqhash = my_seqhash.stop_codon[
|
331
|
+
# filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
|
296
332
|
# filtered_seqhash.aa_hash
|
297
333
|
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
298
334
|
|
299
335
|
def stop_codon(codon_position = 0)
|
300
336
|
self.translate(codon_position)
|
301
337
|
keys = []
|
302
|
-
self.aa_hash
|
303
|
-
|
338
|
+
aa_seqs = self.aa_hash
|
339
|
+
aa_seqs.uniq_hash.each do |seq,array_of_name|
|
340
|
+
keys += array_of_name if seq.include?('*')
|
304
341
|
end
|
305
342
|
seqhash1 = self.sub(keys)
|
306
343
|
seqhash1.title = self.title + "_stop"
|
307
|
-
keys2 =
|
344
|
+
keys2 = aa_seqs.keys - keys
|
308
345
|
seqhash2 = self.sub(keys2)
|
309
|
-
return
|
346
|
+
return {
|
347
|
+
with_stop_codon: seqhash1,
|
348
|
+
without_stop_codon: seqhash2
|
349
|
+
}
|
310
350
|
end #end of #stop_codon
|
311
351
|
|
312
352
|
|
@@ -362,10 +402,10 @@ module ViralSeq
|
|
362
402
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
363
403
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
364
404
|
# # b/c Poisson model does not do well on small sample size.
|
365
|
-
# @return [
|
366
|
-
#
|
367
|
-
#
|
368
|
-
#
|
405
|
+
# @return [Hash] three pairs.
|
406
|
+
# :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
|
407
|
+
# :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
|
408
|
+
# :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
369
409
|
# # sequence tag
|
370
410
|
# # G to A mutation numbers at potential a3g positions
|
371
411
|
# # total potential a3g G positions
|
@@ -376,17 +416,17 @@ module ViralSeq
|
|
376
416
|
# @example identify apobec3gf mutations from a sequence fasta file
|
377
417
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
378
418
|
# hypermut = my_seqhash.a3g
|
379
|
-
# hypermut[
|
419
|
+
# hypermut[:a3g_seq].dna_hash.keys
|
380
420
|
# => [">Seq7", ">Seq14"]
|
381
|
-
# hypermut[
|
421
|
+
# hypermut[:filtered_seq].dna_hash.keys
|
382
422
|
# => [">Seq1", ">Seq2", ">Seq5"]
|
383
|
-
# hypermut[
|
423
|
+
# hypermut[:stats]
|
384
424
|
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
385
425
|
#
|
386
426
|
# @example identify apobec3gf mutations from another sequence fasta file
|
387
427
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
388
428
|
# hypermut = my_seqhash.a3g
|
389
|
-
# hypermut[
|
429
|
+
# hypermut[:stats]
|
390
430
|
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
391
431
|
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
392
432
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
@@ -479,7 +519,10 @@ module ViralSeq
|
|
479
519
|
hm_seq_hash.title = self.title + "_hypermut"
|
480
520
|
hm_seq_hash.file = self.file
|
481
521
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
482
|
-
return
|
522
|
+
return { a3g_seq: hm_seq_hash,
|
523
|
+
filtered_seq: filtered_seq_hash,
|
524
|
+
stats: hm_hash.values
|
525
|
+
}
|
483
526
|
end #end of #a3g_hypermut
|
484
527
|
|
485
528
|
alias_method :a3g, :a3g_hypermut
|
@@ -693,6 +736,7 @@ module ViralSeq
|
|
693
736
|
|
694
737
|
seq_hash_unique.each do |seq|
|
695
738
|
loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
|
739
|
+
next unless loc # if locator tool fails, skip this seq.
|
696
740
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
697
741
|
if indel
|
698
742
|
seq_hash_unique_pass << seq
|
@@ -729,6 +773,8 @@ module ViralSeq
|
|
729
773
|
#
|
730
774
|
# containing_indel? (Boolean)
|
731
775
|
#
|
776
|
+
# direction ('forward' or 'reverse')
|
777
|
+
#
|
732
778
|
# aligned_input_sequence (String)
|
733
779
|
#
|
734
780
|
# aligned_reference_sequence (String)
|
@@ -742,9 +788,13 @@ module ViralSeq
|
|
742
788
|
|
743
789
|
uniq_dna.each do |seq,names|
|
744
790
|
s = ViralSeq::Sequence.new('',seq)
|
745
|
-
|
791
|
+
loc1 = s.locator(ref_option)
|
792
|
+
s.rc!
|
793
|
+
loc2 = s.locator(ref_option)
|
794
|
+
loc1[2] >= loc2[2] ? (direction = :+; loc = loc1): (direction = :-; loc = loc2)
|
795
|
+
|
746
796
|
names.each do |name|
|
747
|
-
out_array << ([title, name, ref_option.to_s] + loc)
|
797
|
+
out_array << ([title, name, ref_option.to_s, direction.to_s] + loc)
|
748
798
|
end
|
749
799
|
end
|
750
800
|
return out_array
|
@@ -865,11 +915,11 @@ module ViralSeq
|
|
865
915
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
|
866
916
|
# @example gap strip for an array of sequences
|
867
917
|
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
868
|
-
# array = { AACCGGTT
|
869
|
-
#
|
870
|
-
#
|
871
|
-
#
|
872
|
-
#
|
918
|
+
# array = %w{ AACCGGTT
|
919
|
+
# A-CCGGTT
|
920
|
+
# AAC-GGTT
|
921
|
+
# AACCG-TT
|
922
|
+
# AACCGGT- }
|
873
923
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
874
924
|
# puts my_seqhash.gap_strip.dna_hash.values
|
875
925
|
# ACGT
|
@@ -924,12 +974,11 @@ module ViralSeq
|
|
924
974
|
# @param (see #gap_strip)
|
925
975
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
|
926
976
|
# @example gap strip for an array of sequences only at the ends
|
927
|
-
# array =
|
928
|
-
#
|
929
|
-
#
|
930
|
-
#
|
931
|
-
#
|
932
|
-
# AACCGGT- }
|
977
|
+
# array = %w{ AACCGGTT
|
978
|
+
# A-CCGGTT
|
979
|
+
# AAC-GGTT
|
980
|
+
# AACCG-TT
|
981
|
+
# AACCGGT- }
|
933
982
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
934
983
|
# puts my_seqhash.gap_strip_ends.dna_hash.values
|
935
984
|
# AACCGGT
|
@@ -993,8 +1042,137 @@ module ViralSeq
|
|
993
1042
|
end
|
994
1043
|
|
995
1044
|
|
1045
|
+
# Build a mutated copy of the nt sequences using the given error rate.
# Every sequence is passed through String#mutation, and both the sequence names
# and the title get the suffix "_mut-<error_rate>". The receiver is not changed.
# @param error_rate [Float] error rate used to mutate sequences.
# @return [ViralSeq::SeqHash] new SeqHash object of mutated sequences.
def mutation(error_rate = 0.01)
  suffix = '_mut-' + error_rate.to_s
  mutated = dna_hash.each_with_object({}) do |(name, seq), acc|
    acc[name + suffix] = seq.mutation(error_rate)
  end

  result = ViralSeq::SeqHash.new
  result.dna_hash = mutated
  result.title = title + suffix
  result.file = file
  result
end
|
1060
|
+
|
1061
|
+
# Build a table of nucleotide frequencies at each position, relative to a reference.
# @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
# @param head [Boolean] if the head of table is included.
# @return [Array] a two-dimensional array of the frequency table,
#   each row has the following info:
#     position on the sequence (starting from 1)
#     reference (consensus) nucleotide
#     total sequence numbers
#     fraction of A, shows "-" if agrees with the reference, nil if A is not observed
#     fraction of C, shows "-" if agrees with the reference, nil if C is not observed
#     fraction of G, shows "-" if agrees with the reference, nil if G is not observed
#     fraction of T, shows "-" if agrees with the reference, nil if T is not observed
#
# @example error table for an array of sequences
#   array = %w{ AACCGGTT
#               AGCCGGTT
#               AACTGCTT
#               AACCGTTA
#               AACCGGTA }
#   my_seqhash = ViralSeq::SeqHash.array(array)
#   my_seqhash.error_table.each {|r| puts r.join(',')}
#     position,consensus,total_seq_number,A,C,G,T
#     1,A,5,-,,,
#     2,A,5,-,,0.2,
#     3,C,5,,-,,
#     4,C,5,,-,,0.2
#     5,G,5,,,-,
#     6,G,5,,0.2,-,0.2
#     7,T,5,,,,-
#     8,T,5,0.4,,,-
def error_table(ref = self.consensus, head = true)
  rows = []
  rows << %w[position consensus total_seq_number A C G T] if head

  total = size
  ref.size.times do |idx|
    ref_nt = ref[idx]
    # nucleotides observed at this position across all sequences
    column = dna_hash.values.map { |nt_seq| nt_seq[idx] }

    shown = {}
    column.count_freq.each do |nt, count|
      shown[nt] = nt == ref_nt ? '-' : count / total.to_f
    end

    rows << [idx + 1, ref_nt, total, *shown.values_at('A', 'C', 'G', 'T')]
  end

  rows
end # end of error_table
|
1133
|
+
|
1134
|
+
# Randomly select n sequences from the original SeqHash object.
# Names are drawn with Array#sample, so at most as many sequences as the object
# holds are returned. The title gets the suffix "_<n>" and the source file name
# is carried over.
# @param n [Integer] number of sequences to randomly select
# @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
def random_select(n = 100)
  new_sh = ViralSeq::SeqHash.new
  picked = dna_hash.keys.sample(n)

  picked.each do |k|
    new_sh.dna_hash[k] = dna_hash[k]
    # Copy aa/qc entries only when they exist, so the new object does not
    # end up with nil-valued placeholder entries.
    new_sh.aa_hash[k] = aa_hash[k] if aa_hash.key?(k)
    new_sh.qc_hash[k] = qc_hash[k] if qc_hash.key?(k)
  end

  new_sh.title = title + "_" + n.to_s
  new_sh.file = file
  new_sh
end
|
1154
|
+
|
1155
|
+
# Trim dna sequences based on the provided reference coordinates.
# Identical sequences are clipped once (via ViralSeq::Sequence#sequence_clip)
# and the clipped result is assigned to every name sharing that sequence.
# The returned object is a shallow copy (#dup) of the receiver with its nt
# sequence hash replaced by the trimmed sequences.
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
  clipped = {}
  dna_hash.dup.uniq_hash.each do |nt_seq, seq_names|
    clip = ViralSeq::Sequence.new('', nt_seq)
                             .sequence_clip(start_nt, end_nt, ref_option, path_to_muscle)
                             .dna
    seq_names.each { |seq_name| clipped[seq_name] = clip }
  end

  dup.tap { |copy| copy.dna_hash = clipped }
end
|
998
1176
|
|
999
1177
|
# start of private functions
|
1000
1178
|
private
|