viral_seq 1.0.4 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # TCS pipeline JSON params generator.
4
+
5
+ require 'viral_seq'
6
+ require 'colorize'
7
+ require 'json'
8
+
9
# Ask the user on STDIN which reference genome to use.
#
# Prints the menu, then keeps prompting until the entry converts (via
# String#to_i) to 1, 2 or 3.
# @return [Symbol] :HXB2, :NL43 or :MAC239
def get_ref
  references = { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }

  puts "Choose reference genome (1-3):"
  puts "1. HIV-1 HXB2".red.bold
  puts "2. HIV-1 NL4-3".blue.bold
  puts "3. SIV MAC239".magenta.bold
  print "> "

  # Keep the raw text so the error message can echo exactly what was typed;
  # conversion to Integer happens only where a number is needed.
  ref_option = gets.chomp.rstrip
  until references.key?(ref_option.to_i)
    print "Entered reference genome option #{ref_option.red.bold} not valid (choose 1-3), try again\n> "
    ref_option = gets.chomp.rstrip
  end

  references[ref_option.to_i]
end
29
+
30
# Interactive entry point: collects the TCS pipeline settings from STDIN,
# assembles them into the `param` Hash, prints the Hash as pretty JSON and
# optionally writes it to a file chosen by the user.

# Y/N answers are decided by their first non-blank character. An unanchored
# /y|yes/ or /n|no/ would match any answer that merely contains the letter
# (e.g. "no way" would count as yes).
said_yes = ->(answer) { !(answer =~ /\A\s*y/i).nil? }
said_no  = ->(answer) { !(answer =~ /\A\s*n/i).nil? }

puts "\n" + '-'*58
puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
puts '-'*58 + "\n"

param = {}

puts 'Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file'
print '> '
param[:raw_sequence_dir] = gets.chomp.rstrip

puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
print '> '
input_error = gets.chomp.rstrip.to_f
# String#to_f yields 0.0 for blank or non-numeric input, which selects the default.
param[:platform_error_rate] = input_error == 0.0 ? 0.02 : input_error

param[:primer_pairs] = []

# One iteration per sequenced region; the user decides at the end of each
# iteration whether to describe another one.
loop do
  data = {}
  puts "Enter the name for the sequenced region: "
  print '> '
  data[:region] = gets.chomp.rstrip

  puts "Enter the #{"cDNA".red.bold} primer sequence: "
  print '> '
  data[:cdna] = gets.chomp.rstrip

  puts "Enter the #{"forward".blue.bold} primer sequence: "
  print '> '
  data[:forward] = gets.chomp.rstrip

  puts "Enter supermajority cut-off (0.5 - 0.9). Default: " + "0.5".blue.bold + " (simple majority)"
  print '> '
  mj = gets.chomp.rstrip.to_f
  # Anything outside 0.5..0.9 (including blank input, which converts to 0.0) falls back to 0.5.
  data[:majority] = (0.5..0.9).include?(mj) ? mj : 0.5

  print "Need end-join? Y/N \n> "
  ej = gets.chomp.rstrip
  if said_yes.call(ej)
    data[:end_join] = true

    print "End-join option? Choose from (1-4):\n\n" \
          "1: simple join, no overlap\n" \
          "2: known overlap \n\n" \
          "3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n\n" \
          "4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n\n" \
          "> "
    # Keep the raw String between retries: the error message colorizes it, and
    # the colorize helpers are String methods (an Integer here would raise
    # NoMethodError on the second invalid entry).
    ej_option = gets.chomp.rstrip
    until [1, 2, 3, 4].include?(ej_option.to_i)
      puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
      ej_option = gets.chomp.rstrip
    end

    case ej_option.to_i
    when 1
      data[:end_join_option] = 1
      data[:overlap] = 0
    when 2
      # Menu choice 2 is stored as end_join_option 1 plus the user-supplied overlap.
      # NOTE(review): presumably intentional; confirm against the TCS pipeline.
      data[:end_join_option] = 1
      print "overlap bases: \n> "
      data[:overlap] = gets.chomp.rstrip.to_i
    when 3
      data[:end_join_option] = 3
    when 4
      data[:end_join_option] = 4
    end

    # QC and trimming are only offered when end-join was requested.
    print "Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "
    qc = gets.chomp.rstrip
    if said_yes.call(qc)
      data[:TCS_QC] = true

      data[:ref_genome] = get_ref

      # NOTE(review): the prompts mention a position range, but String#to_i
      # keeps only the leading integer of the answer.
      print "reference 5'end ref position or posiiton range, 0 if no need to match this end \n> "
      data[:ref_start] = gets.chomp.rstrip.to_i

      print "reference 3'end ref position or posiiton range: 0 if no need to match this end \n> "
      data[:ref_end] = gets.chomp.rstrip.to_i

      print "allow indels? (default as yes) Y/N \n> "
      indel = gets.chomp.rstrip
      # Indels stay allowed unless the answer starts with "n".
      data[:indel] = !said_no.call(indel)
    else
      data[:TCS_QC] = false
    end

    print "Need trimming to a reference genome? Y/N \n> "
    trim_option = gets.chomp.rstrip
    if said_yes.call(trim_option)
      data[:trim] = true
      data[:trim_ref] = get_ref

      print "reference 5'end ref position \n> "
      data[:trim_ref_start] = gets.chomp.rstrip.to_i

      print "reference 3'end ref position \n> "
      data[:trim_ref_end] = gets.chomp.rstrip.to_i
    else
      data[:trim] = false
    end
  else
    data[:end_join] = false
  end

  param[:primer_pairs] << data
  print "Do you wish to conintue? Y/N \n> "
  continue_sig = gets.chomp.rstrip
  break unless said_yes.call(continue_sig)
end

puts "\nYour JSON string is:"
puts JSON.pretty_generate(param)

print "\nDo you wish to save it as a file? Y/N \n> "
save_option = gets.chomp.rstrip

if said_yes.call(save_option)
  print "Path to save JSON file:\n> "
  path = gets.chomp.rstrip
  File.open(path, 'w') { |f| f.puts JSON.pretty_generate(param) }
end
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2019 Shuntai Zhou (shuntai.zhou@gmail.com)
1
+ # Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
2
2
  #
3
3
  # Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  # of this software and associated documentation files (the "Software"), to deal
@@ -1,4 +1,4 @@
1
- # addition methods for Class::Hash required for ViralSeq
1
+ # additional methods for Class::Hash required for ViralSeq
2
2
 
3
3
  class Hash
4
4
 
@@ -5,6 +5,8 @@ module ViralSeq
5
5
  # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
6
  # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
7
7
  # PR codon 1-99
8
+ # RT codon 34-122 (HXB2 2650-2914) and 152-236(3001-3257)
9
+ # IN codon 53-174 (HXB2 4384-4751)
8
10
  # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
9
11
  # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
10
12
  # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
@@ -39,8 +39,8 @@ module ViralSeq
39
39
 
40
40
  def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
41
  temp_dir = Dir.home
42
- temp_file = temp_dir + "/_temp_muscle_in"
43
- temp_aln = temp_dir + "/_temp_muscle_aln"
42
+ temp_file = File.join(temp_dir, "_temp_muscle_in")
43
+ temp_aln = File.join(temp_dir, "_temp_muscle_aln")
44
44
  name = ">test"
45
45
  temp_in = File.open(temp_file,"w")
46
46
  temp_in.puts ">ref"
@@ -130,8 +130,8 @@ module ViralSeq
130
130
  end
131
131
  end
132
132
  end
133
- sequence_hash = Hash[*sequence_a]
134
- quality_hash = Hash[*quality_a]
133
+ sequence_hash = Hash[sequence_a.each_slice(2).to_a]
134
+ quality_hash = Hash[quality_a.each_slice(2).to_a]
135
135
 
136
136
  seq_hash = ViralSeq::SeqHash.new
137
137
  seq_hash.dna_hash = sequence_hash
@@ -166,6 +166,40 @@ module ViralSeq
166
166
  alias_method :array, :new_from_array
167
167
  end
168
168
 
169
# Number of nucleotide sequences held by this SeqHash object.
# @return [Integer] count of entries in the nt sequence hash (dna_hash)

def size
  dna_hash.length
end
175
+
176
# Combine this SeqHash with another one into a new SeqHash object.
# The dna, aa and qc hashes are merged with Hash#merge, so for a sequence
# name present in both objects the entry from sh2 is kept. The new title is
# the two titles joined by "_with_"; the new file is the two file values
# joined by ",".
# @param sh2 [ViralSeq::SeqHash] the SeqHash to combine with
# @return [ViralSeq::SeqHash] combined SeqHash

def +(sh2)
  ViralSeq::SeqHash.new.tap do |combined|
    combined.dna_hash = dna_hash.merge(sh2.dna_hash)
    combined.aa_hash = aa_hash.merge(sh2.aa_hash)
    combined.qc_hash = qc_hash.merge(sh2.qc_hash)
    combined.title = title + '_with_' + sh2.title
    combined.file = file + ',' + sh2.file
  end
end
189
+
190
# Write the nt sequences to a file in FASTA format. For every entry of
# dna_hash the key (sequence name) is written on one line and the sequence
# on the next. An existing file at the path is overwritten (mode 'w').
# @param file [String] path to the FASTA output file
# @return [NilClass]

def write_nt_fa(file)
  File.open(file, 'w') do |f|
    dna_hash.each { |name, seq| f.puts name, seq }
  end
  # File.open returns the block value (dna_hash here); return nil explicitly
  # so the method honours its documented return type.
  nil
end
202
+
169
203
  # generate sequences in relaxed sequential phylip format from a ViralSeq::SeqHash object
170
204
  # @return [String] relaxed sequential phylip format in a String object
171
205
  # @example convert fasta format to relaxed sequential phylip format
@@ -215,10 +249,12 @@ module ViralSeq
215
249
  def translate(codon_position = 0)
216
250
  seqs = self.dna_hash
217
251
  @aa_hash = {}
218
- seqs.each do |name, seq|
219
- s = ViralSeq::Sequence.new(name, seq)
252
+ seqs.uniq_hash.each do |seq, array_of_name|
253
+ s = ViralSeq::Sequence.new('name', seq)
220
254
  s.translate(codon_position)
221
- @aa_hash[name] = s.aa_string
255
+ array_of_name.each do |name|
256
+ @aa_hash[name] = s.aa_string
257
+ end
222
258
  end
223
259
  return nil
224
260
  end # end of #translate
@@ -277,36 +313,40 @@ module ViralSeq
277
313
 
278
314
  # screen for sequences with stop codons.
279
315
  # @param (see #translate)
280
- # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
316
+ # @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
281
317
  #
282
- # # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
283
- # # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
318
+ # # :with_stop_codon : ViralSeq::SeqHash object with stop codons
319
+ # # :without_stop_codon: ViralSeq::SeqHash object without stop codons
284
320
  # @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
285
321
  # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
286
322
  # my_seqhash.dna_hash
287
323
  # => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
288
- # stop_codon_seqhash = my_seqhash.stop_codon[0]
324
+ # stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
289
325
  # stop_codon_seqhash.dna_hash
290
326
  # => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
291
327
  # stop_codon_seqhash.aa_hash
292
328
  # => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
293
329
  # stop_codon_seqhash.title
294
330
  # => "my_fasta_file_stop"
295
- # filtered_seqhash = my_seqhash.stop_codon[1]
331
+ # filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
296
332
  # filtered_seqhash.aa_hash
297
333
  # {">seq1"=>"IRT", ">seq3"=>"MRT"}
298
334
 
299
335
  def stop_codon(codon_position = 0)
300
336
  self.translate(codon_position)
301
337
  keys = []
302
- self.aa_hash.each do |k,v|
303
- keys << k if v.include?('*')
338
+ aa_seqs = self.aa_hash
339
+ aa_seqs.uniq_hash.each do |seq,array_of_name|
340
+ keys += array_of_name if seq.include?('*')
304
341
  end
305
342
  seqhash1 = self.sub(keys)
306
343
  seqhash1.title = self.title + "_stop"
307
- keys2 = self.aa_hash.keys - keys
344
+ keys2 = aa_seqs.keys - keys
308
345
  seqhash2 = self.sub(keys2)
309
- return [seqhash1, seqhash2]
346
+ return {
347
+ with_stop_codon: seqhash1,
348
+ without_stop_codon: seqhash2
349
+ }
310
350
  end #end of #stop_codon
311
351
 
312
352
 
@@ -362,10 +402,10 @@ module ViralSeq
362
402
  # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
363
403
  # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
364
404
  # # b/c Poisson model does not do well on small sample size.
365
- # @return [Array] three values.
366
- # first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
367
- # second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
368
- # third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
405
+ # @return [Hash] three pairs.
406
+ # :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
407
+ # :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
408
+ # :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
369
409
  # # sequence tag
370
410
  # # G to A mutation numbers at potential a3g positions
371
411
  # # total potential a3g G positions
@@ -376,17 +416,17 @@ module ViralSeq
376
416
  # @example identify apobec3gf mutations from a sequence fasta file
377
417
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
378
418
  # hypermut = my_seqhash.a3g
379
- # hypermut[0].dna_hash.keys
419
+ # hypermut[:a3g_seq].dna_hash.keys
380
420
  # => [">Seq7", ">Seq14"]
381
- # hypermut[1].dna_hash.keys
421
+ # hypermut[:filtered_seq].dna_hash.keys
382
422
  # => [">Seq1", ">Seq2", ">Seq5"]
383
- # hypermut[2]
423
+ # hypermut[:stats]
384
424
  # => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
385
425
  #
386
426
  # @example identify apobec3gf mutations from another sequence fasta file
387
427
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
388
428
  # hypermut = my_seqhash.a3g
389
- # hypermut[2]
429
+ # hypermut[:stats]
390
430
  # => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
391
431
  # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
392
432
  # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
@@ -479,7 +519,10 @@ module ViralSeq
479
519
  hm_seq_hash.title = self.title + "_hypermut"
480
520
  hm_seq_hash.file = self.file
481
521
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
482
- return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
522
+ return { a3g_seq: hm_seq_hash,
523
+ filtered_seq: filtered_seq_hash,
524
+ stats: hm_hash.values
525
+ }
483
526
  end #end of #a3g_hypermut
484
527
 
485
528
  alias_method :a3g, :a3g_hypermut
@@ -693,6 +736,7 @@ module ViralSeq
693
736
 
694
737
  seq_hash_unique.each do |seq|
695
738
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
739
+ next unless loc # if locator tool fails, skip this seq.
696
740
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
697
741
  if indel
698
742
  seq_hash_unique_pass << seq
@@ -729,6 +773,8 @@ module ViralSeq
729
773
  #
730
774
  # containing_indel? (Boolean)
731
775
  #
776
+ # direction ('forward' or 'reverse')
777
+ #
732
778
  # aligned_input_sequence (String)
733
779
  #
734
780
  # aligned_reference_sequence (String)
@@ -742,9 +788,13 @@ module ViralSeq
742
788
 
743
789
  uniq_dna.each do |seq,names|
744
790
  s = ViralSeq::Sequence.new('',seq)
745
- loc = s.locator(ref_option)
791
+ loc1 = s.locator(ref_option)
792
+ s.rc!
793
+ loc2 = s.locator(ref_option)
794
+ loc1[2] >= loc2[2] ? (direction = :+; loc = loc1): (direction = :-; loc = loc2)
795
+
746
796
  names.each do |name|
747
- out_array << ([title, name, ref_option.to_s] + loc)
797
+ out_array << ([title, name, ref_option.to_s, direction.to_s] + loc)
748
798
  end
749
799
  end
750
800
  return out_array
@@ -865,11 +915,11 @@ module ViralSeq
865
915
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
866
916
  # @example gap strip for an array of sequences
867
917
  # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
868
- # array = { AACCGGTT
869
- # A-CCGGTT
870
- # AAC-GGTT
871
- # AACCG-TT
872
- # AACCGGT- }
918
+ # array = %w{ AACCGGTT
919
+ # A-CCGGTT
920
+ # AAC-GGTT
921
+ # AACCG-TT
922
+ # AACCGGT- }
873
923
  # my_seqhash = ViralSeq::SeqHash.array(array)
874
924
  # puts my_seqhash.gap_strip.dna_hash.values
875
925
  # ACGT
@@ -924,12 +974,11 @@ module ViralSeq
924
974
  # @param (see #gap_strip)
925
975
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
926
976
  # @example gap strip for an array of sequences only at the ends
927
- # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
928
- # array = { AACCGGTT
929
- # A-CCGGTT
930
- # AAC-GGTT
931
- # AACCG-TT
932
- # AACCGGT- }
977
+ # array = %w{ AACCGGTT
978
+ # A-CCGGTT
979
+ # AAC-GGTT
980
+ # AACCG-TT
981
+ # AACCGGT- }
933
982
  # my_seqhash = ViralSeq::SeqHash.array(array)
934
983
  # puts my_seqhash.gap_strip_ends.dna_hash.values
935
984
  # AACCGGT
@@ -993,8 +1042,137 @@ module ViralSeq
993
1042
  end
994
1043
 
995
1044
 
1045
# Produce a mutated copy of the nt sequences. Each sequence is passed through
# its own #mutation(error_rate) method, and "_mut-<error_rate>" is appended to
# every sequence name and to the title. The file attribute is carried over.
# @param error_rate [Float] error rate used to mutate sequences.
# @return [ViralSeq::SeqHash] new SeqHash object of mutated sequences.

def mutation(error_rate = 0.01)
  suffix = '_mut-' + error_rate.to_s
  mutated = ViralSeq::SeqHash.new
  mutated.dna_hash = dna_hash.each_with_object({}) do |(name, seq), acc|
    acc[name + suffix] = seq.mutation(error_rate)
  end
  mutated.title = title + suffix
  mutated.file = file
  mutated
end
1060
+
1061
# Build a table of nucleotide frequencies at each position of a reference.
# @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
# @param head [Boolean] whether a header row is placed at the top of the table
# @return [Array] a two-dimensional array, one row per reference position, holding:
#   position on the sequence (starting from 1)
#   reference (consensus) nucleotide at that position
#   total sequence number
#   one cell each for A, C, G and T: "-" if the base equals the reference base,
#   otherwise the fraction of sequences carrying that base
#   (nil, printed as empty, when the base has no entry in the position's counts)
#
# @example error table for an array of sequences
#   array = %w{ AACCGGTT
#               AGCCGGTT
#               AACTGCTT
#               AACCGTTA
#               AACCGGTA }
#   my_seqhash = ViralSeq::SeqHash.array(array)
#   my_seqhash.error_table.each {|r| puts r.join(',')}
#   position,consensus,total_seq_number,A,C,G,T
#   1,A,5,-,,,
#   2,A,5,-,,0.2,
#   3,C,5,,-,,
#   4,C,5,,-,,0.2
#   5,G,5,,,-,
#   6,G,5,,0.2,-,0.2
#   7,T,5,,,,-
#   8,T,5,0.4,,,-

def error_table(ref = self.consensus, head = true)
  table = []
  table << %w[position consensus total_seq_number A C G T] if head

  total = size
  ref.size.times do |idx|
    ref_base = ref[idx]
    # every sequence's base at this position
    column = dna_hash.values.map { |seq| seq[idx] }

    cells = column.count_freq.each_with_object({}) do |(nt, count), acc|
      acc[nt] = nt == ref_base ? '-' : count / total.to_f
    end

    table << [idx + 1, ref_base, total, *cells.values_at('A', 'C', 'G', 'T')]
  end

  table
end # end of error_table
1133
+
1134
# Randomly select n sequences from the original SeqHash object.
# Names are drawn with Array#sample, so at most dna_hash.size sequences are
# returned. The aa and qc entries are copied only for names that actually
# have one, so a SeqHash without translation or quality data does not end up
# with nil-valued entries. The new title is the original title plus "_<n>".
# @param n [Integer] number of sequences to randomly select
# @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences

def random_select(n = 100)
  new_sh = ViralSeq::SeqHash.new

  dna_hash.keys.sample(n).each do |name|
    new_sh.dna_hash[name] = dna_hash[name]
    new_sh.aa_hash[name] = aa_hash[name] if aa_hash.key?(name)
    new_sh.qc_hash[name] = qc_hash[name] if qc_hash.key?(name)
  end

  new_sh.title = title + "_" + n.to_s
  new_sh
end
1154
+
1155
# Trim the nt sequences to the given reference coordinates. Each distinct
# sequence is clipped once (ViralSeq::Sequence#sequence_clip) and the clipped
# sequence is stored under every name sharing it. The result is a #dup of
# this object whose dna_hash is replaced by the trimmed sequences.
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences

def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
  trimmed = {}
  dna_hash.dup.uniq_hash.each do |nt_seq, seq_names|
    clipped = ViralSeq::Sequence.new('', nt_seq)
                                .sequence_clip(start_nt, end_nt, ref_option, path_to_muscle)
                                .dna
    seq_names.each { |seq_name| trimmed[seq_name] = clipped }
  end

  self.dup.tap { |copy| copy.dna_hash = trimmed }
end
998
1176
 
999
1177
  # start of private functions
1000
1178
  private