viral_seq 1.0.5 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2019 Shuntai Zhou (shuntai.zhou@gmail.com)
1
+ # Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
2
2
  #
3
3
  # Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  # of this software and associated documentation files (the "Software"), to deal
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
35
35
  require_relative "viral_seq/sequence"
36
36
  require_relative "viral_seq/string"
37
37
  require_relative "viral_seq/version"
38
+ require_relative "viral_seq/tcs_core"
39
+ require_relative "viral_seq/tcs_json"
40
+
38
41
 
39
42
  require "muscle_bio"
@@ -1,4 +1,4 @@
1
- # addition methods for Class::Hash required for ViralSeq
1
+ # additional methods for Class::Hash required for ViralSeq
2
2
 
3
3
  class Hash
4
4
 
@@ -5,6 +5,8 @@ module ViralSeq
5
5
  # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
6
  # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
7
7
  # PR codon 1-99
8
+ # RT codon 34-122 (HXB2 2650-2914) and 152-236(3001-3257)
9
+ # IN codon 53-174 (HXB2 4384-4751)
8
10
  # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
9
11
  # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
10
12
  # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
@@ -39,8 +39,8 @@ module ViralSeq
39
39
 
40
40
  def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
41
  temp_dir = Dir.home
42
- temp_file = temp_dir + "/_temp_muscle_in"
43
- temp_aln = temp_dir + "/_temp_muscle_aln"
42
+ temp_file = File.join(temp_dir, "_temp_muscle_in")
43
+ temp_aln = File.join(temp_dir, "_temp_muscle_aln")
44
44
  name = ">test"
45
45
  temp_in = File.open(temp_file,"w")
46
46
  temp_in.puts ">ref"
@@ -9,7 +9,7 @@ module ViralSeq
9
9
  # # align with MUSCLE
10
10
  # filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
11
11
  # # filter nt sequences with the reference coordinates
12
- # filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
12
+ # filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
13
13
  # # return a new ViralSeq::SeqHash object without stop codons
14
14
  # filtered_seqhash = filtered_seqhash.a3g[1]
15
15
  # # further filter out sequences with A3G hypermutations
@@ -130,8 +130,8 @@ module ViralSeq
130
130
  end
131
131
  end
132
132
  end
133
- sequence_hash = Hash[*sequence_a]
134
- quality_hash = Hash[*quality_a]
133
+ sequence_hash = Hash[sequence_a.each_slice(2).to_a]
134
+ quality_hash = Hash[quality_a.each_slice(2).to_a]
135
135
 
136
136
  seq_hash = ViralSeq::SeqHash.new
137
137
  seq_hash.dna_hash = sequence_hash
@@ -166,6 +166,40 @@ module ViralSeq
166
166
  alias_method :array, :new_from_array
167
167
  end
168
168
 
169
+ # the size of nt sequence hash of the SeqHash object
170
+ # @return [Integer] size of nt sequence hash of the SeqHash object
171
+
172
+ def size
173
+ self.dna_hash.size
174
+ end
175
+
176
+ # combine SeqHash objects
177
+ # @param sh2 [ViralSeq::SeqHash] another SeqHash
178
+ # @return [ViralSeq::SeqHash] combined SeqHash
179
+
180
+ def +(sh2)
181
+ new_seqhash = ViralSeq::SeqHash.new
182
+ new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
183
+ new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
184
+ new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
185
+ new_seqhash.title = self.title + "_with_" + sh2.title
186
+ new_seqhash.file = self.file + "," + sh2.file
187
+ return new_seqhash
188
+ end
189
+
190
+ # write the nt sequences to a FASTA format file
191
+ # @param file [String] path to the FASTA output file
192
+ # @return [NilClass]
193
+
194
+ def write_nt_fa(file)
195
+ File.open(file, 'w') do |f|
196
+ self.dna_hash.each do |k,v|
197
+ f.puts k
198
+ f.puts v
199
+ end
200
+ end
201
+ end
202
+
169
203
  # generate sequences in relaxed sequential phylip format from a ViralSeq::SeqHash object
170
204
  # @return [String] relaxed sequential phylip format in a String object
171
205
  # @example convert fasta format to relaxed sequential phylip format
@@ -215,10 +249,12 @@ module ViralSeq
215
249
  def translate(codon_position = 0)
216
250
  seqs = self.dna_hash
217
251
  @aa_hash = {}
218
- seqs.each do |name, seq|
219
- s = ViralSeq::Sequence.new(name, seq)
252
+ seqs.uniq_hash.each do |seq, array_of_name|
253
+ s = ViralSeq::Sequence.new('name', seq)
220
254
  s.translate(codon_position)
221
- @aa_hash[name] = s.aa_string
255
+ array_of_name.each do |name|
256
+ @aa_hash[name] = s.aa_string
257
+ end
222
258
  end
223
259
  return nil
224
260
  end # end of #translate
@@ -277,41 +313,45 @@ module ViralSeq
277
313
 
278
314
  # screen for sequences with stop codons.
279
315
  # @param (see #translate)
280
- # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
316
+ # @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
281
317
  #
282
- # # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
283
- # # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
318
+ # # :with_stop_codon : ViralSeq::SeqHash object with stop codons
319
+ # # :without_stop_codon: ViralSeq::SeqHash object without stop codons
284
320
  # @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
285
321
  # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
286
322
  # my_seqhash.dna_hash
287
323
  # => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
288
- # stop_codon_seqhash = my_seqhash.stop_codon[0]
324
+ # stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
289
325
  # stop_codon_seqhash.dna_hash
290
326
  # => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
291
327
  # stop_codon_seqhash.aa_hash
292
328
  # => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
293
329
  # stop_codon_seqhash.title
294
330
  # => "my_fasta_file_stop"
295
- # filtered_seqhash = my_seqhash.stop_codon[1]
331
+ # filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
296
332
  # filtered_seqhash.aa_hash
297
333
  # {">seq1"=>"IRT", ">seq3"=>"MRT"}
298
334
 
299
335
  def stop_codon(codon_position = 0)
300
336
  self.translate(codon_position)
301
337
  keys = []
302
- self.aa_hash.each do |k,v|
303
- keys << k if v.include?('*')
338
+ aa_seqs = self.aa_hash
339
+ aa_seqs.uniq_hash.each do |seq,array_of_name|
340
+ keys += array_of_name if seq.include?('*')
304
341
  end
305
342
  seqhash1 = self.sub(keys)
306
343
  seqhash1.title = self.title + "_stop"
307
- keys2 = self.aa_hash.keys - keys
344
+ keys2 = aa_seqs.keys - keys
308
345
  seqhash2 = self.sub(keys2)
309
- return [seqhash1, seqhash2]
346
+ return {
347
+ with_stop_codon: seqhash1,
348
+ without_stop_codon: seqhash2
349
+ }
310
350
  end #end of #stop_codon
311
351
 
312
352
 
313
353
  # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
314
- # @param cutoff [Float] majority cut-off for calling consensus bases. defult at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
354
+ # @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will apply the simple majority rule (no cutoff)
315
355
  # @return [String] consensus sequence
316
356
  # @example consensus sequence from an array of sequences.
317
357
  # seq_array = %w{ ATTTTTTTTT
@@ -343,11 +383,18 @@ module ViralSeq
343
383
  base_count = all_base.count_freq
344
384
  max_base_list = []
345
385
 
346
- base_count.each do |k,v|
347
- if v/seq_size.to_f >= cutoff
348
- max_base_list << k
386
+ if cutoff.zero?
387
+ max_count = base_count.values.max
388
+ max_base_hash = base_count.select {|_k,v| v == max_count}
389
+ max_base_list = max_base_hash.keys
390
+ else
391
+ base_count.each do |k,v|
392
+ if v/seq_size.to_f >= cutoff
393
+ max_base_list << k
394
+ end
349
395
  end
350
396
  end
397
+
351
398
  consensus_seq += call_consensus_base(max_base_list)
352
399
  end
353
400
  return consensus_seq
@@ -358,14 +405,14 @@ module ViralSeq
358
405
  # # control pattern: G[YN|RC] -> A[YN|RC]
359
406
  # # use the sample consensus to determine potential a3g sites
360
407
  # # Two criteria to identify hypermutation
361
- # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
408
+ # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
362
409
  # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
363
410
  # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
364
411
  # # b/c Poisson model does not do well on small sample size.
365
- # @return [Array] three values.
366
- # first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
367
- # second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
368
- # third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
412
+ # @return [Hash] three pairs.
413
+ # :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
414
+ # :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
415
+ # :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
369
416
  # # sequence tag
370
417
  # # G to A mutation numbers at potential a3g positions
371
418
  # # total potential a3g G positions
@@ -376,17 +423,17 @@ module ViralSeq
376
423
  # @example identify apobec3gf mutations from a sequence fasta file
377
424
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
378
425
  # hypermut = my_seqhash.a3g
379
- # hypermut[0].dna_hash.keys
426
+ # hypermut[:a3g_seq].dna_hash.keys
380
427
  # => [">Seq7", ">Seq14"]
381
- # hypermut[1].dna_hash.keys
428
+ # hypermut[:filtered_seq].dna_hash.keys
382
429
  # => [">Seq1", ">Seq2", ">Seq5"]
383
- # hypermut[2]
430
+ # hypermut[:stats]
384
431
  # => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
385
432
  #
386
433
  # @example identify apobec3gf mutations from another sequence fasta file
387
434
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
388
435
  # hypermut = my_seqhash.a3g
389
- # hypermut[2]
436
+ # hypermut[:stats]
390
437
  # => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
391
438
  # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
392
439
  # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
@@ -479,7 +526,10 @@ module ViralSeq
479
526
  hm_seq_hash.title = self.title + "_hypermut"
480
527
  hm_seq_hash.file = self.file
481
528
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
482
- return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
529
+ return { a3g_seq: hm_seq_hash,
530
+ filtered_seq: filtered_seq_hash,
531
+ stats: hm_hash.values
532
+ }
483
533
  end #end of #a3g_hypermut
484
534
 
485
535
  alias_method :a3g, :a3g_hypermut
@@ -693,6 +743,7 @@ module ViralSeq
693
743
 
694
744
  seq_hash_unique.each do |seq|
695
745
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
746
+ next unless loc # if locator tool fails, skip this seq.
696
747
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
697
748
  if indel
698
749
  seq_hash_unique_pass << seq
@@ -748,7 +799,7 @@ module ViralSeq
748
799
  s.rc!
749
800
  loc2 = s.locator(ref_option)
750
801
  loc1[2] >= loc2[2] ? (direction = :+; loc = loc1): (direction = :-; loc = loc2)
751
-
802
+
752
803
  names.each do |name|
753
804
  out_array << ([title, name, ref_option.to_s, direction.to_s] + loc)
754
805
  end
@@ -871,11 +922,11 @@ module ViralSeq
871
922
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
872
923
  # @example gap strip for an array of sequences
873
924
  # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
874
- # array = { AACCGGTT
875
- # A-CCGGTT
876
- # AAC-GGTT
877
- # AACCG-TT
878
- # AACCGGT- }
925
+ # array = %w{ AACCGGTT
926
+ # A-CCGGTT
927
+ # AAC-GGTT
928
+ # AACCG-TT
929
+ # AACCGGT- }
879
930
  # my_seqhash = ViralSeq::SeqHash.array(array)
880
931
  # puts my_seqhash.gap_strip.dna_hash.values
881
932
  # ACGT
@@ -930,12 +981,11 @@ module ViralSeq
930
981
  # @param (see #gap_strip)
931
982
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
932
983
  # @example gap strip for an array of sequences only at the ends
933
- # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
934
- # array = { AACCGGTT
935
- # A-CCGGTT
936
- # AAC-GGTT
937
- # AACCG-TT
938
- # AACCGGT- }
984
+ # array = %w{ AACCGGTT
985
+ # A-CCGGTT
986
+ # AAC-GGTT
987
+ # AACCG-TT
988
+ # AACCGGT- }
939
989
  # my_seqhash = ViralSeq::SeqHash.array(array)
940
990
  # puts my_seqhash.gap_strip_ends.dna_hash.values
941
991
  # AACCGGT
@@ -999,8 +1049,137 @@ module ViralSeq
999
1049
  end
1000
1050
 
1001
1051
 
1052
+ # mutate @dna_hash based on the error_rate
1053
+ # @param error_rate [Float] error rate used to mutate sequences.
1054
+ # @return [ViralSeq::SeqHash] new SeqHash object of mutated sequences.
1055
+
1056
+ def mutation(error_rate = 0.01)
1057
+ new_seqhash = ViralSeq::SeqHash.new
1058
+ dna = {}
1059
+ self.dna_hash.each do |name, seq|
1060
+ dna[name + '_mut-' + error_rate.to_s] = seq.mutation(error_rate)
1061
+ end
1062
+ new_seqhash.dna_hash = dna
1063
+ new_seqhash.title = self.title + "_mut-" + error_rate.to_s
1064
+ new_seqhash.file = self.file
1065
+ return new_seqhash
1066
+ end
1067
+
1068
+ # return a table of frequencies of nucleotides at each position.
1069
+ # @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
1070
+ # @param head [Boolean] if the head of table is included.
1071
+ # @return [Array] a two-dimension array of the frequency table,
1072
+ # including the following info:
1073
+ # position on the sequence (starting from 1)
1074
+ # consensus nucleotide
1075
+ # total sequence numbers
1076
+ # percentage of A, shows "-" if agrees with consensus
1077
+ # percentage of C, shows "-" if agrees with consensus
1078
+ # percentage of G, shows "-" if agrees with consensus
1079
+ # percentage of T, shows "-" if agrees with consensus
1080
+ #
1081
+ # @example error table for an array of sequences
1082
+ # array = %w{ AACCGGTT
1083
+ # AGCCGGTT
1084
+ # AACTGCTT
1085
+ # AACCGTTA
1086
+ # AACCGGTA }
1087
+ # my_seqhash = ViralSeq::SeqHash.array(array)
1088
+ # my_seqhash.error_table.each {|r| puts r.join(',')}
1089
+ # position,consensus,total_seq_number,A,C,G,T
1090
+ # 1,A,5,-,,,
1091
+ # 2,A,5,-,,0.2,
1092
+ # 3,C,5,,-,,
1093
+ # 4,C,5,,-,,0.2
1094
+ # 5,G,5,,,-,
1095
+ # 6,G,5,,0.2,-,0.2
1096
+ # 7,T,5,,,,-
1097
+ # 8,T,5,0.4,,,-
1098
+
1099
+ def error_table(ref = self.consensus, head = true)
1100
+
1101
+ table = []
1102
+ if head
1103
+ table << %w{
1104
+ position
1105
+ consensus
1106
+ total_seq_number
1107
+ A
1108
+ C
1109
+ G
1110
+ T
1111
+ }
1112
+ end
1113
+ ref_size = ref.size
1114
+
1115
+ (0..(ref_size - 1)).each do |position|
1116
+ ref_base = ref[position]
1117
+ nts = []
1118
+
1119
+ self.dna_hash.each do |_k,v|
1120
+ nts << v[position]
1121
+ end
1122
+
1123
+ freq = nts.count_freq
1124
+ freq2 = {}
1125
+
1126
+ freq.each do |nt,c|
1127
+ if nt == ref_base
1128
+ freq2[nt] = '-'
1129
+ else
1130
+ freq2[nt] = (c/(self.size).to_f)
1131
+ end
1132
+ end
1133
+
1134
+ table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
1135
+ end
1136
+
1137
+ return table
1138
+
1139
+ end # end of error_table
1140
+
1141
+ # randomly select n number of sequences from the original SeqHash object
1142
+ # @param n [Integer] number of sequences to randomly select
1143
+ # @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
1144
+
1145
+ def random_select(n = 100)
1146
+ new_sh = ViralSeq::SeqHash.new
1147
+ dna_hash = self.dna_hash
1148
+ aa_hash = self.aa_hash
1149
+ qc_hash = self.qc_hash
1150
+
1151
+ keys = dna_hash.keys.sample(n)
1002
1152
 
1153
+ keys.each do |k|
1154
+ new_sh.dna_hash[k] = dna_hash[k]
1155
+ new_sh.aa_hash[k] = aa_hash[k]
1156
+ new_sh.qc_hash[k] = qc_hash[k]
1157
+ end
1158
+ new_sh.title = self.title + "_" + n.to_s
1159
+ return new_sh
1160
+ end
1003
1161
 
1162
+ # trim dna sequences based on the provided reference coordinates.
1163
+ # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1164
+ # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1165
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
1166
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
1167
+ # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
1168
+
1169
+ def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
1170
+ seq_hash = self.dna_hash.dup
1171
+ seq_hash_unique = seq_hash.uniq_hash
1172
+ trimmed_seq_hash = {}
1173
+ seq_hash_unique.each do |seq, names|
1174
+ trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
1175
+ names.each do |name|
1176
+ trimmed_seq_hash[name] = trimmed_seq
1177
+ end
1178
+ end
1179
+ return_seq_hash = self.dup
1180
+ return_seq_hash.dna_hash = trimmed_seq_hash
1181
+ return return_seq_hash
1182
+ end
1004
1183
 
1005
1184
  # start of private functions
1006
1185
  private
@@ -7,7 +7,7 @@ module ViralSeq
7
7
  # @example join the paired-end sequences with an overlap of 100 bp
8
8
  # my_seqhashpair.join1(100)
9
9
  # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
10
- # my_seqhashpair.join1(:indiv)
10
+ # my_seqhashpair.join2(model: :indiv)
11
11
 
12
12
  class SeqHashPair
13
13
 
@@ -80,6 +80,12 @@ module ViralSeq
80
80
  alias_method :fa, :new_from_fasta
81
81
  end
82
82
 
83
+ # the size of nt sequence hash of the SeqHashPair object
84
+ # @return [Integer] size of nt sequence hash of the SeqHashPair object
85
+ def size
86
+ self.dna_hash.size
87
+ end
88
+
83
89
  # Pair-end join function for KNOWN overlap size.
84
90
  # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
85
91
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
@@ -104,17 +110,21 @@ module ViralSeq
104
110
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
105
111
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
106
112
  joined_seq = {}
107
- seq_pair_hash.each do |seq_name, seq_pair|
113
+ seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
108
114
  r1_seq = seq_pair[0]
109
115
  r2_seq = seq_pair[1]
110
116
  if overlap.zero?
111
- joined_seq[seq_name] = r1_seq + r2_seq
117
+ joined_sequence = r1_seq + r2_seq
112
118
  elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
113
- joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
119
+ joined_sequence= r1_seq + r2_seq[overlap..-1]
114
120
  else
115
121
  next
116
122
  end
123
+ seq_names.each do |seq_name|
124
+ joined_seq[seq_name] = joined_sequence
125
+ end
117
126
  end
127
+
118
128
  joined_seq_hash = ViralSeq::SeqHash.new
119
129
  joined_seq_hash.dna_hash = joined_seq
120
130
  joined_seq_hash.title = self.title + "_joined"
@@ -139,7 +149,7 @@ module ViralSeq
139
149
  # my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
140
150
  # my_seqhashpair.join2.dna_hash
141
151
  # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
142
- # my_seqhashpair.join2(model :indiv).dna_hash
152
+ # my_seqhashpair.join2(model: :indiv).dna_hash
143
153
  # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
144
154
 
145
155
  def join2(model: :con, diff: 0.0)
@@ -207,7 +217,7 @@ module ViralSeq
207
217
  # {minimal overlap set to 4. }
208
218
  def overlap_matrix(sequence1, sequence2)
209
219
  min_overlap = 4
210
- max_overlap = [sequence1.size, sequence2.size].max
220
+ max_overlap = [sequence1.size, sequence2.size].min
211
221
  matrix_hash = {}
212
222
  (min_overlap..max_overlap).each do |overlap|
213
223
  matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])