RubyGems - viral_seq - Versions diffs - 1.0.5 → 1.0.10 - Mend

viral_seq 1.0.5 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/Gemfile.lock +6 -4
data/README.md +110 -38
data/bin/locator +31 -9
data/bin/tcs +450 -0
data/lib/viral_seq.rb +4 -1
data/lib/viral_seq/hash.rb +1 -1
data/lib/viral_seq/hivdr.rb +2 -0
data/lib/viral_seq/muscle.rb +2 -2
data/lib/viral_seq/seq_hash.rb +220 -41
data/lib/viral_seq/seq_hash_pair.rb +16 -6
data/lib/viral_seq/tcs_core.rb +303 -0
data/lib/viral_seq/tcs_json.rb +178 -0
data/lib/viral_seq/version.rb +2 -1
data/viral_seq.gemspec +5 -1
metadata +23 -5

data/lib/viral_seq.rb CHANGED

@@ -1,4 +1,4 @@
-# Copyright (c) 2019 Shuntai Zhou (shuntai.zhou@gmail.com)
+# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
 require_relative "viral_seq/sequence"
 require_relative "viral_seq/string"
 require_relative "viral_seq/version"
+require_relative "viral_seq/tcs_core"
+require_relative "viral_seq/tcs_json"
 require "muscle_bio"

data/lib/viral_seq/hash.rb CHANGED

@@ -1,4 +1,4 @@
-# addition methods for Class::Hash required for ViralSeq
+# additional methods for Class::Hash required for ViralSeq
 class Hash

data/lib/viral_seq/hivdr.rb CHANGED

@@ -5,6 +5,8 @@ module ViralSeq
     # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
     #   works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
     #   PR codon 1-99
+    #   RT codon 34-122 (HXB2 2650-2914) and 152-236 (3001-3257)
+    #   IN codon 53-174 (HXB2 4384-4751)
     # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
     #   can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
     # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`

data/lib/viral_seq/muscle.rb CHANGED

@@ -39,8 +39,8 @@ module ViralSeq
     def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
       temp_dir = Dir.home
-      temp_file = temp_dir + "/_temp_muscle_in"
-      temp_aln = temp_dir + "/_temp_muscle_aln"
+      temp_file = File.join(temp_dir, "_temp_muscle_in")
+      temp_aln = File.join(temp_dir, "_temp_muscle_aln")
       name = ">test"
       temp_in = File.open(temp_file,"w")
       temp_in.puts ">ref"

data/lib/viral_seq/seq_hash.rb CHANGED

@@ -9,7 +9,7 @@ module ViralSeq
   #     # align with MUSCLE
   #   filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
   #     # filter nt sequences with the reference coordinates
-  #   filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
+  #   filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
   #     # return a new ViralSeq::SeqHash object without stop codons
   #   filtered_seqhash = filtered_seqhash.a3g[1]
   #     # further filter out sequences with A3G hypermutations
@@ -130,8 +130,8 @@ module ViralSeq
           end
         end
       end
-      sequence_hash = Hash[*sequence_a]
-      quality_hash = Hash[*quality_a]
+      sequence_hash = Hash[sequence_a.each_slice(2).to_a]
+      quality_hash = Hash[quality_a.each_slice(2).to_a]
       seq_hash = ViralSeq::SeqHash.new
       seq_hash.dna_hash = sequence_hash
@@ -166,6 +166,40 @@ module ViralSeq
       alias_method :array, :new_from_array
     end
+    # the size of nt sequence hash of the SeqHash object
+    # @return [Integer] size of nt sequence hash of the SeqHash object
+    def size
+      self.dna_hash.size
+    end
+    # combine SeqHash objects
+    # @param sh2 [ViralSeq::SeqHash] another SeqHash
+    # @return [ViralSeq::SeqHash] combined SeqHash
+    def +(sh2)
+      new_seqhash = ViralSeq::SeqHash.new
+      new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
+      new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
+      new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
+      new_seqhash.title = self.title + "_with_" + sh2.title
+      new_seqhash.file = self.file + "," + sh2.file
+      return new_seqhash
+    end
+    # write the nt sequences to a FASTA format file
+    # @param file [String] path to the FASTA output file
+    # @return [Hash] the nt sequence hash that was written (value of the `File.open` block), not nil
+    def write_nt_fa(file)
+      File.open(file, 'w') do |f|
+        self.dna_hash.each do |k,v|
+          f.puts k
+          f.puts v
+        end
+      end
+    end
     # generate sequences in relaxed sequential phylip format from a ViralSeq::SeqHash object
     # @return [String] relaxed sequential phylip format in a String object
     # @example convert fasta format to relaxed sequential phylip format
@@ -215,10 +249,12 @@ module ViralSeq
     def translate(codon_position = 0)
       seqs = self.dna_hash
       @aa_hash = {}
-      seqs.each do |name, seq|
-        s = ViralSeq::Sequence.new(name, seq)
+      seqs.uniq_hash.each do |seq, array_of_name|
+        s = ViralSeq::Sequence.new('name', seq)
         s.translate(codon_position)
-        @aa_hash[name] = s.aa_string
+        array_of_name.each do |name|
+          @aa_hash[name] = s.aa_string
+        end
       end
       return nil
     end # end of #translate
@@ -277,41 +313,45 @@ module ViralSeq
     # screen for sequences with stop codons.
     # @param (see #translate)
-    # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
+    # @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
     #
-    #   # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
-    #   # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
+    #   # :with_stop_codon : ViralSeq::SeqHash object with stop codons
+    #   # :without_stop_codon: ViralSeq::SeqHash object without stop codons
     # @example given a hash of sequences, return a sub-hash containing only the sequences with stop codons
     #   my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
     #   my_seqhash.dna_hash
     #   => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
-    #   stop_codon_seqhash = my_seqhash.stop_codon[0]
+    #   stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
     #   stop_codon_seqhash.dna_hash
     #   => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
     #   stop_codon_seqhash.aa_hash
     #   => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
     #   stop_codon_seqhash.title
     #   => "my_fasta_file_stop"
-    #   filtered_seqhash = my_seqhash.stop_codon[1]
+    #   filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
     #   filtered_seqhash.aa_hash
     #   {">seq1"=>"IRT", ">seq3"=>"MRT"}
     def stop_codon(codon_position = 0)
       self.translate(codon_position)
       keys = []
-      self.aa_hash.each do |k,v|
-        keys << k if v.include?('*')
+      aa_seqs = self.aa_hash
+      aa_seqs.uniq_hash.each do |seq,array_of_name|
+        keys += array_of_name if seq.include?('*')
       end
       seqhash1 = self.sub(keys)
       seqhash1.title = self.title + "_stop"
-      keys2 = self.aa_hash.keys - keys
+      keys2 = aa_seqs.keys - keys
       seqhash2 = self.sub(keys2)
-      return [seqhash1, seqhash2]
+      return {
+        with_stop_codon: seqhash1,
+        without_stop_codon: seqhash2
+      }
     end #end of #stop_codon
     # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
-    # @param cutoff [Float] majority cut-off for calling consensus bases. defult at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
+    # @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will use the simple majority rule (no cutoff)
     # @return [String] consensus sequence
     # @example consensus sequence from an array of sequences.
     #   seq_array = %w{ ATTTTTTTTT
@@ -343,11 +383,18 @@ module ViralSeq
         base_count = all_base.count_freq
         max_base_list = []
-        base_count.each do |k,v|
-          if v/seq_size.to_f >= cutoff
-            max_base_list << k
+        if cutoff.zero?
+          max_count = base_count.values.max
+          max_base_hash = base_count.select {|_k,v| v == max_count}
+          max_base_list = max_base_hash.keys
+        else
+          base_count.each do |k,v|
+            if v/seq_size.to_f >= cutoff
+              max_base_list << k
+            end
           end
         end
         consensus_seq += call_consensus_base(max_base_list)
       end
       return consensus_seq
@@ -358,14 +405,14 @@ module ViralSeq
     #   # control pattern: G[YN|RC] -> A[YN|RC]
     #   # use the sample consensus to determine potential a3g sites
     #   # Two criteria to identify hypermutation
-    #   # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
+    #   # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
     #   # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
     #   # note:  criteria 2 only applies on a sequence file containing more than 20 sequences,
     #   #        b/c Poisson model does not do well on small sample size.
-    # @return [Array] three values.
-    #   first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
-    #   second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
-    #   third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
+    # @return [Hash] three pairs.
+    #   :a3g_seq : a ViralSeq::SeqHash object for sequences with hypermutations
+    #   :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
+    #   :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
     #     # sequence tag
     #     # G to A mutation numbers at potential a3g positions
     #     # total potential a3g G positions
@@ -376,17 +423,17 @@ module ViralSeq
     # @example identify apobec3gf mutations from a sequence fasta file
     #   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
     #   hypermut = my_seqhash.a3g
-    #   hypermut[0].dna_hash.keys
+    #   hypermut[:a3g_seq].dna_hash.keys
     #   => [">Seq7", ">Seq14"]
-    #   hypermut[1].dna_hash.keys
+    #   hypermut[:filtered_seq].dna_hash.keys
     #   => [">Seq1", ">Seq2", ">Seq5"]
-    #   hypermut[2]
+    #   hypermut[:stats]
     #   => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
     #
     # @example identify apobec3gf mutations from another sequence fasta file
     #   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
     #   hypermut = my_seqhash.a3g
-    #   hypermut[2]
+    #   hypermut[:stats]
     #   => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
     #   # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
     #   # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
@@ -479,7 +526,10 @@ module ViralSeq
       hm_seq_hash.title = self.title + "_hypermut"
       hm_seq_hash.file = self.file
       filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
-      return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
+      return { a3g_seq: hm_seq_hash,
+               filtered_seq: filtered_seq_hash,
+               stats: hm_hash.values
+              }
     end #end of #a3g_hypermut
     alias_method :a3g, :a3g_hypermut
@@ -693,6 +743,7 @@ module ViralSeq
       seq_hash_unique.each do |seq|
         loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
+        next unless loc # if locator tool fails, skip this seq.
         if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
           if indel
             seq_hash_unique_pass << seq
@@ -748,7 +799,7 @@ module ViralSeq
         s.rc!
         loc2 = s.locator(ref_option)
         loc1[2] >= loc2[2] ? (direction = :+; loc = loc1): (direction = :-; loc = loc2)
         names.each do |name|
           out_array << ([title, name, ref_option.to_s, direction.to_s] + loc)
         end
@@ -871,11 +922,11 @@ module ViralSeq
     # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
     # @example gap strip for an array of sequences
     #   array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
-    #   array = { AACCGGTT
-    #             A-CCGGTT
-    #             AAC-GGTT
-    #             AACCG-TT
-    #             AACCGGT- }
+    #   array = %w{ AACCGGTT
+    #               A-CCGGTT
+    #               AAC-GGTT
+    #               AACCG-TT
+    #               AACCGGT- }
     #   my_seqhash = ViralSeq::SeqHash.array(array)
     #   puts my_seqhash.gap_strip.dna_hash.values
     #     ACGT
@@ -930,12 +981,11 @@ module ViralSeq
     # @param (see #gap_strip)
     # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
     # @example gap strip for an array of sequences only at the ends
-    #   array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
-    #   array = { AACCGGTT
-    #             A-CCGGTT
-    #             AAC-GGTT
-    #             AACCG-TT
-    #             AACCGGT- }
+    #   array = %w{ AACCGGTT
+    #               A-CCGGTT
+    #               AAC-GGTT
+    #               AACCG-TT
+    #               AACCGGT- }
     #   my_seqhash = ViralSeq::SeqHash.array(array)
     #   puts my_seqhash.gap_strip_ends.dna_hash.values
     #     AACCGGT
@@ -999,8 +1049,137 @@ module ViralSeq
     end
+    # mutate @dna_hash based on the error_rate
+    # @param error_rate [Float] error rate used to mutate sequences.
+    # @return [ViralSeq::SeqHash] new SeqHash object of mutated sequences.
+    def mutation(error_rate = 0.01)
+      new_seqhash = ViralSeq::SeqHash.new
+      dna = {}
+      self.dna_hash.each do |name, seq|
+        dna[name + '_mut-' + error_rate.to_s] = seq.mutation(error_rate)
+      end
+      new_seqhash.dna_hash = dna
+      new_seqhash.title = self.title + "_mut-" + error_rate.to_s
+      new_seqhash.file = self.file
+      return new_seqhash
+    end
+    # return a table of frequencies of nucleotides at each position.
+    # @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
+    # @param head [Boolean] if the head of table is included.
+    # @return [Array] a two-dimension array of the frequency table,
+    #  including the following info:
+    #    position on the sequence (starting from 1)
+    #    consensus nucleotide
+    #    total sequence numbers
+    #    percentage of A, shows "-" if agrees with consensus
+    #    percentage of C, shows "-" if agrees with consensus
+    #    percentage of G, shows "-" if agrees with consensus
+    #    percentage of T, shows "-" if agrees with consensus
+    #
+    # @example error table for an array of sequences
+    #   array = %w{ AACCGGTT
+    #               AGCCGGTT
+    #               AACTGCTT
+    #               AACCGTTA
+    #               AACCGGTA }
+    #   my_seqhash = ViralSeq::SeqHash.array(array)
+    #   my_seqhash.error_table.each {|r| puts r.join(',')}
+    #     position,consensus,total_seq_number,A,C,G,T
+    #     1,A,5,-,,,
+    #     2,A,5,-,,0.2,
+    #     3,C,5,,-,,
+    #     4,C,5,,-,,0.2
+    #     5,G,5,,,-,
+    #     6,G,5,,0.2,-,0.2
+    #     7,T,5,,,,-
+    #     8,T,5,0.4,,,-
+    def error_table(ref = self.consensus, head = true)
+      table = []
+      if head
+        table << %w{
+          position
+          consensus
+          total_seq_number
+          A
+          C
+          G
+          T
+        }
+      end
+      ref_size = ref.size
+      (0..(ref_size - 1)).each do |position|
+        ref_base = ref[position]
+        nts = []
+        self.dna_hash.each do |_k,v|
+          nts << v[position]
+        end
+        freq = nts.count_freq
+        freq2 = {}
+        freq.each do |nt,c|
+          if nt == ref_base
+            freq2[nt] = '-'
+          else
+            freq2[nt] = (c/(self.size).to_f)
+          end
+        end
+        table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
+      end
+      return table
+    end # end of error_table
+    # randomly select n number of sequences from the original SeqHash object
+    # @param n [Integer] number of sequences to randomly select
+    # @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
+    def random_select(n = 100)
+      new_sh = ViralSeq::SeqHash.new
+      dna_hash = self.dna_hash
+      aa_hash = self.aa_hash
+      qc_hash = self.qc_hash
+      keys = dna_hash.keys.sample(n)
+      keys.each do |k|
+        new_sh.dna_hash[k] = dna_hash[k]
+        new_sh.aa_hash[k] = aa_hash[k]
+        new_sh.qc_hash[k] = qc_hash[k]
+      end
+      new_sh.title = self.title + "_" + n.to_s
+      return new_sh
+    end
+    # trim dna sequences based on the provided reference coordinates.
+    # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
+    # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
+    # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
+    # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
+    # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
+    def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
+      seq_hash = self.dna_hash.dup
+      seq_hash_unique = seq_hash.uniq_hash
+      trimmed_seq_hash = {}
+      seq_hash_unique.each do |seq, names|
+        trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
+        names.each do |name|
+          trimmed_seq_hash[name] = trimmed_seq
+        end
+      end
+      return_seq_hash = self.dup
+      return_seq_hash.dna_hash = trimmed_seq_hash
+      return return_seq_hash
+    end
     # start of private functions
     private

data/lib/viral_seq/seq_hash_pair.rb CHANGED

@@ -7,7 +7,7 @@ module ViralSeq
   # @example join the paired-end sequences with an overlap of 100 bp
   #   my_seqhashpair.join1(100)
   # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
-  #   my_seqhashpair.join1(:indiv)
+  #   my_seqhashpair.join2(model: :indiv)
   class SeqHashPair
@@ -80,6 +80,12 @@ module ViralSeq
       alias_method :fa, :new_from_fasta
     end
+    # the size of nt sequence hash of the SeqHashPair object
+    # @return [Integer] size of nt sequence hash of the SeqHashPair object
+    def size
+      self.dna_hash.size
+    end
     # Pair-end join function for KNOWN overlap size.
     # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
     # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
@@ -104,17 +110,21 @@ module ViralSeq
       raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
       raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
       joined_seq = {}
-      seq_pair_hash.each do |seq_name, seq_pair|
+      seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
         r1_seq = seq_pair[0]
         r2_seq = seq_pair[1]
         if overlap.zero?
-          joined_seq[seq_name] = r1_seq + r2_seq
+          joined_sequence = r1_seq + r2_seq
         elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
-          joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
+          joined_sequence= r1_seq + r2_seq[overlap..-1]
         else
           next
         end
+        seq_names.each do |seq_name|
+          joined_seq[seq_name] = joined_sequence
+        end
       end
       joined_seq_hash = ViralSeq::SeqHash.new
       joined_seq_hash.dna_hash = joined_seq
       joined_seq_hash.title = self.title + "_joined"
@@ -139,7 +149,7 @@ module ViralSeq
     #   my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
     #   my_seqhashpair.join2.dna_hash
     #   => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
-    #   my_seqhashpair.join2(model :indiv).dna_hash
+    #   my_seqhashpair.join2(model: :indiv).dna_hash
     #   => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
     def join2(model: :con, diff: 0.0)
@@ -207,7 +217,7 @@ module ViralSeq
     # {minimal overlap set to 4. }
     def overlap_matrix(sequence1, sequence2)
       min_overlap = 4
-      max_overlap = [sequence1.size, sequence2.size].max
+      max_overlap = [sequence1.size, sequence2.size].min
       matrix_hash = {}
       (min_overlap..max_overlap).each do |overlap|
         matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])