RubyGems - viral_seq - Versions diffs - 0.3.2 → 1.0.0 - Mend

viral_seq 0.3.2 → 1.0.0

Files changed (30) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +7 -1
data/lib/viral_seq/Integer.rb +16 -0
data/lib/viral_seq/constant.rb +7 -0
data/lib/viral_seq/enumerable.rb +132 -0
data/lib/viral_seq/hash.rb +45 -0
data/lib/viral_seq/hivdr.rb +454 -0
data/lib/viral_seq/math.rb +128 -380
data/lib/viral_seq/muscle.rb +60 -82
data/lib/viral_seq/pid.rb +26 -0
data/lib/viral_seq/ref_seq.rb +35 -0
data/lib/viral_seq/rubystats.rb +172 -0
data/lib/viral_seq/seq_hash.rb +1043 -0
data/lib/viral_seq/seq_hash_pair.rb +219 -0
data/lib/viral_seq/sequence.rb +571 -348
data/lib/viral_seq/string.rb +119 -0
data/lib/viral_seq/version.rb +1 -1
data/lib/viral_seq.rb +14 -15
metadata +13 -12
data/lib/viral_seq/a3g.rb +0 -172
data/lib/viral_seq/fasta.rb +0 -154
data/lib/viral_seq/hcv_dr.rb +0 -54
data/lib/viral_seq/locator.rb +0 -299
data/lib/viral_seq/misc.rb +0 -103
data/lib/viral_seq/nt_variation.rb +0 -148
data/lib/viral_seq/poisson_cutoff.rb +0 -68
data/lib/viral_seq/refseq.rb +0 -45
data/lib/viral_seq/sdrm_core.rb +0 -652
data/lib/viral_seq/tcs_core.rb +0 -556

data/lib/viral_seq/seq_hash_pair.rb ADDED Viewed

@@ -0,0 +1,219 @@
+module ViralSeq
+  # Class for paired-end sequences.
+  # @example initialize a new SeqHashPair object from a directory containing paired-end sequences
+  #   my_seqhashpair = ViralSeq::SeqHashPair.fa('my_seq_directory')
+  # @example join the paired-end sequences with an overlap of 100 bp
+  #   my_seqhashpair.join1(100)
+  # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
+  #   my_seqhashpair.join1(:indiv)
+  class SeqHashPair
+    # initialize SeqHashPair object with @dna_hash, @title and @file
+    def initialize (dna_hash = {}, title = "", file = [])
+      @dna_hash = dna_hash
+      @title = title
+      @file = file
+    end
+    # @return [Hash] Hash object for :name => [:r1_sequence_string, :r2_sequence_string]
+    attr_accessor :dna_hash
+    # @return [String] the title of the SeqHashPair object.
+    # Defaults to the directory basename if the SeqHashPair object is initialized using ::fa
+    attr_accessor :title
+    # @return [Array<String>] the r1 and r2 files that are used to initialize the SeqHashPair object, if they exist
+    attr_accessor :file
+    # initialize a new ViralSeq::SeqHashPair object from a directory containing paired sequence files in the FASTA format
+    # @param indir [String] directory containing paired sequence files in the FASTA format,
+    #
+    #     Paired sequence files need to have "r1" and "r2" in their file names
+    #
+    #     Example for the file structure
+    #       ├───lib1
+    #           │     lib1_r1.txt
+    #           │     lib1_r2.txt
+    #     The sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
+    # @return [ViralSeq::SeqHashPair] new SeqHashPair object from the paired FASTA sequence files
+    # @example initialize a new SeqHashPair object from a directory containing paired-end sequences
+    #   my_seqhashpair = ViralSeq::SeqHashPair.fa('spec/sample_paired_seq')
+    def self.new_from_fasta(indir)
+      files = Dir[indir + "/*"]
+      r1_file = ""
+      r2_file = ""
+      files.each do |f|
+        if File.basename(f) =~ /r1/i
+          r1_file = f
+        elsif File.basename(f) =~ /r2/i
+          r2_file = f
+        end
+      end
+      seq1 = ViralSeq::SeqHash.fa(r1_file).dna_hash
+      seq2 = ViralSeq::SeqHash.fa(r2_file).dna_hash
+      new_seq1 = seq1.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
+      new_seq2 = seq2.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
+      seq_pair_hash = {}
+      new_seq1.each do |seq_name,seq|
+        seq_pair_hash[seq_name] = [seq, new_seq2[seq_name]]
+      end
+      seq_hash = ViralSeq::SeqHashPair.new
+      seq_hash.dna_hash = seq_pair_hash
+      seq_hash.title = File.basename(indir,".*")
+      seq_hash.file = [r1_file, r2_file]
+      return seq_hash
+    end # end of .new_from_fasta
+    class << self
+      alias_method :fa, :new_from_fasta
+    end
+    # Paired-end join function for KNOWN overlap size.
+    # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
+    # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
+    # @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
+    # @example join paired-end sequences with different :diff cut-offs, overlap provided.
+    #   paired_seqs = {">pair1"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
+    #                             "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
+    #                  ">pair2"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
+    #                             "AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"],
+    #                  ">pair3"=>["GGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
+    #                             "AAAAAAAAAAGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTT"]}
+    #   my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seqs)
+    #   my_seqhashpair.join1(100).dna_hash.keys
+    #   => [">pair1"]
+    #   my_seqhashpair.join1(100,0.01).dna_hash.keys
+    #   => [">pair1", ">pair2"]
+    #   my_seqhashpair.join1(100,0.02).dna_hash.keys
+    #   => [">pair1", ">pair2", ">pair3"]
+    def join1(overlap = 0, diff = 0.0)
+      seq_pair_hash = self.dna_hash
+      raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
+      raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
+      joined_seq = {}
+      seq_pair_hash.each do |seq_name, seq_pair|
+        r1_seq = seq_pair[0]
+        r2_seq = seq_pair[1]
+        if overlap.zero?
+          joined_seq[seq_name] = r1_seq + r2_seq
+        elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
+          joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
+        else
+          next
+        end
+      end
+      joined_seq_hash = ViralSeq::SeqHash.new
+      joined_seq_hash.dna_hash = joined_seq
+      joined_seq_hash.title = self.title + "_joined"
+      joined_seq_hash.file = File.dirname(self.file[0]) if self.file.size > 0
+      return joined_seq_hash
+    end # end of join1
+    # Paired-end join function for UNKNOWN overlap.
+    # @param model [Symbol] models used to determine the overlap, `:con`, `:indiv`
+    #
+    #   model `:con`: overlap is determined based on consensus, all sequence pairs are supposed to have the same overlap size
+    #
+    #     note: minimal overlap as 4 bases.
+    #   model `:indiv`: overlap is determined for each sequence pair, sequence pairs can have different size of overlap
+    # @param diff (see #join1)
+    # @return (see #join1)
+    # @example join paired-end sequences, overlap NOT provided
+    #   paired_seq2 = {">pair4" => ["AAAGGGGGGG", "GGGGGGGTT"],
+    #                  ">pair5" => ["AAAAAAGGGG", "GGGGTTTTT"],
+    #                  ">pair6" => ["AAACAAGGGG", "GGGGTTTTT"] }
+    #   my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
+    #   my_seqhashpair.join2.dna_hash
+    #   => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
+    #   my_seqhashpair.join2(:indiv).dna_hash
+    #   => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
+    def join2(model = :con, diff = 0.0)
+      raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
+      if model == :con
+        # a single consensus overlap for all pairs, then the fixed-overlap join
+        self.join1(determine_overlap_pid_pair(self.dna_hash, diff), diff)
+      elsif model == :indiv
+        joined_seq = self.dna_hash.each_with_object({}) do |(seq_name, seq_pair), joined|
+          # overlap sizes whose mismatch count stays within the allowed rate
+          passing = overlap_matrix(seq_pair[0], seq_pair[1]).select { |overlap1, diff_nt| diff_nt <= overlap1 * diff }
+          # largest acceptable overlap is trimmed from R2; without one the reads are concatenated
+          r2_part = passing.empty? ? seq_pair[1] : seq_pair[1][(passing.keys.max)..-1]
+          joined[seq_name] = seq_pair[0] + r2_part
+        end
+        joined_seq_hash = ViralSeq::SeqHash.new
+        joined_seq_hash.dna_hash = joined_seq
+        joined_seq_hash.title = self.title + "_joined"
+        joined_seq_hash.file = File.dirname(self.file[0]) if self.file.size > 0
+        joined_seq_hash
+      else
+        raise ArgumentError.new("Error::Wrong Overlap Model Argument. Given \`#{model}\`, expected `:con` or `:indiv`.")
+      end
+    rescue ArgumentError => e
+      # invalid arguments are reported on stdout and signalled with a nil result
+      puts e
+      nil
+    end # end of join2
+    private
+    # determine overlap size from @dna_hash
+    def determine_overlap_pid_pair(seq_pair_hash, diff = 0.0)
+      overlaps = []
+      seq_pair_hash.each do |_seq_name, seq_pair|
+        overlap_list = []
+        matrix = overlap_matrix(seq_pair[0], seq_pair[1])
+        matrix.each do |overlap, diff_nt|
+          cut_off_base = overlap * diff
+          overlap_list << overlap if diff_nt <= cut_off_base
+        end
+        if overlap_list.empty?
+          overlaps << 0
+        else
+          overlaps << overlap_list.max
+        end
+      end
+      count_overlaps = overlaps.count_freq
+      max_value = count_overlaps.values.max
+      max_overlap_list = []
+      count_overlaps.each {|overlap, counts| max_overlap_list << overlap if counts == max_value}
+      max_overlap_list.max
+    end # end pf determine_overlap_pid_pair
+    # input a pair of sequences as String, return a Hash object of overlapping Hash object
+    # {:overlap_size => number_of_differnt_positions, ...}
+    # {minimal overlap set to 4. }
+    def overlap_matrix(sequence1, sequence2)
+      min_overlap = 4
+      max_overlap = [sequence1.size, sequence2.size].max
+      matrix_hash = {}
+      (min_overlap..max_overlap).each do |overlap|
+        matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
+      end
+      return matrix_hash
+    end # end of overlap_matrix
+  end # end of SeqHashPair
+end # end of ViralSeq

data/lib/viral_seq/sequence.rb CHANGED Viewed

@@ -1,392 +1,615 @@
-# lib/sequence.rb
-# Includes functions for sequence operations
-# Including methods as:
-#   ViralSeq::AMINO_ACID_LIST
-#   ViralSeq::Sequence
-#   ViralSeq::Sequence#rev_complement
-#   ViralSeq::Sequence#get_aa_sequence
-#   ViralSeq::Sequence#get_aa_array
-#   ViralSeq::Sequence#name
-#   ViralSeq::Sequence#dna_sequence
-#   ViralSeq::Sequence#aa_sequence
-#   ViralSeq::Sequence#aa_array
-#   ViralSeq::amino_acid
-#   ViralSeq::amino_acid_2
-#   ViralSeq::to_list
-#   ViralSeq::uniq_sequence_hash
-#   ViralSeq::stop_codon_seq_hash
-#   String#rc
-#   String#mutation
-#   String#nt_parser
-# ViralSeq::AMINO_ACID_LIST
-#   # Array of all amino acid one letter abbreviations
-# ViralSeq::Sequence
-#   # Sequence class
-# =USAGE
-#   # create a sequence object
-#   seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
-#
-#   # print dna sequence
-#   puts seq.dna_sequence
-#
-#   # reserce complement sequence of DNA sequence, return as a string
-#   seq.rev_complement
-#
-#   # change @dna_sequence to reverse complement DNA sequence
-#   seq.rev_complement!
-#
-#   # generate amino acid sequences. either return string or array.
-#   # starting codon option 0, 1, 2 for 1st, 2nd, 3rd reading frame.
-#   # if sequence contains ambiguities, Sequence.get_aa_array will return all possible amino acids.
-#   seq.get_aa_sequence
-#   # or
-#   seq.get_aa_array
-#
-#   # print amino acid sequence
-#   puts seq.aa_sequence
-# ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
-#   # collapse sequence hash to unique sequence hash.
-#   # input_sequence_hash is a sequence Hash object {:name => :sequence, ...}
-#   # master_sequence_tag is the master tag for unique sequences
-#   # sequences will be named as (master_sequence_tag + "_" + Integer + "_" + Counts)
-# =USAGE
-#   sequences = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA',
-#                '>seq4' => 'CCCC', '>seq5' => 'CCCC',
-#                '>seq6' => 'TTTT' }
-#   uniq_sequence = ViralSeq.uniq_sequence_hash(sequences)
-#   => {">sequence_1_3"=>"AAAA", ">sequence_2_2"=>"CCCC", ">sequence_3_1"=>"TTTT"}
 module ViralSeq
-  # array for all amino acid one letter abbreviations
-  AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
-  # sequence class
+  # ViralSeq::Sequence class for sequence operation
+  #
+  # @example create a sequence object
+  #   seq = ViralSeq::Sequence.new('my_sequence', 'ACCTAGGTTCGGAGC')
+  #   => #<ViralSeq::Sequence:0x00007fd03c8c10b8 @name="my_sequence", @dna="ACCTAGGTTCGGAGC", @aa_string="", @aa_array=[]>
+  #
+  # @example return dna sequence as String
+  #   seq.dna
+  #   => "ACCTAGGTTCGGAGC"
+  #
+  # @example reverse complement sequence of DNA sequence
+  #   seq.rc
+  #   => "GCTCCGAACCTAGGT"
+  #
+  # @example change @dna to reverse complement DNA sequence
+  #   seq.rc!
+  #
+  # @example translate the DNA sequence, return values for @aa_string and @aa_array
+  #   seq = ViralSeq::Sequence.new('my_sequence', 'AWTCGRAGAG')
+  #   seq.translate(1)
+  #   seq.aa_string
+  #   => "##E"
+  #   seq.aa_array
+  #   => ["IF", "EG", "E"]
   class Sequence
+    # Create a ViralSeq::Sequence from a sequence name (default '>sequence') and a DNA String,
+    # which is stored upper-cased in @dna; @aa_string and @aa_array start empty (see #translate).
     def initialize (name = ">sequence",dna_sequence ="")
       @name = name
-      @dna_sequence = dna_sequence.upcase
-      @aa_sequence = ""
+      @dna = dna_sequence.upcase
+      @aa_string = ""
       @aa_array = []
     end
-    attr_accessor :name, :dna_sequence, :aa_sequence, :aa_array
+    # @return [String] sequence tag name
+    attr_accessor :name
+    # @return [String] DNA sequence
+    attr_accessor :dna
+    # @return [String] amino acid sequence
+    attr_accessor :aa_string
+    # @return [Array] amino acid sequence as an Array object,
+    # ambiguity dna sequence will be translated in all possible amino acid sequence at the position
+    attr_accessor :aa_array
+    # @return [String] reverse complement of @dna, obtained from String#rc; @dna is not reassigned.
     def rev_complement
-      @dna_sequence.reverse.upcase.tr('ATCG','TAGC')
+      @dna.rc
     end
+    # Replace @dna with its reverse complement (the assigning counterpart of #rev_complement).
     def rev_complement!
-      @dna_sequence = @dna_sequence.reverse.upcase.tr('ATCG','TAGC')
+      @dna = @dna.rc
     end
-    def get_aa_sequence(initial_position = 0)
-      @aa_sequence = ""
-      require_sequence = @dna_sequence[initial_position..-1]
+    alias_method :rc, :rev_complement
+    alias_method :rc!, :rev_complement!
+    # Translate @dna codon by codon, starting at initial_position; a trailing partial codon is ignored.
+    # Results are stored in @aa_string (via #amino_acid) and @aa_array (via #amino_acid_2); read them through the accessors.
+    # @param initial_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
+    def translate(initial_position = 0)
+      @aa_string = "" # discard any previous translation
+      require_sequence = @dna[initial_position..-1]
       base_array = []
       require_sequence.each_char {|base| base_array << base}
       while (base_array.length>=3) do
         base_3= ""
         3.times {base_3 += base_array.shift}
-        @aa_sequence << amino_acid(base_3)
+        @aa_string << amino_acid(base_3) # one entry per codon
       end
-      return @aa_sequence
-    end
-    # get amino acid calls, return a array.keep ambiguity calls.
-    def get_aa_array(initial_position = 0)
       @aa_array = []
-      require_sequence = @dna_sequence[initial_position..-1].tr('-','N')
+      require_sequence = @dna[initial_position..-1].tr('-','N') # second pass for @aa_array: gaps ('-') are read as N
       base_array = []
       require_sequence.each_char {|base| base_array << base}
       while (base_array.length>=3) do
         base_3= ""
         3.times{base_3 += base_array.shift}
-        @aa_array<< ViralSeq.amino_acid_2(base_3)
+        @aa_array<< amino_acid_2(base_3) # one entry per codon
       end
-      return @aa_array
     end
+    # @return [Integer] number of characters in @dna
     def dna_length
-      @dna_sequence.length
+      @dna.length
     end
+    # @return [Integer] length of @aa_string (which is empty until #translate or the accessor sets it)
     def aa_length
-      @aa_sequence.length
+      @aa_string.length
     end
-  end
-  # generate amino acid abbreviations from 3 bases, ambiguity will return "#"
-  def self.amino_acid (bases)
-    case bases
-    when /^TT[TCY]$/
-      return "F"
-    when /^TT[AGR]$/
-      return "L"
-    when /^CT.$/
-      return "L"
-    when /^AT[TCAHYWM]$/
-      return "I"
-    when "ATG"
-      return "M"
-    when /^GT.$/
-      return "V"
-    when /^TC.$/
-      return "S"
-    when /^CC.$/
-      return "P"
-    when /^AC.$/
-      return "T"
-    when /^GC.$/
-      return "A"
-    when /^TA[TCY]$/
-      return "Y"
-    when /^TA[AGR]$/
-      return "*"
-    when /^T[GR]A$/
-      return "*"
-    when /^CA[TCY]$/
-      return "H"
-    when /^CA[AGR]$/
-      return "Q"
-    when /^AA[TCY]$/
-      return "N"
-    when /^AA[AGR]$/
-      return "K"
-    when /^GA[TCY]$/
-      return "D"
-    when /^GA[AGR]$/
-      return "E"
-    when /^TG[TCY]$/
-      return "C"
-    when "TGG"
-      return "W"
-    when /^CG.$/
-      return "R"
-    when /^AG[TCY]$/
-      return "S"
-    when /^[AM]G[AGR]$/
-      return "R"
-    when /^GG.$/
-      return "G"
-    when /^[ATW][CGS][CTY]$/
-      return "S"
-    when /^[TCY]T[AGR]$/
-      return "L"
-    else
-      return "#"
-    end
-  end
-  # keep ambiguities, return all possible amino acids.
-  def self.amino_acid_2 (bases)
-    bases_to_aa = []
-    aa_list = []
-    base1 = ViralSeq.to_list(bases[0])
-    base2 = ViralSeq.to_list(bases[1])
-    base3 = ViralSeq.to_list(bases[2])
-    l1 = base1.size - 1
-    l2 = base2.size - 1
-    l3 = base3.size - 1
-    (0..l1).each do |n1|
-      b1 = base1[n1]
-      (0..l2).each do |n2|
-        b2 = base2[n2]
-        (0..l3).each do |n3|
-          b3 = base3[n3]
-          bases_all = b1 + b2 + b3
-          bases_to_aa << bases_all
+    # resistant mutation interpretation for a chosen region from a translated ViralSeq::Sequence object
+    # @param option [Symbol] option of region to interpret, `:hcv_ns5a`, `:hiv_pr`, `:nrti`, `:nnrti`, `:hiv_in`
+    # @param start_aa [Integer] the starting aa number of the input sequence
+    # @return [Hash] return a Hash object for SDRMs identified. :position => [:wildtype_amino_acid, :observed_amino_acid]
+    # @example examine an HIV PR region sequence for drug resistance mutations
+    #   my_seq_name = 'a_pr_seq'
+    #   my_seq = 'CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAAATAGGAGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATAATACAGTATTAGAAGACATGGAGTTACCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATCTGTGGGCATAAAACTACAGGTACAGTGTTAATAGGACCTACACCCGTCAACATAATTGGAAGAGATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTT'
+    #   s = ViralSeq::Sequence.new(my_seq_name, my_seq)
+    #   s.translate
+    #   s.sdrm(:hiv_pr)
+    #   => {30=>["D", "N"], 88=>["N", "D"]}
+    def sdrm(option, start_aa = 1)
+      aa_array = self.aa_array
+      out_hash = {}
+      sdrm = sdrm_hash(option)
+      aa_length = aa_array.size
+      end_aa = start_aa + aa_length - 1
+      (start_aa..end_aa).each do |position|
+        array_position = position - start_aa
+        if sdrm.keys.include?(position)
+          wt_aa = sdrm[position][0]
+          test_aa = aa_array[array_position]
+          if test_aa.size == 1
+            unless wt_aa == test_aa
+              if sdrm[position][1].include?(test_aa)
+                out_hash[position] = [wt_aa,test_aa]
+              end
+            end
+          else
+            test_aa_array = test_aa.split("")
+            if (test_aa_array & sdrm[position][1])
+              out_hash[position] = [wt_aa,test_aa]
+            end
+          end
         end
       end
-    end
+      return out_hash
+    end # end of #hcv_ns5a
-    bases_to_aa.each do |base|
-    case base
-    when /^TT[TCY]$/
-      aa =  "F"
-    when /^TT[AGR]$/
-      aa =  "L"
-    when /^CT.$/
-      aa =  "L"
-    when /^AT[TCAHYWM]$/
-      aa =  "I"
-    when "ATG"
-      aa =  "M"
-    when /^GT.$/
-      aa =  "V"
-    when /^TC.$/
-      aa =  "S"
-    when /^CC.$/
-      aa =  "P"
-    when /^AC.$/
-      aa =  "T"
-    when /^GC.$/
-      aa =  "A"
-    when /^TA[TCY]$/
-      aa =  "Y"
-    when /^TA[AGR]$/
-      aa =  "*"
-    when /^T[GR]A$/
-      aa =  "*"
-    when /^CA[TCY]$/
-      aa =  "H"
-    when /^CA[AGR]$/
-      aa =  "Q"
-    when /^AA[TCY]$/
-      aa =  "N"
-    when /^AA[AGR]$/
-      aa =  "K"
-    when /^GA[TCY]$/
-      aa =  "D"
-    when /^GA[AGR]$/
-      aa =  "E"
-    when /^TG[TCY]$/
-      aa =  "C"
-    when "TGG"
-      aa =  "W"
-    when /^CG.$/
-      aa =  "R"
-    when /^AG[TCY]$/
-      aa =  "S"
-    when /^[AM]G[AGR]$/
-      aa =  "R"
-    when /^GG.$/
-      aa =  "G"
-    when /^[ATW][CGS][CTY]$/
-      aa =  "S"
-    when /^[TCY]T[AGR]$/
-      aa =  "L"
-    else
-      aa =  "-"
-    end
-    aa_list << aa
-  end
-    aa_out = aa_list.uniq.join('/')
-    return aa_out
-  end
-  # parse ambiguity bases, aka %w{W S M K R Y B D H V N}
-  def self.to_list(base = "")
-    list = []
-    case base
-    when /[A|T|C|G]/
-      list << base
-    when "W"
-      list = ['A','T']
-    when "S"
-      list = ['C','G']
-    when "M"
-      list = ['A','C']
-    when 'K'
-      list = ['G','C']
-    when 'R'
-      list = ['A','G']
-    when 'Y'
-      list = ['C','T']
-    when 'B'
-      list = ['C','G','T']
-    when 'D'
-      list = ['A','G','T']
-    when 'H'
-      list = ['A','C','T']
-    when 'V'
-      list = ['A','C','G']
-    when 'N'
-      list = ['A','T','C','G']
-    end
-    return list
-  end
-  # ViralSeq.uniq_sequence_hash(input_sequence_hash, master_sequence_tag)
-  # collapse sequence hash to unique sequence hash.
-  # input_sequence_hash is a sequence hash {:name => :sequence, ...}
-  # master_sequence_tag is the master tag for unique sequences
-  # sequences will be named as (master_sequence_tag + "_" + Integer)
-  def self.uniq_sequence_hash(seq = {}, sequence_name = "sequence")
-    uni = ViralSeq.count(seq.values)
-    new_seq = {}
-    n = 1
-    uni.each do |s,c|
-      name = ">" + sequence_name + "_" + n.to_s + "_" + c.to_s
-      new_seq[name] = s
-      n += 1
-    end
-    return new_seq
-  end
-  # input a sequence hash, return a sequence hash with stop codons.
-  def self.stop_codon_seq_hash(seq_hash, rf = 0)
-    out_seq_hash = {}
-    seq_hash.each do |k,v|
-      sequence = Sequence.new(k,v)
-      sequence.get_aa_array(rf)
-      if sequence.aa_array.include?("*")
-        out_seq_hash[k] = v
+    # HIV sequence locator function, resembling HIV Sequence Locator from LANL
+    #   # current version only supports nucleotide sequence, not for amino acid sequence.
+    # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
+    # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
+    # @return [Array] an array of the following info
+    #   #   start_location (Integer)
+    #   #   end_location (Integer)
+    #   #   percentage_of_similarity_to_reference_sequence (Float)
+    #   #   containing_indel? (Boolean)
+    #   #   aligned_input_sequence (String)
+    #   #   aligned_reference_sequence (String)
+    # @example identify the location of the input sequence on the NL43 genome
+    #   sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
+    #   s = ViralSeq::Sequence.new('my_sequence', sequence)
+    #   loc = s.locator(:NL43)
+    #   h = ViralSeq::SeqHash.new; h.dna_hash['NL43'] = loc[5]; h.dna_hash[s.name] = loc[4]
+    #   rs_string = h.to_rsphylip.split("\n")[1..-1].join("\n") # get a relaxed phylip format string for display of alignment.
+    #   puts "The input sequence \"#{s.name}\" is located on the NL43 nt sequence from #{loc[0].to_s} to #{loc[1].to_s}.\nIt is #{loc[2].to_s}% similar to the reference.\nIt #{loc[3]? "does" : "does not"} have indels.\nThe alignment is\n#{rs_string}"
+    #   => The input sequence "my_sequence" is located on the NL43 nt sequence from 2333 to 2433.
+    #   => It is 98.0% similar to the reference.
+    #   => It does not have indels.
+    #   => The alignment is
+    #   => NL43         AGCAGATGAT ACAGTATTAG AAGAAATGAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAGTATGAT C
+    #   => my_sequence  AGCAGATGAT ACAGTATTAG AAGAAATAAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAATATGAT C
+    # @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
+    def locator(ref_option = :HXB2, path_to_muscle = false)
+      seq = self.dna
+      ori_ref = ViralSeq::RefSeq.get(ref_option) # reference sequence selected by ref_option
+      begin
+        ori_ref_l = ori_ref.size
+        l1 = 0 # reference positions trimmed so far from the start (used to slice ori_ref below)
+        l2 = 0 # reference positions trimmed so far from the end
+        aln_seq = ViralSeq::Muscle.align(ori_ref, seq, path_to_muscle) # aln_seq[0] is used as the aligned reference, aln_seq[1] as the aligned input
+        aln_test = aln_seq[1]
+        aln_test =~ /^(\-*)(\w.*\w)(\-*)$/ # split the aligned input into leading gaps, core and trailing gaps
+        gap_begin = $1.size
+        gap_end = $3.size
+        aln_test2 = $2
+        ref = aln_seq[0]
+        ref = ref[gap_begin..(-gap_end-1)] # keep only the reference columns spanned by the input
+        ref_size = ref.size
+        if ref_size > 1.3*(seq.size) # window still much longer than the input: narrow it and realign
+          l1 = l1 + gap_begin
+          l2 = l2 + gap_end
+          max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length) # longest gap-free stretch of the aligned input
+          aln_test2 =~ /#{max_seq}/
+          before_aln_seq = $`
+          before_aln = $`.size
+          post_aln_seq = $'
+          post_aln = $'.size
+          before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
+          b1 = (1.3 * before_aln_seq_size).to_i
+          post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
+          b2 = (1.3 * post_aln_seq_size).to_i
+          if (before_aln > seq.size) and (post_aln <= seq.size)
+            ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
+            l1 = l1 + (before_aln - b1)
+          elsif (post_aln > seq.size) and (before_aln <= seq.size)
+            ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
+            l2 = l2 + post_aln - b2
+          elsif (post_aln > seq.size) and (before_aln > seq.size)
+            ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
+            l1 = l1 + (before_aln - b1)
+            l2 = l2 + (post_aln - b2)
+          end
+          aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
+          aln_test = aln_seq[1]
+          aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
+          gap_begin = $1.size
+          gap_end = $3.size
+          ref = aln_seq[0]
+          ref = ref[gap_begin..(-gap_end-1)]
+        end
+        aln_test = aln_seq[1]
+        aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
+        gap_begin = $1.size
+        gap_end = $3.size
+        aln_test = $2
+        aln_test =~ /^(\w+)(\-*)\w/ # s1: first ungapped run of the input, g1: the gap run right after it
+        s1 = $1.size
+        g1 = $2.size
+        aln_test =~ /\w(\-*)(\w+)$/ # s2: last ungapped run of the input, g2: the gap run right before it
+        s2 = $2.size
+        g2 = $1.size
+        l1 = l1 + gap_begin
+        l2 = l2 + gap_end
+        repeat = 0 # set to 1 whenever the window is trimmed and has to be realigned
+        if g1 == g2 and (s1 + g1 + s2) == ref.size
+          if s1 > s2 and g2 > 2*s2
+            ref = ref[0..(-g2-1)]
+            repeat = 1
+            l2 = l2 + g2
+          elsif s1 < s2 and g1 > 2*s1
+            ref = ref[g1..-1]
+            repeat = 1
+            l1 = l1 + g1
+          end
+        else
+          if g1 > 2*s1
+            ref = ref[g1..-1]
+            repeat = 1
+            l1 = l1 + g1
+          end
+          if g2 > 2*s2
+            ref = ref[0..(-g2 - 1)]
+            repeat = 1
+            l2 = l2 + g2
+          end
+        end
+        while repeat == 1 # realign until no further trimming is triggered
+          aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
+          aln_test = aln_seq[1]
+          aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
+          gap_begin = $1.size
+          gap_end = $3.size
+          aln_test = $2
+          aln_test =~ /^(\w+)(\-*)\w/
+          s1 = $1.size
+          g1 = $2.size
+          aln_test =~ /\w(\-*)(\w+)$/
+          s2 = $2.size
+          g2 = $1.size
+          ref = aln_seq[0]
+          ref = ref[gap_begin..(-gap_end-1)]
+          l1 = l1 + gap_begin
+          l2 = l2 + gap_end
+          repeat = 0
+          if g1 > 2*s1
+            ref = ref[g1..-1]
+            repeat = 1
+            l1 = l1 + g1
+          end
+          if g2 > 2*s2
+            ref = ref[0..(-g2 - 1)]
+            repeat = 1
+            l2 = l2 + g2
+          end
+        end
+        ref = ori_ref[l1..(ori_ref_l - l2 - 1)] # rebuild the window from the original reference using the accumulated offsets
+        aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
+        aln_test = aln_seq[1]
+        ref = aln_seq[0]
+        # refine alignment: adjust the offsets when the realigned reference starts or ends with gaps
+        if ref =~ /^(\-+)/
+          l1 = l1 - $1.size
+        elsif ref =~ /(\-+)$/
+          l2 = l2 + $1.size
+        end
+        if (ori_ref_l - l2 - 1) >= l1 # only report a location when the window is not empty
+          ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
+          aln_seq = ViralSeq::Muscle.align(ref, seq, path_to_muscle)
+          aln_test = aln_seq[1]
+          ref = aln_seq[0]
+          ref_size = ref.size
+          sim_count = 0
+          (0..(ref_size-1)).each do |n|
+            ref_base = ref[n]
+            test_base = aln_test[n]
+            sim_count += 1 if ref_base == test_base
+          end
+          similarity = (sim_count/ref_size.to_f*100).round(1) # percent of alignment columns that are identical, one decimal
+          loc_p1 = l1 + 1 # 1-based start on the reference
+          loc_p2 = ori_ref_l - l2 # 1-based end on the reference
+          if seq.size != (loc_p2 - loc_p1 + 1)
+              indel = true
+          elsif aln_test.include?("-")
+              indel = true
+          else
+              indel = false
+          end
+          return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
+        else
+          return [0,0,0,0,0,0,0] # NOTE(review): 7 zeros here vs. 6 elements on the success path - confirm intended
+        end
+      rescue => e # any StandardError is reported on stdout and turned into a nil result
+        puts "Unexpected error occured."
+        puts "Exception Class: #{ e.class.name }"
+        puts "Exception Message: #{ e.message }"
+        puts "Exception Backtrace: #{ e.backtrace[0] }"
+        puts "ViralSeq.sequence_locator returns nil" # NOTE(review): message names ViralSeq.sequence_locator, this method is #locator
+        return nil
       end
-    end
-    return out_seq_hash
-  end
-end
-# functions added to Class::String for direct operation on sequence if it is a String object
-# String.rc
-#   # reverse complement
-#   # example
-#   "ACAGA".rc
-#   => "TCTGT"
-#
-# String.mutation(error_rate)
-#   # mutate a nt sequence (String class) randomly
-#   # must define error rate, default value 0.01, aka 1%
-# =USAGE
-#   # example
-#   seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
-#   seq.mutation(0.05)
-#   => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
-#
-# String.nt_parser
-#   # parse the nucleotide sequences as a String object and return a Regexp object for possible matches
-# =USAGE
-#   "ATRWCG".nt_parser
-#   => /AT[A|G][A|T]CG/
-class String
-    # direct function of calling reverse complement on String class
-  def rc
-      self.reverse.tr("ACTG","TGAC")
-  end
-  def mutation(error_rate = 0.01)
-    new_string = ""
-    self.split("").each do |nt|
-      pool = ["A","C","T","G"]
-      pool.delete(nt)
-      s = error_rate * 10000
-      r = rand(10000)
-      if r < s
-        nt = pool.sample
+    end # end of locator
+    # Clip the target sequence to the sub-sequence that aligns to positions p1..p2 (inclusive) of the reference genome.
+    # @param p1 [Integer] start position on the reference genome
+    # @param p2 [Integer] end position on the reference genome
+    # @param ref_option [Symbol] name of the reference genome, options are `:HXB2`, `:NL43`, `:MAC239`
+    # @param path_to_muscle [String, false] path to the muscle executable; if not provided, use MuscleBio to run Muscle
+    # @return [ViralSeq::Sequence, nil] a new ViralSeq::Sequence object covering the requested range on the reference genome,
+    #   or nil if either the start or end position is beyond the range of the target sequence.
+    # @example trim a sequence to fit in the range of [2333, 2433] on the HXB2 nt reference
+    #   seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
+    #   s = ViralSeq::Sequence.new('my_seq', seq)
+    #   s.sequence_clip(2333, 2433, :HXB2).dna
+    #   => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
+    def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
+      loc = self.locator(ref_option, path_to_muscle) # NOTE(review): locator's rescue branch returns nil, in which case loc[0] below raises NoMethodError - confirm callers
+      l1 = loc[0] # first reference position covered by the target (loc_p1 in locator's return value)
+      l2 = loc[1] # last reference position covered by the target (loc_p2 in locator's return value)
+      if (p1 >= l1) & (p2 <= l2) # non-short-circuit &; NOTE(review): locator's all-zero failure array passes this test with the default p1 = p2 = 0, then 0.each_char raises
+          seq = loc[4] # aln_test from locator: presumably the target as aligned to the reference, may contain "-" gaps
+          ref = loc[5] # reference row of the same alignment, may contain "-" gaps
+          g1 = 0 # number of alignment columns to drop on the left
+          ref.each_char do |char|
+              break if l1 == p1 # stop once the running reference coordinate reaches the requested start
+              g1 += 1
+              l1 += 1 unless char == "-" # a gap in the reference occupies a column but no reference position
+          end
+          g2 = 1 # used as a negative end index: -1 keeps the final column, each increment drops one more from the right
+          ref.reverse.each_char do |char|
+              break if l2 == p2 # stop once the running reference coordinate reaches the requested end
+              g2 += 1
+              l2 -= 1 unless char == "-"
+          end
+          return ViralSeq::Sequence.new(self.name,seq[g1..(-g2)].tr("-","")) # slice the aligned target, then strip alignment gaps
+      else
+          return nil # requested range is not fully inside the located region
       end
-      new_string << nt
     end
-    return new_string
-  end
-  def nt_parser
-    match = ""
-    self.each_char.each do |base|
-      base_array = ViralSeq.to_list(base)
-      if base_array.size == 1
-        match += base_array[0]
+    # start of private functions
+    private
+    # Translate one codon (3 bases) into its one-letter amino acid code; "*" is returned for TAA/TAG/TGA and "#" when no pattern matches (unresolvable ambiguity).
+    def amino_acid (bases) # @param bases [String] codon, may contain ambiguity codes; @return [String] a single character
+      case bases # branches are tried top to bottom, so earlier patterns take precedence
+      when /^TT[TCY]$/ # Y stands for C or T, so TTY still resolves to one amino acid
+        return "F"
+      when /^TT[AGR]$/ # R stands for A or G
+        return "L"
+      when /^CT.$/ # "." accepts any third character, including ambiguity codes
+        return "L"
+      when /^AT[TCAHYWM]$/ # H, Y, W and M are ambiguity codes that exclude G, so ATG is never matched here
+        return "I"
+      when "ATG"
+        return "M"
+      when /^GT.$/
+        return "V"
+      when /^TC.$/
+        return "S"
+      when /^CC.$/
+        return "P"
+      when /^AC.$/
+        return "T"
+      when /^GC.$/
+        return "A"
+      when /^TA[TCY]$/
+        return "Y"
+      when /^TA[AGR]$/ # TAA / TAG
+        return "*"
+      when /^T[GR]A$/ # TGA, or TRA which covers TAA and TGA
+        return "*"
+      when /^CA[TCY]$/
+        return "H"
+      when /^CA[AGR]$/
+        return "Q"
+      when /^AA[TCY]$/
+        return "N"
+      when /^AA[AGR]$/
+        return "K"
+      when /^GA[TCY]$/
+        return "D"
+      when /^GA[AGR]$/
+        return "E"
+      when /^TG[TCY]$/
+        return "C"
+      when "TGG"
+        return "W"
+      when /^CG.$/
+        return "R"
+      when /^AG[TCY]$/
+        return "S"
+      when /^[AM]G[AGR]$/ # AGA/AGG, plus MGR (M = A or C) because CGA/CGG are matched as "R" above
+        return "R"
+      when /^GG.$/
+        return "G"
+      when /^[ATW][CGS][CTY]$/ # NOTE(review): also matches ambiguous codons that are not only serine, e.g. "ASC" covers ACC ("T" above) and AGC ("S") - confirm intent
+        return "S"
+      when /^[TCY]T[AGR]$/ # YTR: both TTR and CTR are matched as "L" above
+        return "L"
       else
-        pattern = "[" + base_array.join("|") + "]"
-        match += pattern
+        return "#"
+      end
+    end # end of amino_acid
+    # Translate a codon while keeping ambiguities: expand it into every concrete codon and return the distinct amino acids they encode as one String.
+    def amino_acid_2 (bases) # @param bases [String] 3-base codon; @return [String] unique one-letter codes joined in first-seen order
+      bases_to_aa = [] # every codon produced by expanding the three positions
+      aa_list = [] # one amino acid letter per expanded codon, duplicates removed at the end
+      base1 = bases[0].to_list # to_list is defined outside this view; presumably expands an ambiguity code into its concrete bases - TODO confirm
+      base2 = bases[1].to_list
+      base3 = bases[2].to_list
+      l1 = base1.size - 1 # last valid index of each expansion list
+      l2 = base2.size - 1
+      l3 = base3.size - 1
+      (0..l1).each do |n1| # three nested index loops build the cartesian product base1 x base2 x base3
+        b1 = base1[n1]
+        (0..l2).each do |n2|
+          b2 = base2[n2]
+          (0..l3).each do |n3|
+            b3 = base3[n3]
+            bases_all = b1 + b2 + b3
+            bases_to_aa << bases_all
+          end
+        end
+      end
+      bases_to_aa.each do |base| # same pattern table as #amino_acid, except that the fallback is "-" instead of "#"
+      case base
+      when /^TT[TCY]$/
+        aa =  "F"
+      when /^TT[AGR]$/
+        aa =  "L"
+      when /^CT.$/
+        aa =  "L"
+      when /^AT[TCAHYWM]$/
+        aa =  "I"
+      when "ATG"
+        aa =  "M"
+      when /^GT.$/
+        aa =  "V"
+      when /^TC.$/
+        aa =  "S"
+      when /^CC.$/
+        aa =  "P"
+      when /^AC.$/
+        aa =  "T"
+      when /^GC.$/
+        aa =  "A"
+      when /^TA[TCY]$/
+        aa =  "Y"
+      when /^TA[AGR]$/
+        aa =  "*"
+      when /^T[GR]A$/
+        aa =  "*"
+      when /^CA[TCY]$/
+        aa =  "H"
+      when /^CA[AGR]$/
+        aa =  "Q"
+      when /^AA[TCY]$/
+        aa =  "N"
+      when /^AA[AGR]$/
+        aa =  "K"
+      when /^GA[TCY]$/
+        aa =  "D"
+      when /^GA[AGR]$/
+        aa =  "E"
+      when /^TG[TCY]$/
+        aa =  "C"
+      when "TGG"
+        aa =  "W"
+      when /^CG.$/
+        aa =  "R"
+      when /^AG[TCY]$/
+        aa =  "S"
+      when /^[AM]G[AGR]$/
+        aa =  "R"
+      when /^GG.$/
+        aa =  "G"
+      when /^[ATW][CGS][CTY]$/
+        aa =  "S"
+      when /^[TCY]T[AGR]$/
+        aa =  "L"
+      else
+        aa =  "-" # no pattern matched this expanded codon
+      end
+      aa_list << aa
+    end
+      aa_out = aa_list.uniq.join # collapse repeated amino acids, then concatenate without a separator
+      return aa_out
+    end # end of #amino_acid_2
+    # Build the sdrm position table for one option: a Hash of Integer position => [String, Array<String>] holding one-letter amino acid codes.
+    def sdrm_hash(options) # @param options [Symbol] :hcv_ns5a, :nrti, :nnrti, :hiv_pr or :hiv_in; @return [Hash]; any other value raises
+      sdrm = {}
+      case options
+      when :hcv_ns5a
+        sdrm[28] = ['M',['T']] # NOTE(review): presumably [wild-type residue, [resistance-associated substitutions]] - confirm against the caller
+        sdrm[30] = ['L',['H','K','R','Q','A','S','D']]
+        sdrm[31] = ['L',['M','V','F']]
+        sdrm[32] = ['P',['L']]
+        sdrm[44] = ['K',['R']]
+        sdrm[58] = ['H',['D','P','S']]
+        sdrm[64] = ['T',['A','S']]
+        sdrm[77] = ['P',['A','S']]
+        sdrm[78] = ['R',['K']]
+        sdrm[79] = ['T',['A']]
+        sdrm[83] = ['T',['M']]
+        sdrm[85] = ['S',['N','H','Y']]
+        sdrm[92] = ['A',['P','T','K','E']]
+        sdrm[93] = ['Y',['C','F','H','N']]
+        sdrm[107] = ['K',['T','S']]
+        sdrm[121] = ['I',['V']]
+        sdrm[135] = ['T',['A']]
+      when :nrti
+        sdrm[41] = ['M',['L']]
+        sdrm[65] = ['K',['R']]
+        sdrm[67] = ['D',['N','G','E']]
+        sdrm[69] = ['T',['D']]
+        sdrm[70] = ['K',['R','E']]
+        sdrm[74] = ['L',['V','I']]
+        sdrm[75] = ['V',['M','T','A','S']]
+        sdrm[77] = ['F',['L']]
+        sdrm[115] = ['Y',['F']]
+        sdrm[116] = ['F',['Y']]
+        sdrm[151] = ['Q',['M']]
+        sdrm[184] = ['M',['V','I']]
+        sdrm[210] = ['L',['W']]
+        sdrm[215] = ["T",["Y","F","I","C","D","V","E"]]
+        sdrm[219] = ["K",["Q","E","N","R"]]
+      when :nnrti
+        sdrm[100] = ['L',['I']]
+        sdrm[101] = ['K',['E','P']]
+        sdrm[103] = ['K',['N','S']]
+        sdrm[106] = ['V',['M','A']]
+        sdrm[179] = ['V',['F','D']]
+        sdrm[181] = ['Y',['C','I','V']]
+        sdrm[188] = ['Y',['L','H','C']]
+        sdrm[190] = ['G',['A','S','E']]
+        sdrm[225] = ['P',['H']]
+        sdrm[230] = ['M',['L']]
+      when :hiv_pr
+        sdrm[23] = ['L',['I']]
+        sdrm[24] = ['L',['I']]
+        sdrm[30] = ['D',['N']]
+        sdrm[32] = ['V',['I']]
+        sdrm[46] = ['M',['I','L']]
+        sdrm[47] = ['I',['V','A']]
+        sdrm[48] = ['G',['V','M']]
+        sdrm[50] = ['I',['V','L']]
+        sdrm[53] = ['F',['L']]
+        sdrm[54] = ['I',['V','L','M','T','A','S']]
+        sdrm[73] = ['G',['S','T','C','A']]
+        sdrm[76] = ['L',['V']]
+        sdrm[82] = ['V',['A','T','S','F','L','C','M']]
+        sdrm[83] = ['N',['D']]
+        sdrm[84] = ['I',['V','A','C']]
+        sdrm[88] = ['N',['D','S']]
+        sdrm[90] = ['L',['M']]
+      when :hiv_in
+        sdrm[66] = ['T',['A','I','K']]
+        sdrm[74] = ['L',['M']]
+        sdrm[92] = ['E',['Q']]
+        sdrm[95] = ['Q',['K']]
+        sdrm[97] = ['T',['A']]
+        sdrm[121] = ['F',['Y']]
+        sdrm[140] = ['G',['A','S','C']]
+        sdrm[143] = ["Y",["C","H","R"]]
+        sdrm[147] = ['S',['G']]
+        sdrm[148] = ['Q',['H','K','R']]
+        sdrm[155] = ['N',['S','H']]
+      else raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported" # raise with only a message produces a RuntimeError
       end
+      return sdrm
     end
-    Regexp.new match
-  end
-end
+  end # end of ViralSeq::Sequence
+end # end of ViralSeq