viral_seq 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/Gemfile.lock +1 -1
 - data/README.md +7 -1
 - data/lib/viral_seq/Integer.rb +16 -0
 - data/lib/viral_seq/constant.rb +7 -0
 - data/lib/viral_seq/enumerable.rb +132 -0
 - data/lib/viral_seq/hash.rb +45 -0
 - data/lib/viral_seq/hivdr.rb +454 -0
 - data/lib/viral_seq/math.rb +128 -380
 - data/lib/viral_seq/muscle.rb +60 -82
 - data/lib/viral_seq/pid.rb +26 -0
 - data/lib/viral_seq/ref_seq.rb +35 -0
 - data/lib/viral_seq/rubystats.rb +172 -0
 - data/lib/viral_seq/seq_hash.rb +1043 -0
 - data/lib/viral_seq/seq_hash_pair.rb +219 -0
 - data/lib/viral_seq/sequence.rb +571 -348
 - data/lib/viral_seq/string.rb +119 -0
 - data/lib/viral_seq/version.rb +1 -1
 - data/lib/viral_seq.rb +14 -15
 - metadata +13 -12
 - data/lib/viral_seq/a3g.rb +0 -172
 - data/lib/viral_seq/fasta.rb +0 -154
 - data/lib/viral_seq/hcv_dr.rb +0 -54
 - data/lib/viral_seq/locator.rb +0 -299
 - data/lib/viral_seq/misc.rb +0 -103
 - data/lib/viral_seq/nt_variation.rb +0 -148
 - data/lib/viral_seq/poisson_cutoff.rb +0 -68
 - data/lib/viral_seq/refseq.rb +0 -45
 - data/lib/viral_seq/sdrm_core.rb +0 -652
 - data/lib/viral_seq/tcs_core.rb +0 -556
 
    
        data/lib/viral_seq/hcv_dr.rb
    DELETED
    
    | 
         @@ -1,54 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            # viral_seq/hcv_dr
         
     | 
| 
       2 
     | 
    
         
            -
            # HCV resistant mutation interpretation
         
     | 
| 
       3 
     | 
    
         
            -
            # ViralSeq::hcv_ns5a
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
            # ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
         
     | 
| 
       6 
     | 
    
         
            -
            #   # amino_acid_sequence_array is Array object of the amino acid sequence.
         
     | 
| 
       7 
     | 
    
         
            -
            #   # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
         
     | 
| 
       8 
     | 
    
         
            -
            #   # start_aa_position is the starting aa number of the input sequence as Integer
         
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
            module ViralSeq
              # Report HCV NS5A resistance-associated substitutions found in an
              # amino acid sequence.
              #
              # @param aa_array [Array<String>] amino acid calls, one element per
              #   residue. A one-character element is treated as an unambiguous call;
              #   any other element is split on "/" and treated as a set of candidate
              #   residues (e.g. "M/T").
              #   NOTE(review): presumably produced by ViralSeq::Sequence#aa_array -- confirm.
              # @param start_aa [Integer] NS5A position number of aa_array[0] (default 1).
              # @return [Hash{Integer => Array(String, String)}] position =>
              #   [wild_type_residue, observed_call] for every surveyed position where
              #   the observed call matches a listed mutation, in ascending position order.
              def self.hcv_ns5a(aa_array, start_aa = 1)
                # position => [wild-type residue, listed mutant residues]
                sdrm = {
                  28  => ['M', ['T']],
                  30  => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
                  31  => ['L', ['M', 'V', 'F']],
                  32  => ['P', ['L']],
                  44  => ['K', ['R']],
                  58  => ['H', ['D', 'P', 'S']],
                  64  => ['T', ['A', 'S']],
                  77  => ['P', ['A', 'S']],
                  78  => ['R', ['K']],
                  79  => ['T', ['A']],
                  83  => ['T', ['M']],
                  85  => ['S', ['N', 'H', 'Y']],
                  92  => ['A', ['P', 'T', 'K', 'E']],
                  93  => ['Y', ['C', 'F', 'H', 'N']],
                  107 => ['K', ['T', 'S']],
                  121 => ['I', ['V']],
                  135 => ['T', ['A']]
                }

                out_hash = {}
                aa_array.each_with_index do |test_aa, index|
                  position = start_aa + index
                  next unless sdrm.key?(position)

                  wt_aa, mutations = sdrm[position]
                  if test_aa.size == 1
                    # No wild-type residue appears in its own mutation list, so a
                    # membership test alone also excludes the wild-type call.
                    out_hash[position] = [wt_aa, test_aa] if mutations.include?(test_aa)
                  elsif !(test_aa.split("/") & mutations).empty?
                    # Ambiguous call: report only when at least one candidate residue
                    # is a listed mutation. An empty intersection is still a truthy
                    # Array in Ruby, so it must be checked with empty?.
                    out_hash[position] = [wt_aa, test_aa]
                  end
                end
                out_hash
              end
            end
         
     | 
    
        data/lib/viral_seq/locator.rb
    DELETED
    
    | 
         @@ -1,299 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            # viral_seq/locator.rb
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
            # Including following methods:
         
     | 
| 
       4 
     | 
    
         
            -
            #   ViralSeq::sequence_locator
         
     | 
| 
       5 
     | 
    
         
            -
            #   ViralSeq::sequence_clip
         
     | 
| 
       6 
     | 
    
         
            -
            #   ViralSeq::qc_hiv_seq_check
         
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
            #   HIV sequence locator function
         
     | 
| 
       9 
     | 
    
         
            -
            #   resembling HIV Sequence Locator from LANL
         
     | 
| 
       10 
     | 
    
         
            -
            #   https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
         
     | 
| 
       11 
     | 
    
         
            -
            #   require MUSCLE (http://www.drive5.com/muscle) installed
         
     | 
| 
       12 
     | 
    
         
            -
            #   current version only supports nucleotide sequence, not for amino acid sequence.
         
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
            # =USAGE1
         
     | 
| 
       15 
     | 
    
         
            -
            #   # Find the location of a sequence
         
     | 
| 
       16 
     | 
    
         
            -
            #   ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
         
     | 
| 
       17 
     | 
    
         
            -
            #   # input_sequence: String of nucleotide sequence
         
     | 
| 
       18 
     | 
    
         
            -
            #   # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
         
     | 
| 
       19 
     | 
    
         
            -
            #   # path_to_muscle: path to the muscle executable.
         
     | 
| 
       20 
     | 
    
         
            -
            #   # Default as :false, will call MuscleBio to run Muscle
         
     | 
| 
       21 
     | 
    
         
            -
            #   # specify path_to_muscle if other source of muscle needed
         
     | 
| 
       22 
     | 
    
         
            -
            #   # function returns an array of
         
     | 
| 
       23 
     | 
    
         
            -
            #   #   start_location (Integer)
         
     | 
| 
       24 
     | 
    
         
            -
            #   #   end_location (Integer)
         
     | 
| 
       25 
     | 
    
         
            -
            #   #   percentage_of_similarity_to_reference_sequence (Float)
         
     | 
| 
       26 
     | 
    
         
            -
            #   #   containing_indel? (Boolean)
         
     | 
| 
       27 
     | 
    
         
            -
            #   #   aligned_input_sequence (String)
         
     | 
| 
       28 
     | 
    
         
            -
            #   #   aligned_reference_sequence (String)
         
     | 
| 
       29 
     | 
    
         
            -
            #   # example code
         
     | 
| 
       30 
     | 
    
         
            -
            #   sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
         
     | 
| 
       31 
     | 
    
         
            -
            #   p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
         
     | 
| 
       32 
     | 
    
         
            -
            #   => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
         
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
            # =USAGE2
         
     | 
| 
       35 
     | 
    
         
            -
            #   ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
         
     | 
| 
       36 
     | 
    
         
            -
            #   # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
         
     | 
| 
       37 
     | 
    
         
            -
            #   # return nil if the input sequence is not in the range
         
     | 
| 
       38 
     | 
    
         
            -
            #   # input_sequence: String of nucleotide sequence
         
     | 
| 
       39 
     | 
    
         
            -
            #   # start_position and end_position: Integer of the start and end reference number of the sub-sequence
         
     | 
| 
       40 
     | 
    
         
            -
            #   # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
         
     | 
| 
       41 
     | 
    
         
            -
            #   # path_to_muscle: path to the muscle executable.
         
     | 
| 
       42 
     | 
    
         
            -
            #   # Default as :false, will call MuscleBio to run Muscle
         
     | 
| 
       43 
     | 
    
         
            -
            #   # specify path_to_muscle if other source of muscle needed
         
     | 
| 
       44 
     | 
    
         
            -
            #   # example code
         
     | 
| 
       45 
     | 
    
         
            -
            #   seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
         
     | 
| 
       46 
     | 
    
         
            -
            #   p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
         
     | 
| 
       47 
     | 
    
         
            -
            #   => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
         
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
       49 
     | 
    
         
            -
            # =USAGE3
         
     | 
| 
       50 
     | 
    
         
            -
            #   ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
         
     | 
| 
       51 
     | 
    
         
            -
            #   # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
         
     | 
| 
       52 
     | 
    
         
            -
            #   # and a boolean value for allowing indels,
         
     | 
| 
       53 
     | 
    
         
            -
            #   # path_to_muscle: path to the muscle executable.
         
     | 
| 
       54 
     | 
    
         
            -
            #   # Default as :false, will call MuscleBio to run Muscle
         
     | 
| 
       55 
     | 
    
         
            -
            #   # specify path_to_muscle if other source of muscle needed
         
     | 
| 
       56 
     | 
    
         
            -
            #   # return a sequence sub-hash that meets the criteria
         
     | 
| 
       57 
     | 
    
         
            -
            #   # example code
         
     | 
| 
       58 
     | 
    
         
            -
            #   sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
         
     | 
| 
       59 
     | 
    
         
            -
            #   filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
         
     | 
| 
       60 
     | 
    
         
            -
            #   puts sequence_hash.size
         
     | 
| 
       61 
     | 
    
         
            -
            #   => 6
         
     | 
| 
       62 
     | 
    
         
            -
            #   puts filtered_sequence_hash.size
         
     | 
| 
       63 
     | 
    
         
            -
            #   => 4
         
     | 
| 
       64 
     | 
    
         
            -
             
     | 
| 
       65 
     | 
    
         
            -
            module ViralSeq
         
     | 
| 
       66 
     | 
    
         
            -
             
     | 
| 
       67 
     | 
    
         
            -
              def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)
         
     | 
| 
       68 
     | 
    
         
            -
             
     | 
| 
       69 
     | 
    
         
            -
                # ViralSeq.check_muscle(path_to_muscle)
         
     | 
| 
       70 
     | 
    
         
            -
                ori_ref = ViralSeq.check_ref(ref_option)
         
     | 
| 
       71 
     | 
    
         
            -
             
     | 
| 
       72 
     | 
    
         
            -
                begin
         
     | 
| 
       73 
     | 
    
         
            -
                  ori_ref_l = ori_ref.size
         
     | 
| 
       74 
     | 
    
         
            -
                  l1 = 0
         
     | 
| 
       75 
     | 
    
         
            -
                  l2 = 0
         
     | 
| 
       76 
     | 
    
         
            -
             
     | 
| 
       77 
     | 
    
         
            -
                  aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
         
     | 
| 
       78 
     | 
    
         
            -
                  aln_test = aln_seq[1]
         
     | 
| 
       79 
     | 
    
         
            -
                  aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
         
     | 
| 
       80 
     | 
    
         
            -
                  gap_begin = $1.size
         
     | 
| 
       81 
     | 
    
         
            -
                  gap_end = $3.size
         
     | 
| 
       82 
     | 
    
         
            -
                  aln_test2 = $2
         
     | 
| 
       83 
     | 
    
         
            -
                  ref = aln_seq[0]
         
     | 
| 
       84 
     | 
    
         
            -
                  ref = ref[gap_begin..(-gap_end-1)]
         
     | 
| 
       85 
     | 
    
         
            -
                  ref_size = ref.size
         
     | 
| 
       86 
     | 
    
         
            -
                  if ref_size > 1.3*(seq.size)
         
     | 
| 
       87 
     | 
    
         
            -
                    l1 = l1 + gap_begin
         
     | 
| 
       88 
     | 
    
         
            -
                    l2 = l2 + gap_end
         
     | 
| 
       89 
     | 
    
         
            -
                    max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
         
     | 
| 
       90 
     | 
    
         
            -
                    aln_test2 =~ /#{max_seq}/
         
     | 
| 
       91 
     | 
    
         
            -
                    before_aln_seq = $`
         
     | 
| 
       92 
     | 
    
         
            -
                    before_aln = $`.size
         
     | 
| 
       93 
     | 
    
         
            -
                    post_aln_seq = $'
         
     | 
| 
       94 
     | 
    
         
            -
                    post_aln = $'.size
         
     | 
| 
       95 
     | 
    
         
            -
                    before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
         
     | 
| 
       96 
     | 
    
         
            -
                    b1 = (1.3 * before_aln_seq_size).to_i
         
     | 
| 
       97 
     | 
    
         
            -
                    post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
         
     | 
| 
       98 
     | 
    
         
            -
                    b2 = (1.3 * post_aln_seq_size).to_i
         
     | 
| 
       99 
     | 
    
         
            -
                    if (before_aln > seq.size) and (post_aln <= seq.size)
         
     | 
| 
       100 
     | 
    
         
            -
                      ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
         
     | 
| 
       101 
     | 
    
         
            -
                      l1 = l1 + (before_aln - b1)
         
     | 
| 
       102 
     | 
    
         
            -
                    elsif (post_aln > seq.size) and (before_aln <= seq.size)
         
     | 
| 
       103 
     | 
    
         
            -
                      ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
         
     | 
| 
       104 
     | 
    
         
            -
                      l2 = l2 + post_aln - b2
         
     | 
| 
       105 
     | 
    
         
            -
                    elsif (post_aln > seq.size) and (before_aln > seq.size)
         
     | 
| 
       106 
     | 
    
         
            -
                      ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
         
     | 
| 
       107 
     | 
    
         
            -
                      l1 = l1 + (before_aln - b1)
         
     | 
| 
       108 
     | 
    
         
            -
                      l2 = l2 + (post_aln - b2)
         
     | 
| 
       109 
     | 
    
         
            -
                    end
         
     | 
| 
       110 
     | 
    
         
            -
             
     | 
| 
       111 
     | 
    
         
            -
                    aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
         
     | 
| 
       112 
     | 
    
         
            -
                    aln_test = aln_seq[1]
         
     | 
| 
       113 
     | 
    
         
            -
                    aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
         
     | 
| 
       114 
     | 
    
         
            -
                    gap_begin = $1.size
         
     | 
| 
       115 
     | 
    
         
            -
                    gap_end = $3.size
         
     | 
| 
       116 
     | 
    
         
            -
                    ref = aln_seq[0]
         
     | 
| 
       117 
     | 
    
         
            -
                    ref = ref[gap_begin..(-gap_end-1)]
         
     | 
| 
       118 
     | 
    
         
            -
                  end
         
     | 
| 
       119 
     | 
    
         
            -
             
     | 
| 
       120 
     | 
    
         
            -
                  aln_test = aln_seq[1]
         
     | 
| 
       121 
     | 
    
         
            -
                  aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
         
     | 
| 
       122 
     | 
    
         
            -
                  gap_begin = $1.size
         
     | 
| 
       123 
     | 
    
         
            -
                  gap_end = $3.size
         
     | 
| 
       124 
     | 
    
         
            -
                  aln_test = $2
         
     | 
| 
       125 
     | 
    
         
            -
                  aln_test =~ /^(\w+)(\-*)\w/
         
     | 
| 
       126 
     | 
    
         
            -
                  s1 = $1.size
         
     | 
| 
       127 
     | 
    
         
            -
                  g1 = $2.size
         
     | 
| 
       128 
     | 
    
         
            -
                  aln_test =~ /\w(\-*)(\w+)$/
         
     | 
| 
       129 
     | 
    
         
            -
                  s2 = $2.size
         
     | 
| 
       130 
     | 
    
         
            -
                  g2 = $1.size
         
     | 
| 
       131 
     | 
    
         
            -
             
     | 
| 
       132 
     | 
    
         
            -
                  l1 = l1 + gap_begin
         
     | 
| 
       133 
     | 
    
         
            -
                  l2 = l2 + gap_end
         
     | 
| 
       134 
     | 
    
         
            -
                  repeat = 0
         
     | 
| 
       135 
     | 
    
         
            -
             
     | 
| 
       136 
     | 
    
         
            -
                  if g1 == g2 and (s1 + g1 + s2) == ref.size
         
     | 
| 
       137 
     | 
    
         
            -
                    if s1 > s2 and g2 > 2*s2
         
     | 
| 
       138 
     | 
    
         
            -
                      ref = ref[0..(-g2-1)]
         
     | 
| 
       139 
     | 
    
         
            -
                      repeat = 1
         
     | 
| 
       140 
     | 
    
         
            -
                      l2 = l2 + g2
         
     | 
| 
       141 
     | 
    
         
            -
                    elsif s1 < s2 and g1 > 2*s1
         
     | 
| 
       142 
     | 
    
         
            -
                      ref = ref[g1..-1]
         
     | 
| 
       143 
     | 
    
         
            -
                      repeat = 1
         
     | 
| 
       144 
     | 
    
         
            -
                      l1 = l1 + g1
         
     | 
| 
       145 
     | 
    
         
            -
                    end
         
     | 
| 
       146 
     | 
    
         
            -
                  else
         
     | 
| 
       147 
     | 
    
         
            -
                    if g1 > 2*s1
         
     | 
| 
       148 
     | 
    
         
            -
                      ref = ref[g1..-1]
         
     | 
| 
       149 
     | 
    
         
            -
                      repeat = 1
         
     | 
| 
       150 
     | 
    
         
            -
                      l1 = l1 + g1
         
     | 
| 
       151 
     | 
    
         
            -
                    end
         
     | 
| 
       152 
     | 
    
         
            -
                    if g2 > 2*s2
         
     | 
| 
       153 
     | 
    
         
            -
                      ref = ref[0..(-g2 - 1)]
         
     | 
| 
       154 
     | 
    
         
            -
                      repeat = 1
         
     | 
| 
       155 
     | 
    
         
            -
                      l2 = l2 + g2
         
     | 
| 
       156 
     | 
    
         
            -
                    end
         
     | 
| 
       157 
     | 
    
         
            -
                  end
         
     | 
| 
       158 
     | 
    
         
            -
             
     | 
| 
       159 
     | 
    
         
            -
                  while repeat == 1
         
     | 
| 
       160 
     | 
    
         
            -
                    aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
         
     | 
| 
       161 
     | 
    
         
            -
                    aln_test = aln_seq[1]
         
     | 
| 
       162 
     | 
    
         
            -
                    aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
         
     | 
| 
       163 
     | 
    
         
            -
                    gap_begin = $1.size
         
     | 
| 
       164 
     | 
    
         
            -
                    gap_end = $3.size
         
     | 
| 
       165 
     | 
    
         
            -
                    aln_test = $2
         
     | 
| 
       166 
     | 
    
         
            -
                    aln_test =~ /^(\w+)(\-*)\w/
         
     | 
| 
       167 
     | 
    
         
            -
                    s1 = $1.size
         
     | 
| 
       168 
     | 
    
         
            -
                    g1 = $2.size
         
     | 
| 
       169 
     | 
    
         
            -
                    aln_test =~ /\w(\-*)(\w+)$/
         
     | 
| 
       170 
     | 
    
         
            -
                    s2 = $2.size
         
     | 
| 
       171 
     | 
    
         
            -
                    g2 = $1.size
         
     | 
| 
       172 
     | 
    
         
            -
                    ref = aln_seq[0]
         
     | 
| 
       173 
     | 
    
         
            -
                    ref = ref[gap_begin..(-gap_end-1)]
         
     | 
| 
       174 
     | 
    
         
            -
                    l1 = l1 + gap_begin
         
     | 
| 
       175 
     | 
    
         
            -
                    l2 = l2 + gap_end
         
     | 
| 
       176 
     | 
    
         
            -
                    repeat = 0
         
     | 
| 
       177 
     | 
    
         
            -
                    if g1 > 2*s1
         
     | 
| 
       178 
     | 
    
         
            -
                      ref = ref[g1..-1]
         
     | 
| 
       179 
     | 
    
         
            -
                      repeat = 1
         
     | 
| 
       180 
     | 
    
         
            -
                      l1 = l1 + g1
         
     | 
| 
       181 
     | 
    
         
            -
                    end
         
     | 
| 
       182 
     | 
    
         
            -
                    if g2 > 2*s2
         
     | 
| 
       183 
     | 
    
         
            -
                      ref = ref[0..(-g2 - 1)]
         
     | 
| 
       184 
     | 
    
         
            -
                      repeat = 1
         
     | 
| 
       185 
     | 
    
         
            -
                      l2 = l2 + g2
         
     | 
| 
       186 
     | 
    
         
            -
                    end
         
     | 
| 
       187 
     | 
    
         
            -
                  end
         
     | 
| 
       188 
     | 
    
         
            -
                  ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
         
     | 
| 
       189 
     | 
    
         
            -
             
     | 
| 
       190 
     | 
    
         
            -
             
     | 
| 
       191 
     | 
    
         
            -
                  aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
         
     | 
| 
       192 
     | 
    
         
            -
                  aln_test = aln_seq[1]
         
     | 
| 
       193 
     | 
    
         
            -
                  ref = aln_seq[0]
         
     | 
| 
       194 
     | 
    
         
            -
             
     | 
| 
       195 
     | 
    
         
            -
                  #refine alignment
         
     | 
| 
       196 
     | 
    
         
            -
             
     | 
| 
       197 
     | 
    
         
            -
                  if ref =~ /^(\-+)/
         
     | 
| 
       198 
     | 
    
         
            -
                    l1 = l1 - $1.size
         
     | 
| 
       199 
     | 
    
         
            -
                  elsif ref =~ /(\-+)$/
         
     | 
| 
       200 
     | 
    
         
            -
                    l2 = l2 + $1.size
         
     | 
| 
       201 
     | 
    
         
            -
                  end
         
     | 
| 
       202 
     | 
    
         
            -
             
     | 
| 
       203 
     | 
    
         
            -
                  if (ori_ref_l - l2 - 1) >= l1
         
     | 
| 
       204 
     | 
    
         
            -
                    ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
         
     | 
| 
       205 
     | 
    
         
            -
                    aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
         
     | 
| 
       206 
     | 
    
         
            -
                    aln_test = aln_seq[1]
         
     | 
| 
       207 
     | 
    
         
            -
                    ref = aln_seq[0]
         
     | 
| 
       208 
     | 
    
         
            -
             
     | 
| 
       209 
     | 
    
         
            -
                    ref_size = ref.size
         
     | 
| 
       210 
     | 
    
         
            -
                    sim_count = 0
         
     | 
| 
       211 
     | 
    
         
            -
                    (0..(ref_size-1)).each do |n|
         
     | 
| 
       212 
     | 
    
         
            -
                      ref_base = ref[n]
         
     | 
| 
       213 
     | 
    
         
            -
                      test_base = aln_test[n]
         
     | 
| 
       214 
     | 
    
         
            -
                      sim_count += 1 if ref_base == test_base
         
     | 
| 
       215 
     | 
    
         
            -
                    end
         
     | 
| 
       216 
     | 
    
         
            -
                    similarity = (sim_count/ref_size.to_f*100).round(1)
         
     | 
| 
       217 
     | 
    
         
            -
             
     | 
| 
       218 
     | 
    
         
            -
                    loc_p1 = l1 + 1
         
     | 
| 
       219 
     | 
    
         
            -
                    loc_p2 = ori_ref_l - l2
         
     | 
| 
       220 
     | 
    
         
            -
                    if seq.size != (loc_p2 - loc_p1 + 1)
         
     | 
| 
       221 
     | 
    
         
            -
                        indel = true
         
     | 
| 
       222 
     | 
    
         
            -
                    elsif aln_test.include?("-")
         
     | 
| 
       223 
     | 
    
         
            -
                        indel = true
         
     | 
| 
       224 
     | 
    
         
            -
                    else
         
     | 
| 
       225 
     | 
    
         
            -
                        indel = false
         
     | 
| 
       226 
     | 
    
         
            -
                    end
         
     | 
| 
       227 
     | 
    
         
            -
                    return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
         
     | 
| 
       228 
     | 
    
         
            -
                  else
         
     | 
| 
       229 
     | 
    
         
            -
                    return [0,0,0,0,0,0,0]
         
     | 
| 
       230 
     | 
    
         
            -
                  end
         
     | 
| 
       231 
     | 
    
         
            -
                rescue => e
         
     | 
| 
       232 
     | 
    
         
            -
                  puts "Unexpected error occured."
         
     | 
| 
       233 
     | 
    
         
            -
                  puts "Exception Class: #{ e.class.name }"
         
     | 
| 
       234 
     | 
    
         
            -
                  puts "Exception Message: #{ e.message }"
         
     | 
| 
       235 
     | 
    
         
            -
                  puts "Exception Backtrace: #{ e.backtrace[0] }"
         
     | 
| 
       236 
     | 
    
         
            -
                  puts "ViralSeq.sequence_locator returns nil"
         
     | 
| 
       237 
     | 
    
         
            -
                  return nil
         
     | 
| 
       238 
     | 
    
         
            -
                end
         
     | 
| 
       239 
     | 
    
         
            -
              end
         
     | 
| 
       240 
     | 
    
         
            -
             
     | 
| 
       241 
     | 
    
         
            -
              # sequence clip function
         
     | 
| 
       242 
     | 
    
         
            -
              # Clip a sequence to the reference positions p1..p2.
              #
              # The sequence is first located with ViralSeq.sequence_locator, whose
              # result is read as [start, end, similarity, indel, aligned_seq, aligned_ref].
              # The aligned reference is then walked from each end to translate the
              # requested reference positions into alignment columns, and the matching
              # slice of the aligned sequence is returned with gap characters removed.
              #
              # seq            - nucleotide sequence String to clip
              # p1, p2         - first and last reference positions to keep
              # ref_option     - reference identifier passed through to the locator
              # path_to_muscle - passed through to the locator
              #
              # Returns the clipped String, or nil when the requested window is not
              # covered by the located region or the sequence could not be located.
              def self.sequence_clip(seq='', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
                loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
                # The locator returns nil after rescuing an error; treat that as "no clip".
                return nil if loc.nil?

                l1 = loc[0]
                l2 = loc[1]
                return nil unless (p1 >= l1) && (p2 <= l2)

                aln_seq = loc[4]
                aln_ref = loc[5]
                # A failed location is reported as an all-zero array, so the alignment
                # slots hold Integers rather than Strings in that case.
                return nil unless aln_seq.is_a?(String) && aln_ref.is_a?(String)

                # Count alignment columns to skip on the left until the reference
                # position reaches p1 (gap columns do not advance the position).
                g1 = 0
                aln_ref.each_char do |char|
                  break if l1 == p1
                  g1 += 1
                  l1 += 1 unless char == "-"
                end

                # Same from the right; g2 starts at 1 because it is used as a
                # negative index (-1 is the last column).
                g2 = 1
                aln_ref.reverse.each_char do |char|
                  break if l2 == p2
                  g2 += 1
                  l2 -= 1 unless char == "-"
                end

                aln_seq[g1..(-g2)].tr("-", "")
              end
         
     | 
| 
       266 
     | 
    
         
            -
             
     | 
| 
       267 
     | 
    
         
            -
              # batch quality check of HIV sequences based on ViralSeq.sequence_locator
         
     | 
| 
       268 
     | 
    
         
            -
              # input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
         
     | 
| 
       269 
     | 
    
         
            -
              # and allow the sequence to contain indels
         
     | 
| 
       270 
     | 
    
         
            -
              # return a hash of filtered sequences
         
     | 
| 
       271 
     | 
    
         
            -
             
     | 
| 
       272 
     | 
    
         
            -
              # Filter a hash of sequences by where they map on the reference.
              #
              # Each distinct sequence is located once with ViralSeq.sequence_locator;
              # it passes when its start position is in start_nt and its end position
              # is in end_nt, and (unless indel is true) the locator reports no indel.
              #
              # seq_hash       - Hash of { sequence_name => sequence_string }; not modified
              # start_nt       - Integer, Array or Range of accepted start positions
              # end_nt         - Integer, Array or Range of accepted end positions
              # indel          - when false, sequences flagged with indels are rejected
              # ref_option     - reference identifier passed through to the locator
              # path_to_muscle - passed through to the locator
              #
              # Returns a new Hash holding the entries of seq_hash that passed.
              # Sequences the locator could not process (nil result) are dropped.
              def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
                start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
                end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

                # Group names by sequence so every distinct sequence is aligned once
                # and its names are found without rescanning the whole hash.
                names_by_seq = {}
                seq_hash.each do |seq_name, seq|
                  (names_by_seq[seq] ||= []) << seq_name
                end

                seq_pass = {}
                names_by_seq.each do |seq, seq_names|
                  loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
                  next if loc.nil?
                  next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
                  next unless indel || loc[3] == false

                  # Read the value back from the input so the caller's own String
                  # objects are returned, not the frozen copies used as hash keys.
                  seq_names.each { |seq_name| seq_pass[seq_name] = seq_hash[seq_name] }
                end
                seq_pass
              end
         
     | 
| 
       298 
     | 
    
         
            -
             
     | 
| 
       299 
     | 
    
         
            -
            end
         
     | 
    
        data/lib/viral_seq/misc.rb
    DELETED
    
    | 
         @@ -1,103 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            # viral_seq/misc.rb
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
            # miscellaneous methods
         
     | 
| 
       4 
     | 
    
         
            -
            # including
         
     | 
| 
       5 
     | 
    
         
            -
            #   Hash#copyhash
         
     | 
| 
       6 
     | 
    
         
            -
            #   Hash#difference
         
     | 
| 
       7 
     | 
    
         
            -
            #   Hash#uniq_hash
         
     | 
| 
       8 
     | 
    
         
            -
            #   ViralSeq::tail
         
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
            class Hash

              # Hash#copyhash
              # Build a new Hash holding the same key/value pairs as the receiver.
              # Unlike plain assignment, the result is a different object, so adding
              # or removing keys on one does not affect the other. The keys and
              # values themselves are not duplicated.
              #   # example
              #   h1 = {1=>'a'}
              #   h2 = h1
              #   h3 = h1.copyhash
              #   h1.object_id == h2.object_id
              #   => true
              #   h1.object_id == h3.object_id
              #   => false

              def copyhash
                each_with_object({}) do |(key, value), copy|
                  copy[key] = value
                end
              end

              # Hash#difference
              # Return the entries of the receiver whose keys are absent from +other+.
              # Only keys are compared; values are ignored.
              #   # example
              #   h1 = {"Cat" => 100, "Dog" => 5, "Bird" => 2, "Snake" => 10}
              #   h2 = {"Cat" => 100, "Dog" => 5, "Bison" => 30}
              #   h1.difference(h2)
              #   => {"Bird" => 2, "Snake" => 10}

              def difference(other)
                reject { |key, _value| other.key?(key) }
              end

              # Hash#uniq_hash
              # Invert the receiver: each distinct value becomes a key that maps to
              # the Array of original keys holding that value, in iteration order.
              # Values are grouped by Hash key equality (eql?/hash) in a single pass.
              #   # example
              #   hash = {1=>"A", 2=>"A", 3=>"C", 4=>"C", 5=>"T"}
              #   p hash.uniq_hash
              #   => {"A"=>[1, 2], "C"=>[3, 4], "T"=>[5]}

              def uniq_hash
                each_with_object({}) do |(key, value), grouped|
                  (grouped[value] ||= []) << key
                end
              end
            end
         
     | 
| 
       69 
     | 
    
         
            -
             
     | 
| 
       70 
     | 
    
         
            -
            # Tail function for file as 'tail' in bash.
         
     | 
| 
       71 
     | 
    
         
            -
            # Return the end of a file, similar to `tail` in bash.
            #
            # The file is scanned backwards from its end in 512-byte chunks until
            # n + 1 newline characters have been seen; everything after that
            # newline is returned. If the file holds fewer newlines than that, the
            # whole file is returned. A newline at the very end of the file counts,
            # so a newline-terminated file yields its last n lines.
            #
            # path - path of the file to read
            # n    - number of newline characters to keep at the end
            #
            # Returns the trailing content as a String.
            def ViralSeq.tail(path, n)
              buffer_s = 512

              # Block form so the handle is closed even if reading raises.
              File.open(path, "r") do |file|
                line_count = 0
                file.seek(0, IO::SEEK_END)
                offset = file.pos # we start at the end

                while line_count <= n && offset > 0
                  # never read past the start of the file
                  to_read = [offset, buffer_s].min

                  file.seek(offset - to_read)
                  data = file.read(to_read)

                  data.reverse.each_char do |c|
                    if c == "\n"
                      line_count += 1
                      # Stop before stepping over the boundary newline so offset
                      # stays just after it, even when that newline is the first
                      # byte of the current chunk.
                      break if line_count > n
                    end
                    offset -= 1
                  end
                end

                file.seek(offset)
                file.read
              end
            end
         
     |