viral_seq 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
| @@ -0,0 +1,119 @@ | |
| 1 | 
            +
            # functions added to Class::String for direct operation on sequence as a String object
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            class String
              # Reverse complement of a nucleotide sequence.
              #
              # Only the upper-case bases A, C, G and T are complemented; any other
              # character (lower-case letters, ambiguity codes, gaps) is kept as-is
              # and only changes position through the reversal.
              # @return [String] reverse complement sequence
              # @example Reverse complement
              #   "ACAGA".rc
              #   => "TCTGT"
              def rc
                reverse.tr("ACTG", "TGAC")
              end

              # Mutate a nt sequence (String class) randomly.
              #
              # Every position is mutated independently with probability `error_rate`.
              # A mutated position is replaced by one of A, C, T, G other than the
              # character currently at that position. The receiver is not modified.
              # @param error_rate [Float] define an error rate for mutation, default to `0.01`
              # @return [String] mutated sequence as a new String
              # @example mutate a sequence at an error rate of 0.05
              #   seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
              #   seq.mutation(0.05)
              #   => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
              def mutation(error_rate = 0.01)
                each_char.map do |nt|
                  # Compare against a continuous random number so that error rates
                  # below 1/10000 are honoured instead of being rounded up.
                  next nt unless rand < error_rate
                  (%w[A C T G] - [nt]).sample
                end.join
              end

              # Parse the nucleotide sequence as a String object
              #   and return a Regexp object for possible matches.
              #
              # Each character is expanded with {#to_list}. Ambiguity codes become a
              # character class of the bases they stand for; characters with no
              # expansion (e.g. gaps) are escaped and matched literally.
              # @return [Regexp] as possible matches
              # @example parse a sequence with ambiguities
              #   "ATRWCG".nt_parser
              #   => /AT[AG][AT]CG/
              def nt_parser
                pattern = each_char.map do |base|
                  bases = base.to_list
                  case bases.size
                  when 0 then Regexp.escape(base)
                  when 1 then bases[0]
                  else "[#{bases.join}]" # no "|" separator: inside [] it would match a literal "|"
                  end
                end.join
                Regexp.new(pattern)
              end

              # Parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1.
              #
              # Matching is case-insensitive. A string containing any of A, T, C, G is
              # returned unchanged as the only element; a string that is neither a base
              # nor a known ambiguity code yields an empty Array.
              # NOTE(review): a multi-character receiver that contains A/T/C/G is also
              # returned as-is in a one-element Array; callers are expected to pass
              # single characters.
              # @return [Array] parsed nt bases
              # @example parse IUPAC `R`
              #   'R'.to_list
              #   => ["A", "G"]
              def to_list
                case upcase
                when /[ATCG]/ then [self]
                when "W" then %w[A T]
                when "S" then %w[C G]
                when "M" then %w[A C]
                when "K" then %w[G T] # keto: G or T (G/C is already covered by "S")
                when "R" then %w[A G]
                when "Y" then %w[C T]
                when "B" then %w[C G T]
                when "D" then %w[A G T]
                when "H" then %w[A C T]
                when "V" then %w[A C G]
                when "N" then %w[A T C G]
                else []
                end
              end

              # Compare two sequences as String objects; the two sequence strings need to be aligned first.
              #
              # The comparison runs position by position over the length of the receiver:
              # positions that `seq2` does not have count as differences, and characters
              # of `seq2` beyond the receiver's length are ignored.
              # @param seq2 [String] the sequence string to compare with
              # @return [Integer] the total number of differences as integer
              # @example compare two sequence strings, without alignment and with alignment
              #   seq1 = 'AAGGCGTAGGAC'
              #   seq2 = 'AAGCTTAGGACG'
              #   seq1.compare_with(seq2) # no alignment
              #   => 8
              #   aligned_seqs = ViralSeq::Muscle.align(seq1,seq2) # align using MUSCLE
              #   aligned_seqs[0].compare_with(aligned_seqs[1])
              #   => 4
              def compare_with(seq2)
                each_char.with_index.count { |nt, position| nt != seq2[position] }
              end
            end
         | 
    
        data/lib/viral_seq/version.rb
    CHANGED
    
    
    
        data/lib/viral_seq.rb
    CHANGED
    
    | @@ -18,24 +18,23 @@ | |
| 18 18 | 
             
            # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
         | 
| 19 19 | 
             
            # THE SOFTWARE.
         | 
| 20 20 |  | 
| 21 | 
            -
            # viral_seq main
         | 
| 22 21 | 
             
            module ViralSeq; end
         | 
| 23 22 |  | 
| 24 | 
            -
            # load all  | 
| 25 | 
            -
             | 
| 26 | 
            -
            require "viral_seq/ | 
| 27 | 
            -
            require "viral_seq/ | 
| 23 | 
            +
            # load all classes
         | 
| 24 | 
            +
            require "viral_seq/constant"
         | 
| 25 | 
            +
            require "viral_seq/enumerable"
         | 
| 26 | 
            +
            require "viral_seq/hash"
         | 
| 27 | 
            +
            require "viral_seq/hivdr"
         | 
| 28 | 
            +
            require "viral_seq/Integer" # the shipped file is lib/viral_seq/Integer.rb; a lower-case path fails on case-sensitive filesystems
         | 
| 28 29 | 
             
            require "viral_seq/math"
         | 
| 29 | 
            -
            require "viral_seq/fasta"
         | 
| 30 | 
            -
            require "viral_seq/misc"
         | 
| 31 | 
            -
            require "viral_seq/refseq"
         | 
| 32 | 
            -
            require "viral_seq/locator"
         | 
| 33 30 | 
             
            require "viral_seq/muscle"
         | 
| 34 | 
            -
            require "viral_seq/ | 
| 35 | 
            -
            require "viral_seq/ | 
| 36 | 
            -
            require "viral_seq/ | 
| 37 | 
            -
            require "viral_seq/ | 
| 38 | 
            -
            require "viral_seq/ | 
| 39 | 
            -
            require "viral_seq/ | 
| 31 | 
            +
            require "viral_seq/pid"
         | 
| 32 | 
            +
            require "viral_seq/ref_seq"
         | 
| 33 | 
            +
            require "viral_seq/rubystats"
         | 
| 34 | 
            +
            require "viral_seq/seq_hash"
         | 
| 35 | 
            +
            require "viral_seq/seq_hash_pair"
         | 
| 36 | 
            +
            require "viral_seq/sequence"
         | 
| 37 | 
            +
            require "viral_seq/string"
         | 
| 38 | 
            +
            require "viral_seq/version"
         | 
| 40 39 |  | 
| 41 40 | 
             
            require "muscle_bio"
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: viral_seq
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 1.0.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Shuntai Zhou
         | 
| @@ -9,7 +9,7 @@ authors: | |
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: exe
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date: 2019- | 
| 12 | 
            +
            date: 2019-07-09 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: bundler
         | 
| @@ -89,19 +89,20 @@ files: | |
| 89 89 | 
             
            - bin/console
         | 
| 90 90 | 
             
            - bin/setup
         | 
| 91 91 | 
             
            - lib/viral_seq.rb
         | 
| 92 | 
            -
            - lib/viral_seq/ | 
| 93 | 
            -
            - lib/viral_seq/ | 
| 94 | 
            -
            - lib/viral_seq/ | 
| 95 | 
            -
            - lib/viral_seq/ | 
| 92 | 
            +
            - lib/viral_seq/Integer.rb
         | 
| 93 | 
            +
            - lib/viral_seq/constant.rb
         | 
| 94 | 
            +
            - lib/viral_seq/enumerable.rb
         | 
| 95 | 
            +
            - lib/viral_seq/hash.rb
         | 
| 96 | 
            +
            - lib/viral_seq/hivdr.rb
         | 
| 96 97 | 
             
            - lib/viral_seq/math.rb
         | 
| 97 | 
            -
            - lib/viral_seq/misc.rb
         | 
| 98 98 | 
             
            - lib/viral_seq/muscle.rb
         | 
| 99 | 
            -
            - lib/viral_seq/ | 
| 100 | 
            -
            - lib/viral_seq/ | 
| 101 | 
            -
            - lib/viral_seq/ | 
| 102 | 
            -
            - lib/viral_seq/ | 
| 99 | 
            +
            - lib/viral_seq/pid.rb
         | 
| 100 | 
            +
            - lib/viral_seq/ref_seq.rb
         | 
| 101 | 
            +
            - lib/viral_seq/rubystats.rb
         | 
| 102 | 
            +
            - lib/viral_seq/seq_hash.rb
         | 
| 103 | 
            +
            - lib/viral_seq/seq_hash_pair.rb
         | 
| 103 104 | 
             
            - lib/viral_seq/sequence.rb
         | 
| 104 | 
            -
            - lib/viral_seq/ | 
| 105 | 
            +
            - lib/viral_seq/string.rb
         | 
| 105 106 | 
             
            - lib/viral_seq/version.rb
         | 
| 106 107 | 
             
            - viral_seq.gemspec
         | 
| 107 108 | 
             
            homepage: https://github.com/ViralSeq/viral_seq
         | 
    
        data/lib/viral_seq/a3g.rb
    DELETED
    
    | @@ -1,172 +0,0 @@ | |
| 1 | 
            -
            # viral_seq/a3g
         | 
| 2 | 
            -
            # APOBEC3g/f hypermutation function including
         | 
| 3 | 
            -
            # ViralSeq::a3g_hypermut_seq_hash
         | 
| 4 | 
            -
            # ViralSeq::apobec3gf
         | 
| 5 | 
            -
             | 
| 6 | 
            -
            # APOBEC3g/f G to A hypermutation
         | 
| 7 | 
            -
            # APOBEC3G/F pattern: GRD -> ARD
         | 
| 8 | 
            -
            # control pattern: G[YN|RC] -> A[YN|RC]
         | 
| 9 | 
            -
            # use the sample consensus to determine potential a3g sites
         | 
| 10 | 
            -
             | 
| 11 | 
            -
            # Two criteria to identify hypermutation
         | 
| 12 | 
            -
            # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
         | 
| 13 | 
            -
            # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
         | 
| 14 | 
            -
            # note:  criterion 2 only applies to a sequence file containing more than 20 sequences
         | 
| 15 | 
            -
            #        b/c Poisson model does not do well on small sample size.
         | 
| 16 | 
            -
             | 
| 17 | 
            -
            # ViralSeq.a3g_hypermut_seq_hash(sequence_hash)
         | 
| 18 | 
            -
            # sequence_hash is a Hash object for sequences. {:name => :sequence, ...}
         | 
| 19 | 
            -
            # return array [hypermutation_hash, statistic_info]
         | 
| 20 | 
            -
            # hypermutation_hash is a Hash object for sequences
         | 
| 21 | 
            -
            # statistic_info is a hash object of [sequence_name, stats],
         | 
| 22 | 
            -
            # in which stats String object in csv format (separated by ',') containing
         | 
| 23 | 
            -
            #   sequence tag
         | 
| 24 | 
            -
            #   G to A mutation numbers at potential a3g positions
         | 
| 25 | 
            -
            #   total potential a3g G positions
         | 
| 26 | 
            -
            #   G to A mutation numbers at non a3g positions
         | 
| 27 | 
            -
            #   total non a3g G positions
         | 
| 28 | 
            -
            #   a3g G to A mutation rate / non-a3g G to A mutation rate
         | 
| 29 | 
            -
            #   Fishers Exact P-value
         | 
| 30 | 
            -
            #
         | 
| 31 | 
            -
            # =USAGE
         | 
| 32 | 
            -
            #   # example 1
         | 
| 33 | 
            -
            #   sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence1.fasta')
         | 
| 34 | 
            -
            #   hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
         | 
| 35 | 
            -
            #   hypermut[0].keys
         | 
| 36 | 
            -
            #   => [">Seq7", ">Seq14"]
         | 
| 37 | 
            -
            #   stats = hypermut[1]
         | 
| 38 | 
            -
            #   stats.values
         | 
| 39 | 
            -
            #   => [">Seq7,23,68,1,54,18.26,4.308329383112348e-06", ">Seq14,45,68,9,54,3.97,5.2143571971582974e-08"]
         | 
| 40 | 
            -
            #
         | 
| 41 | 
            -
            #   # example 2
         | 
| 42 | 
            -
            #   sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence2.fasta')
         | 
| 43 | 
            -
            #   hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
         | 
| 44 | 
            -
            #   stats = hypermut[1]
         | 
| 45 | 
            -
            #   stats.values
         | 
| 46 | 
            -
            #   => [">CTAACACTCA_134_a3g-sample2,4,35,0,51,Infinity,0.02465676660128911", ">ATAGTGCCCA_60_a3g-sample2,4,35,1,51,5.83,0.1534487353839561"]
         | 
| 47 | 
            -
            #   # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05, but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
         | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
            # ViralSeq.apobec3gf(sequence)
         | 
| 51 | 
            -
            # APOBEC3G/F pattern: GRD -> ARD
         | 
| 52 | 
            -
            # control pattern: G[YN|RC] -> A[YN|RC]
         | 
| 53 | 
            -
            # input a sequence String object
         | 
| 54 | 
            -
            # return two arrays of position numbers:
         | 
| 55 | 
            -
            #   a3g G positions (a3g)
         | 
| 56 | 
            -
            #   non-a3g G positions (control)
         | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
            module ViralSeq
              # Identify APOBEC3G/F hypermutated sequences in a sequence hash.
              #
              # The consensus of all input sequences is used to locate potential A3G
              # positions (GRD) and control G positions. For every sequence the G to A
              # changes at both position sets are counted and compared with a Fisher's
              # exact test; sequences with a two-tailed P value below 0.05 are reported.
              # When more than 20 sequences are given, sequences whose A3G mutation count
              # exceeds a Poisson-derived cut-off are reported as well.
              #
              # @param seq_hash [Hash] sequences, {sequence_name => sequence_string, ...}
              # @return [Array] [hypermutation_seq_hash, statistic_info], where
              #   hypermutation_seq_hash is {name => sequence} for the reported sequences and
              #   statistic_info is {name => csv_string}; the csv fields are name, G to A
              #   mutations at A3G positions, total A3G positions, G to A mutations at
              #   control positions, total control positions, rate ratio and Fisher's P value
              def ViralSeq.a3g_hypermut_seq_hash(seq_hash)
                # mut_hash number of apobec3g/f mutations per sequence
                mut_hash = {}
                hm_hash = {}
                out_hash = {}

                # total G->A mutations at apobec3g/f positions.
                total = 0

                # make consensus sequence for the input sequence hash
                ref = ViralSeq.consensus(seq_hash.values)

                # obtain apobec3g positions and control positions
                # NOTE(review): apobec3gf drops "-" from the consensus before locating
                # positions, while the positions are applied to the (aligned) input
                # sequences below -- confirm the consensus never contains gaps.
                mut, control = ViralSeq.apobec3gf(ref)

                seq_hash.each do |k, v|
                  # gap positions ("-") are excluded from the site totals
                  a = mut.count { |n| v[n] == "A" }     # muts
                  b = mut.count { |n| v[n] != "-" }     # potential mut sites
                  c = control.count { |n| v[n] == "A" } # control muts
                  d = control.count { |n| v[n] != "-" } # potential controls
                  mut_hash[k] = a
                  total += a

                  # float division: yields Infinity/NaN rather than raising when a count is 0
                  rr = (a / b.to_f) / (c / d.to_f)

                  fisher = Rubystats::FishersExactTest.new.calculate(b - a, d - c, a, c)
                  perc = fisher[:twotail]
                  info = [k, a, b, c, d, rr.round(2), perc].join(",")
                  out_hash[k] = info
                  hm_hash[k] = info if perc < 0.05
                end

                if seq_hash.size > 20
                  rate = total.to_f / seq_hash.size

                  count_mut = ViralSeq.count(mut_hash.values)
                  # NOTE(review): this is the largest value of count_mut, not its largest
                  # key -- confirm against ViralSeq.count / ViralSeq.poisson_distribution.
                  maxi_count = count_mut.values.max

                  poisson_hash = ViralSeq.poisson_distribution(rate, maxi_count)

                  cut_off = 0
                  poisson_hash.each do |k, v|
                    cal = seq_hash.size * v
                    # NOTE(review): presumably count_mut has an entry for every key of
                    # poisson_hash; a missing entry would make the comparison below fail.
                    obs = count_mut[k]
                    if obs >= 20 * cal
                      cut_off = k
                      break
                    elsif k == maxi_count
                      cut_off = maxi_count
                    end
                  end

                  mut_hash.each do |k, v|
                    hm_hash[k] = out_hash[k] if v > cut_off
                  end
                end

                hm_seq_hash = {}
                hm_hash.each_key { |k| hm_seq_hash[k] = seq_hash[k] }
                [hm_seq_hash, hm_hash]
              end

              # APOBEC3G/F mutation position identification
              # APOBEC3G/F pattern: GRD -> ARD
              # control pattern: G[YN|RC] -> A[YN|RC]
              #
              # Gaps ("-") are removed from a copy of the sequence before scanning, so
              # the returned positions refer to the ungapped sequence and the caller's
              # string is left untouched. Only positions that start a full
              # three-base window are examined.
              #
              # @param seq [String] nucleotide sequence
              # @return [Array] [apobec_positions, control_positions], two Arrays of
              #   0-based positions of G in an A3G context and of the other G positions
              def self.apobec3gf(seq = "")
                seq = seq.tr("-", "") # non-destructive: works on frozen input too
                apobec_position = []
                control_position = []
                (0..(seq.size - 3)).each do |n|
                  if seq[n, 3] =~ /G[AG][AGT]/
                    apobec_position << n
                  elsif seq[n] == "G"
                    control_position << n
                  end
                end
                [apobec_position, control_position]
              end
            end
         | 
    
        data/lib/viral_seq/fasta.rb
    DELETED
    
    | @@ -1,154 +0,0 @@ | |
| 1 | 
            -
            # fasta.rb
         | 
| 2 | 
            -
            # methods for converting sequence formats, including
         | 
| 3 | 
            -
            #   ViralSeq::fasta_to_hash
         | 
| 4 | 
            -
            #   ViralSeq::fastq_to_fasta
         | 
| 5 | 
            -
            #   ViralSeq::fastq_to_hash
         | 
| 6 | 
            -
            #   ViralSeq::fasta_hash_to_rsphylip
         | 
| 7 | 
            -
            #   ViralSeq::pair_fasta_to_hash
         | 
| 8 | 
            -
             | 
| 9 | 
            -
            # =USAGE
         | 
| 10 | 
            -
            #   sequence_fasta_hash = ViralSeq.fasta_to_hash(input_fasta_file)
         | 
| 11 | 
            -
            #   # input a sequence file in fasta format, read as a sequence hash
         | 
| 12 | 
            -
            #   # {:sequence_name1 => sequence1, ...}
         | 
| 13 | 
            -
             | 
| 14 | 
            -
            #   sequence_fasta_hash = ViralSeq.fastq_to_fasta(input_fastq_file)
         | 
| 15 | 
            -
            #   # input a sequence file in fastq format, read as a sequence hash
         | 
| 16 | 
            -
            #   # discard sequence quality score
         | 
| 17 | 
            -
             | 
| 18 | 
            -
            #   sequence_fastq_hash = ViralSeq.fastq_to_hash(input_fastq_file)
         | 
| 19 | 
            -
            #   # input a sequence file in fastq format, read as a sequence hash
         | 
| 20 | 
            -
            #   # keep sequence quality score
         | 
| 21 | 
            -
            #   # {:sequence_name1 => [sequence1, quality1], ...}
         | 
| 22 | 
            -
             | 
| 23 | 
            -
            #   phylip_hash = ViralSeq.fasta_hash_to_rsphylip(sequence_fasta_hash)
         | 
| 24 | 
            -
            #   # convert an aligned fasta sequence hash into relaxed sequential phylip format
         | 
| 25 | 
            -
             | 
| 26 | 
            -
            #   paired_sequence_hash = ViralSeq.pair_fasta_to_hash(directory_of_paired_fasta)
         | 
| 27 | 
            -
            #   # input a directory containing paired sequence files in the fasta format
         | 
| 28 | 
            -
            #   # ├───lib1
         | 
| 29 | 
            -
            #         │     lib1_r1.txt
         | 
| 30 | 
            -
            #         │     lib1_r2.txt
         | 
| 31 | 
            -
            #   # paired sequence files need to have "r1" and "r2" in their file names
         | 
| 32 | 
            -
            #   # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
         | 
| 33 | 
            -
            #   # return a paired sequence hash :seq_name => [r1_seq, r2_seq]
         | 
| 34 | 
            -
             | 
| 35 | 
            -
            module ViralSeq
         | 
| 36 | 
            -
             | 
| 37 | 
            -
              # Read a sequence file in fasta format into a sequence hash.
              #
              # NUL characters are stripped from every line; empty lines and lines
              # starting with "=" are skipped. A line starting with ">" opens a new
              # entry keyed by the whole header line (including ">"); all following
              # lines are upper-cased and appended to that entry.
              # Sequence lines that appear before the first header have no entry to be
              # appended to and raise NoMethodError.
              #
              # @param infile [String] path to the fasta file
              # @return [Hash] {sequence_name => sequence, ...}
              def self.fasta_to_hash(infile)
                return_hash = {}
                name = ""
                # block form closes the file even if reading or parsing raises
                File.open(infile, "r") do |f|
                  f.each_line do |raw_line|
                    line = raw_line.tr("\u0000", "")
                    next if line == "\n"
                    next if line.start_with?("=")
                    if line.start_with?(">")
                      name = line.chomp
                      return_hash[name] = ""
                    else
                      return_hash[name] += line.chomp.upcase
                    end
                  end
                end
                return_hash
              end
         | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
              # fastq file to fasta, discard quality, return a sequence hash
         | 
| 58 | 
            -
             | 
| 59 | 
            -
              def self.fastq_to_fasta(fastq_file)
         | 
| 60 | 
            -
                  count = 0
         | 
| 61 | 
            -
                  sequence_a = []
         | 
| 62 | 
            -
                  count_seq = 0
         | 
| 63 | 
            -
             | 
| 64 | 
            -
                  File.open(fastq_file,'r') do |file|
         | 
| 65 | 
            -
                    file.readlines.collect do |line|
         | 
| 66 | 
            -
                      count +=1
         | 
| 67 | 
            -
                      count_m = count % 4
         | 
| 68 | 
            -
                      if count_m == 1
         | 
| 69 | 
            -
                        line.tr!('@','>')
         | 
| 70 | 
            -
                        sequence_a << line.chomp
         | 
| 71 | 
            -
                        count_seq += 1
         | 
| 72 | 
            -
                      elsif count_m == 2
         | 
| 73 | 
            -
                        sequence_a << line.chomp
         | 
| 74 | 
            -
                      end
         | 
| 75 | 
            -
                    end
         | 
| 76 | 
            -
                  end
         | 
| 77 | 
            -
                  Hash[*sequence_a]
         | 
| 78 | 
            -
              end
         | 
| 79 | 
            -
             | 
| 80 | 
            -
              # fastq file to hash, including quality. {:seq_name => [seq,quality]}
         | 
| 81 | 
            -
             | 
| 82 | 
            -
              def self.fastq_to_hash(fastq_file)
         | 
| 83 | 
            -
                  count = 0
         | 
| 84 | 
            -
                  sequence_a = []
         | 
| 85 | 
            -
                  quality_a = []
         | 
| 86 | 
            -
                  count_seq = 0
         | 
| 87 | 
            -
             | 
| 88 | 
            -
                  File.open(fastq_file,'r') do |file|
         | 
| 89 | 
            -
                    file.readlines.collect do |line|
         | 
| 90 | 
            -
                      count +=1
         | 
| 91 | 
            -
                      count_m = count % 4
         | 
| 92 | 
            -
                      if count_m == 1
         | 
| 93 | 
            -
                        line.tr!('@','>')
         | 
| 94 | 
            -
                        sequence_a << line.chomp
         | 
| 95 | 
            -
                        quality_a << line.chomp
         | 
| 96 | 
            -
                        count_seq += 1
         | 
| 97 | 
            -
                      elsif count_m == 2
         | 
| 98 | 
            -
                        sequence_a << line.chomp
         | 
| 99 | 
            -
                      elsif count_m == 0
         | 
| 100 | 
            -
                        quality_a << line.chomp
         | 
| 101 | 
            -
                      end
         | 
| 102 | 
            -
                    end
         | 
| 103 | 
            -
                  end
         | 
| 104 | 
            -
                  sequence_hash = Hash[*sequence_a]
         | 
| 105 | 
            -
                  quality_hash = Hash[*quality_a]
         | 
| 106 | 
            -
                  return_hash = {}
         | 
| 107 | 
            -
                  sequence_hash.each do |k,v|
         | 
| 108 | 
            -
                    return_hash[k] = [v, quality_hash[k]]
         | 
| 109 | 
            -
                  end
         | 
| 110 | 
            -
                  return return_hash
         | 
| 111 | 
            -
              end
         | 
| 112 | 
            -
             | 
| 113 | 
            -
              # fasta sequence hash to relaxed sequencial phylip format
         | 
| 114 | 
            -
             | 
| 115 | 
            -
              def self.fasta_hash_to_rsphylip(seqs)
         | 
| 116 | 
            -
                outline = "\s" + seqs.size.to_s + "\s" + seqs.values[0].size.to_s + "\n"
         | 
| 117 | 
            -
                names = seqs.keys
         | 
| 118 | 
            -
                max_name_l = (names.max.size - 1)
         | 
| 119 | 
            -
                max_name_l > 10 ? name_block_l = max_name_l : name_block_l = 10
         | 
| 120 | 
            -
                seqs.each do |k,v|
         | 
| 121 | 
            -
                  outline += k[1..-1] + "\s" * (name_block_l - k.size + 2) + v.scan(/.{1,10}/).join("\s") + "\n"
         | 
| 122 | 
            -
                end
         | 
| 123 | 
            -
                return outline
         | 
| 124 | 
            -
              end
         | 
| 125 | 
            -
             | 
| 126 | 
            -
              # input a directory with r1 and r2 sequences, return a hash :seq_name => [r1_seq, r2_seq]
         | 
| 127 | 
            -
              # r1 and r2 file names should contain "r1" and "r2" respectively
         | 
| 128 | 
            -
              # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
         | 
| 129 | 
            -
              def self.pair_fasta_to_hash(indir)
         | 
| 130 | 
            -
                files = Dir[indir + "/*"]
         | 
| 131 | 
            -
                r1_file = ""
         | 
| 132 | 
            -
                r2_file = ""
         | 
| 133 | 
            -
                files.each do |f|
         | 
| 134 | 
            -
                  if File.basename(f) =~ /r1/i
         | 
| 135 | 
            -
                    r1_file = f
         | 
| 136 | 
            -
                  elsif File.basename(f) =~ /r2/i
         | 
| 137 | 
            -
                    r2_file = f
         | 
| 138 | 
            -
                  end
         | 
| 139 | 
            -
                end
         | 
| 140 | 
            -
             | 
| 141 | 
            -
                seq1 = ViralSeq.fasta_to_hash(r1_file)
         | 
| 142 | 
            -
                seq2 = ViralSeq.fasta_to_hash(r2_file)
         | 
| 143 | 
            -
             | 
| 144 | 
            -
                new_seq1 = seq1.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
         | 
| 145 | 
            -
                new_seq2 = seq2.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
         | 
| 146 | 
            -
             | 
| 147 | 
            -
                seq_pair_hash = {}
         | 
| 148 | 
            -
             | 
| 149 | 
            -
                new_seq1.each do |seq_name,seq|
         | 
| 150 | 
            -
                  seq_pair_hash[seq_name] = [seq, new_seq2[seq_name]]
         | 
| 151 | 
            -
                end
         | 
| 152 | 
            -
                return seq_pair_hash
         | 
| 153 | 
            -
              end
         | 
| 154 | 
            -
            end
         |