RubyGems - viral_seq - Versions diffs - 0.3.2 → 1.0.0 - Mend

viral_seq 0.3.2 → 1.0.0

Files changed (30) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +7 -1
data/lib/viral_seq/Integer.rb +16 -0
data/lib/viral_seq/constant.rb +7 -0
data/lib/viral_seq/enumerable.rb +132 -0
data/lib/viral_seq/hash.rb +45 -0
data/lib/viral_seq/hivdr.rb +454 -0
data/lib/viral_seq/math.rb +128 -380
data/lib/viral_seq/muscle.rb +60 -82
data/lib/viral_seq/pid.rb +26 -0
data/lib/viral_seq/ref_seq.rb +35 -0
data/lib/viral_seq/rubystats.rb +172 -0
data/lib/viral_seq/seq_hash.rb +1043 -0
data/lib/viral_seq/seq_hash_pair.rb +219 -0
data/lib/viral_seq/sequence.rb +571 -348
data/lib/viral_seq/string.rb +119 -0
data/lib/viral_seq/version.rb +1 -1
data/lib/viral_seq.rb +14 -15
metadata +13 -12
data/lib/viral_seq/a3g.rb +0 -172
data/lib/viral_seq/fasta.rb +0 -154
data/lib/viral_seq/hcv_dr.rb +0 -54
data/lib/viral_seq/locator.rb +0 -299
data/lib/viral_seq/misc.rb +0 -103
data/lib/viral_seq/nt_variation.rb +0 -148
data/lib/viral_seq/poisson_cutoff.rb +0 -68
data/lib/viral_seq/refseq.rb +0 -45
data/lib/viral_seq/sdrm_core.rb +0 -652
data/lib/viral_seq/tcs_core.rb +0 -556

data/lib/viral_seq/string.rb ADDED Viewed

@@ -0,0 +1,119 @@
+# functions added to Class::String for direct operation on sequence as a String object
+class String
+  # reverse complement
+  # only upper-case A, C, T and G are complemented; any other character keeps
+  #   its identity and is only moved by the reversal
+  # @return [String] reverse complement sequence
+  # @example Reverse complement
+  #   "ACAGA".rc
+  #   => "TCTGT"
+  def rc
+    reverse.tr("ACTG", "TGAC")
+  end
+  # mutate a nt sequence (String class) randomly
+  # every position is replaced, with probability `error_rate`, by a base drawn
+  #   from A/C/T/G that differs from the current character
+  # @param error_rate [Float] define an error rate for mutation, default to `0.01`
+  # @return [String] mutated sequence as a new String, the receiver is not modified
+  # @example mutate a sequence at an error rate of 0.05
+  #   seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
+  #   seq.mutation(0.05)
+  #   => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"
+  def mutation(error_rate = 0.01)
+    # the random draw has a resolution of 1/10000; scale the rate once, not per base
+    threshold = error_rate * 10000
+    mutated = ""
+    each_char do |nt|
+      if rand(10000) < threshold
+        # substitute with one of the bases other than the current one
+        mutated << (%w[A C T G] - [nt]).sample
+      else
+        mutated << nt
+      end
+    end
+    mutated
+  end
+  # parse the nucleotide sequences as a String object
+  #   and return a Regexp object for possible matches
+  # ambiguity codes become a character class of the bases they stand for
+  # @return [Regexp] as possible matches
+  # @raise [RegexpError] if a character is not recognized by String#to_list,
+  #   because it would produce an empty character class
+  # @example parse a sequence with ambiguities
+  #   "ATRWCG".nt_parser
+  #   => /AT[AG][AT]CG/
+  def nt_parser
+    pattern = each_char.map do |base|
+      bases = base.to_list
+      # no separator inside the brackets: a "|" there would be matched literally
+      bases.size == 1 ? bases.first : "[#{bases.join}]"
+    end.join
+    Regexp.new(pattern)
+  end
+  # parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1
+  # the code is matched case-insensitively; A/T/C/G return the receiver itself
+  #   (case preserved), ambiguity codes return upper-case bases
+  # @return [Array] parsed nt bases, empty if the String is not a single IUPAC nucleotide code
+  # @example parse IUPAC `R`
+  #   'R'.to_list
+  #   => ["A", "G"]
+  def to_list
+    case upcase
+    when /\A[ATCG]\z/ then [self] # anchored so that only a single base matches
+    when "W" then %w[A T]
+    when "S" then %w[C G]
+    when "M" then %w[A C]
+    when "K" then %w[G T] # keto: G or T (C/G is `S`)
+    when "R" then %w[A G]
+    when "Y" then %w[C T]
+    when "B" then %w[C G T]
+    when "D" then %w[A G T]
+    when "H" then %w[A C T]
+    when "V" then %w[A C G]
+    when "N" then %w[A T C G]
+    else []
+    end
+  end
+  # compare two sequences as String objects, two sequence strings need to be aligned first
+  # positions are compared over the length of the receiver: a position missing
+  #   from `seq2` counts as a difference, extra positions in `seq2` are ignored
+  # @param seq2 [String] the sequence string to compare with
+  # @return [Integer] the total number of differences as integer
+  # @example compare two sequence strings, without alignment and with alignment
+  #   seq1 = 'AAGGCGTAGGAC'
+  #   seq2 = 'AAGCTTAGGACG'
+  #   seq1.compare_with(seq2) # no alignment
+  #   => 8
+  #   aligned_seqs = ViralSeq::Muscle.align(seq1,seq2) # align using MUSCLE
+  #   aligned_seqs[0].compare_with(aligned_seqs[1])
+  #   => 4
+  def compare_with(seq2)
+    each_char.with_index.count { |nt, position| nt != seq2[position] }
+  end
+end

data/lib/viral_seq/version.rb CHANGED Viewed

@@ -2,5 +2,5 @@
 # version info and history
 module ViralSeq
-  VERSION = "0.3.2"
+  VERSION = "1.0.0"
 end

data/lib/viral_seq.rb CHANGED Viewed

@@ -18,24 +18,23 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-# viral_seq main
 module ViralSeq; end
-# load all modules
-require "viral_seq/version"
-require "viral_seq/sequence"
+# load all classes
+require "viral_seq/constant"
+require "viral_seq/enumerable"
+require "viral_seq/hash"
+require "viral_seq/hivdr"
+require "viral_seq/Integer" # the packaged file is lib/viral_seq/Integer.rb; the path must match its case on case-sensitive filesystems
 require "viral_seq/math"
-require "viral_seq/fasta"
-require "viral_seq/misc"
-require "viral_seq/refseq"
-require "viral_seq/locator"
 require "viral_seq/muscle"
-require "viral_seq/tcs_core.rb"
-require "viral_seq/poisson_cutoff"
-require "viral_seq/a3g"
-require "viral_seq/sdrm_core"
-require "viral_seq/hcv_dr"
-require "viral_seq/nt_variation"
+require "viral_seq/pid"
+require "viral_seq/ref_seq"
+require "viral_seq/rubystats"
+require "viral_seq/seq_hash"
+require "viral_seq/seq_hash_pair"
+require "viral_seq/sequence"
+require "viral_seq/string"
+require "viral_seq/version"
 require "muscle_bio"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: viral_seq
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 1.0.0
 platform: ruby
 authors:
 - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-06-21 00:00:00.000000000 Z
+date: 2019-07-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -89,19 +89,20 @@ files:
 - bin/console
 - bin/setup
 - lib/viral_seq.rb
-- lib/viral_seq/a3g.rb
-- lib/viral_seq/fasta.rb
-- lib/viral_seq/hcv_dr.rb
-- lib/viral_seq/locator.rb
+- lib/viral_seq/Integer.rb
+- lib/viral_seq/constant.rb
+- lib/viral_seq/enumerable.rb
+- lib/viral_seq/hash.rb
+- lib/viral_seq/hivdr.rb
 - lib/viral_seq/math.rb
-- lib/viral_seq/misc.rb
 - lib/viral_seq/muscle.rb
-- lib/viral_seq/nt_variation.rb
-- lib/viral_seq/poisson_cutoff.rb
-- lib/viral_seq/refseq.rb
-- lib/viral_seq/sdrm_core.rb
+- lib/viral_seq/pid.rb
+- lib/viral_seq/ref_seq.rb
+- lib/viral_seq/rubystats.rb
+- lib/viral_seq/seq_hash.rb
+- lib/viral_seq/seq_hash_pair.rb
 - lib/viral_seq/sequence.rb
-- lib/viral_seq/tcs_core.rb
+- lib/viral_seq/string.rb
 - lib/viral_seq/version.rb
 - viral_seq.gemspec
 homepage: https://github.com/ViralSeq/viral_seq

data/lib/viral_seq/a3g.rb DELETED Viewed

@@ -1,172 +0,0 @@
-# viral_seq/a3g
-# APOBEC3g/f hypermutation function including
-# ViralSeq::a3g_hypermut_seq_hash
-# ViralSeq::apobec3gf
-# APOBEC3g/f G to A hypermutation
-# APOBEC3G/F pattern: GRD -> ARD
-# control pattern: G[YN|RC] -> A[YN|RC]
-# use the sample consensus to determine potential a3g sites
-# Two criteria to identify hypermutation
-# 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
-# 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
-# note:  criteria 2 only applies on a sequence file containing more than 20 sequences
-#        b/c Poisson model does not do well on small sample size.
-# ViralSeq.a3g_hypermut_seq_hash(sequence_hash)
-# sequence_hash is a Hash object for sequences. {:name => :sequence, ...}
-# return array [hypermutation_hash, statistic_info]
-# hypermutation_hash is a Hash object for sequences
-# statistic_info is a hash object of [sequence_name, stats],
-# in which stats String object in csv format (separated by ',') containing
-#   sequence tag
-#   G to A mutation numbers at potential a3g positions
-#   total potential a3g G positions
-#   G to A mutation numbers at non a3g positions
-#   total non a3g G positions
-#   a3g G to A mutation rate / non-a3g G to A mutation rate
-#   Fishers Exact P-value
-#
-# =USAGE
-#   # example 1
-#   sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence1.fasta')
-#   hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
-#   hypermut[0].keys
-#   => [">Seq7", ">Seq14"]
-#   stats = hypermut[1]
-#   stats.values
-#   => [">Seq7,23,68,1,54,18.26,4.308329383112348e-06", ">Seq14,45,68,9,54,3.97,5.2143571971582974e-08"]
-#
-#   # example 2
-#   sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence2.fasta')
-#   hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
-#   stats = hypermut[1]
-#   stats.values
-#   => [">CTAACACTCA_134_a3g-sample2,4,35,0,51,Infinity,0.02465676660128911", ">ATAGTGCCCA_60_a3g-sample2,4,35,1,51,5.83,0.1534487353839561"]
-#   # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05, but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
-# ViralSeq.apobec3gf(sequence)
-# APOBEC3G/F pattern: GRD -> ARD
-# control pattern: G[YN|RC] -> A[YN|RC]
-# input a sequence String object
-# return two arrays of position numbers of
-#   a3g G positions (a3g)
-#   non-a3g G positions (control)
-module ViralSeq
-  # Flag sequences that look APOBEC3G/F hypermutated.
-  # seq_hash: Hash of sequence name => sequence String.
-  # Returns [hm_seq_hash, hm_hash]: the flagged sequences (name => sequence)
-  # and their statistics (name => CSV String built below as `info`).
-  # A sequence is flagged when the two-tailed Fisher's exact P-value is < 0.05,
-  # or, for inputs of more than 20 sequences, when its mutation count is above
-  # the Poisson-derived cut-off.
-  def ViralSeq.a3g_hypermut_seq_hash(seq_hash)
-    # mut_hash number of apobec3g/f mutations per sequence
-    mut_hash = {}
-    # hm_hash: stats line of every sequence flagged as hypermutated
-    hm_hash = {}
-    # out_hash: stats line of every sequence, flagged or not
-    out_hash = {}
-    # total G->A mutations at apobec3g/f positions.
-    total = 0
-    # make consensus sequence for the input sequence hash
-    ref = ViralSeq.consensus(seq_hash.values)
-    # obtain apobec3g positions and control positions
-    # NOTE(review): apobec3gf strips "-" from `ref` in place, so the positions
-    # refer to the gap-free consensus while v[n] below indexes each input
-    # sequence as given -- confirm the consensus never contains gaps
-    apobec = ViralSeq.apobec3gf(ref)
-    mut = apobec[0]
-    control = apobec[1]
-    seq_hash.each do |k,v|
-      a = 0 # muts
-      b = 0 # potential mut sites
-      c = 0 # control muts
-      d = 0 # potential controls
-      # count "A" at the A3G positions; gap positions are skipped entirely
-      mut.each do |n|
-        next if v[n] == "-"
-        if v[n] == "A"
-          a += 1
-          b += 1
-        else
-          b += 1
-        end
-      end
-      mut_hash[k] = a
-      total += a
-      # same count at the control positions
-      control.each do |n|
-        next if v[n] == "-"
-        if v[n] == "A"
-          c += 1
-          d += 1
-        else
-          d += 1
-        end
-      end
-      # ratio of the "A" frequency at A3G positions to that at control positions;
-      # float division, so a zero denominator gives Infinity or NaN instead of raising
-      rr = (a/b.to_f)/(c/d.to_f)
-      # non-mutated counts at A3G (t1) and control (t2) positions
-      t1 = b - a
-      t2 = d - c
-      fet = Rubystats::FishersExactTest.new
-      fisher = fet.calculate(t1,t2,a,c)
-      perc = fisher[:twotail]
-      # CSV line: name, a, b, c, d, rate ratio (2 decimals), two-tailed P-value
-      info = k + "," + a.to_s + "," + b.to_s + "," + c.to_s + "," + d.to_s + "," + rr.round(2).to_s + "," + perc.to_s
-      out_hash[k] = info
-      if perc < 0.05
-        hm_hash[k] = info
-      end
-    end
-    # second criterion, applied only when there are more than 20 sequences
-    if seq_hash.size > 20
-      # mean number of A3G-position mutations per sequence
-      rate = total.to_f/(seq_hash.size)
-      count_mut = ViralSeq.count(mut_hash.values)
-      # NOTE(review): this is the largest value of count_mut, not its largest
-      # key -- confirm against ViralSeq.count and ViralSeq.poisson_distribution
-      maxi_count = count_mut.values.max
-      poisson_hash = ViralSeq.poisson_distribution(rate,maxi_count)
-      cut_off = 0
-      poisson_hash.each do |k,v|
-        # cal: v scaled by the number of sequences; obs: the entry of count_mut for k
-        cal = seq_hash.size * v
-        obs = count_mut[k]
-        # stop at the first k whose observed count is at least 20x the scaled value
-        if obs >= 20 * cal
-          cut_off = k
-          break
-        elsif k == maxi_count
-          cut_off = maxi_count
-        end
-      end
-      # flag every sequence with more A3G-position mutations than the cut-off
-      mut_hash.each do |k,v|
-        if v > cut_off
-          hm_hash[k] = out_hash[k]
-        end
-      end
-    end
-    # collect the sequences of all flagged names
-    hm_seq_hash = {}
-    hm_hash.keys.each do |k|
-      hm_seq_hash[k] = seq_hash[k]
-    end
-    return [hm_seq_hash,hm_hash]
-  end
-  # APOBEC3G/F mutation position identification
-  # APOBEC3G/F pattern: GRD -> ARD
-  # control pattern: G[YN|RC] -> A[YN|RC]
-  # seq: sequence String; "-" characters are removed from it in place
-  # Returns [apobec_position, control_position], two Arrays of 0-based indexes.
-  def self.apobec3gf(seq = "")
-    # tr! is destructive: the String passed by the caller is modified
-    seq.tr!("-", "")
-    seq_length = seq.size
-    apobec_position = []
-    control_position = []
-    # the range stops at seq_length - 3, so the last two positions are never examined
-    (0..(seq_length - 3)).each do |n|
-      tri_base = seq[n,3]
-      # inside [...] the "|" is a literal member of the class, not an alternation
-      if tri_base =~ /G[A|G][A|G|T]/
-        apobec_position << n
-      elsif seq[n] == "G"
-        control_position << n
-      end
-    end
-    return [apobec_position,control_position]
-  end
-end

data/lib/viral_seq/fasta.rb DELETED Viewed

@@ -1,154 +0,0 @@
-# fasta.rb
-# methods for converting sequence formats, including
-#   ViralSeq::fasta_to_hash
-#   ViralSeq::fastq_to_fasta
-#   ViralSeq::fastq_to_hash
-#   ViralSeq::fasta_hash_to_rsphylip
-#   ViralSeq::pair_fasta_to_hash
-# =USAGE
-#   sequence_fasta_hash = ViralSeq.fasta_to_hash(input_fasta_file)
-#   # input a sequence file in fasta format, read as a sequence hash
-#   # {:sequence_name1 => sequence1, ...}
-#   sequence_fasta_hash = ViralSeq.fastq_to_fasta(input_fastq_file)
-#   # input a sequence file in fastq format, read as a sequence hash
-#   # discard sequence quality score
-#   sequence_fastq_hash = ViralSeq.fastq_to_hash(input_fastq_file)
-#   # input a sequence file in fastq format, read as a sequence hash
-#   # keep sequence quality score
-#   # {:sequence_name1 => [sequence1, quality1], ...}
-#   phylip_hash = ViralSeq.fasta_hash_to_rsphylip(sequence_fasta_hash)
-#   # convert an aligned fasta sequence hash into relaxed sequential phylip format
-#   paired_sequence_hash = ViralSeq.pair_fasta_to_hash(directory_of_paired_fasta)
-#   # input a directory containing paired sequence files in the fasta format
-#   # ├───lib1
-#         │     lib1_r1.txt
-#         │     lib1_r2.txt
-#   # paired sequence files need to have "r1" and "r2" in their file names
-#   # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
-#   # return a paired sequence hash :seq_name => [r1_seq, r2_seq]
-module ViralSeq
-  # read a fasta file into a Hash of sequence name => sequence
-  # the name is the whole header line, including the leading ">";
-  # sequence lines are upper-cased and concatenated under the latest header
-  def self.fasta_to_hash(infile)
-    # the handle is closed manually at the end, so it is not released if an exception is raised before that
-    f=File.open(infile,"r")
-    return_hash = {}
-    name = ""
-    while line = f.gets do
-      # drop NUL characters
-      line.tr!("\u0000","")
-      # skip empty lines and lines starting with "="
-      next if line == "\n"
-      next if line =~ /^\=/
-      if line =~ /^\>/
-        name = line.chomp
-        return_hash[name] = ""
-      else
-        # a sequence line before any header would hit a missing key (nil) here
-        return_hash[name] += line.chomp.upcase
-      end
-    end
-    f.close
-    return return_hash
-  end
-  # fastq file to fasta, discard quality, return a sequence hash
-  # records are taken as groups of 4 lines: line 1 is the name, line 2 the sequence
-  def self.fastq_to_fasta(fastq_file)
-      count = 0
-      # flat list [name1, seq1, name2, seq2, ...] turned into a Hash at the end
-      sequence_a = []
-      count_seq = 0 # incremented per record but not used afterwards
-      File.open(fastq_file,'r') do |file|
-        file.readlines.collect do |line|
-          count +=1
-          count_m = count % 4
-          if count_m == 1
-            # replaces every "@" in the name line with ">", not only the first
-            line.tr!('@','>')
-            sequence_a << line.chomp
-            count_seq += 1
-          elsif count_m == 2
-            sequence_a << line.chomp
-          end
-        end
-      end
-      Hash[*sequence_a]
-  end
-  # fastq file to hash, including quality. {:seq_name => [seq,quality]}
-  # records are taken as groups of 4 lines: name, sequence, (ignored), quality
-  def self.fastq_to_hash(fastq_file)
-      count = 0
-      # flat lists [name1, seq1, ...] and [name1, quality1, ...]
-      sequence_a = []
-      quality_a = []
-      count_seq = 0 # incremented per record but not used afterwards
-      File.open(fastq_file,'r') do |file|
-        file.readlines.collect do |line|
-          count +=1
-          count_m = count % 4
-          if count_m == 1
-            # replaces every "@" in the name line with ">", not only the first
-            line.tr!('@','>')
-            sequence_a << line.chomp
-            quality_a << line.chomp
-            count_seq += 1
-          elsif count_m == 2
-            sequence_a << line.chomp
-          elsif count_m == 0
-            # 4th line of the record: quality string
-            quality_a << line.chomp
-          end
-        end
-      end
-      sequence_hash = Hash[*sequence_a]
-      quality_hash = Hash[*quality_a]
-      # merge both hashes on the sequence name
-      return_hash = {}
-      sequence_hash.each do |k,v|
-        return_hash[k] = [v, quality_hash[k]]
-      end
-      return return_hash
-  end
-  # fasta sequence hash to relaxed sequential phylip format
-  # returns the whole phylip text as one String
-  def self.fasta_hash_to_rsphylip(seqs)
-    # header line: number of sequences and length of the first sequence
-    outline = "\s" + seqs.size.to_s + "\s" + seqs.values[0].size.to_s + "\n"
-    names = seqs.keys
-    # NOTE(review): names.max is the greatest name in String ordering, not
-    # necessarily the longest one -- confirm this is the intended width
-    max_name_l = (names.max.size - 1)
-    # name column is at least 10 characters wide
-    max_name_l > 10 ? name_block_l = max_name_l : name_block_l = 10
-    seqs.each do |k,v|
-      # name without its first character (the ">"), padding, then the sequence in space-separated chunks of up to 10
-      outline += k[1..-1] + "\s" * (name_block_l - k.size + 2) + v.scan(/.{1,10}/).join("\s") + "\n"
-    end
-    return outline
-  end
-  # input a directory with r1 and r2 sequences, return a hash :seq_name => [r1_seq, r2_seq]
-  # r1 and r2 file names should contain "r1" and "r2" respectively
-  # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
-  def self.pair_fasta_to_hash(indir)
-    files = Dir[indir + "/*"]
-    r1_file = ""
-    r2_file = ""
-    # case-insensitive match on the file name; if several files match, the last one wins
-    files.each do |f|
-      if File.basename(f) =~ /r1/i
-        r1_file = f
-      elsif File.basename(f) =~ /r2/i
-        r2_file = f
-      end
-    end
-    seq1 = ViralSeq.fasta_to_hash(r1_file)
-    seq2 = ViralSeq.fasta_to_hash(r2_file)
-    # drop the last 3 characters of every name so that r1 and r2 names coincide
-    new_seq1 = seq1.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
-    new_seq2 = seq2.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
-    # pair on the r1 names; a name without an r2 partner gets nil as second element
-    seq_pair_hash = {}
-    new_seq1.each do |seq_name,seq|
-      seq_pair_hash[seq_name] = [seq, new_seq2[seq_name]]
-    end
-    return seq_pair_hash
-  end
-end