RubyGems - viral_seq - Versions diffs - 1.2.9 → 1.6.0 - Mend

viral_seq 1.2.9 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/lib/viral_seq/seq_hash.rb CHANGED Viewed

@@ -208,6 +208,31 @@ module ViralSeq
       return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
     end
+    # return a new SeqHash object with given a range on the nt sequence position
+    # @param range [Range] range of positions on the nt sequence
+    # @return [ViralSeq::SeqHash] a sub SeqHash object
+    def nt_range(range)
+      dna_hash = self.dna_hash
+      new_hash = {}
+      dna_hash.each do |k,v|
+        new_hash[k] = v[range]
+      end
+      ViralSeq::SeqHash.new(new_hash)
+    end # end of #nt_range
+    # check the size range of the DNA sequences of the SeqHash object
+    # @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
+    def check_nt_size
+      dna_hash = self.dna_hash
+      size_array = []
+      dna_hash.values.each do |v|
+        size_array << v.size
+      end
+      return { max: size_array.max, min: size_array.min }
+    end
     # write the nt sequences to a FASTA format file
     # @param file [String] path to the FASTA output file
     # @return [NilClass]
@@ -592,6 +617,98 @@ module ViralSeq
     alias_method :pm, :poisson_minority_cutoff
+    # calculate false detection rate for minority mutations
+    # Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
+    # @param error_rate [Float] estimated sequencing error rate
+    # @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
+    # @example calculate FDR for mutations that appeared twice in the sample dataset
+    #   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
+    #   fdr_hash = my_seqhash.fdr
+    #   fdr_hash[2].round(5)
+    #   => 0.00726 # means that mutations appear twice have 0.007261748 chance to be caused by residual errors.
+    def fdr(error_rate = 0.0001)
+      sequences = self.dna_hash.values
+      if sequences.size == 0
+        return {}
+      else
+        seq_count = self.size
+        observed_hash = variant_for_poisson(sequences)
+        p_unadjusted = []
+        observed_hash.each do |k, v|
+          p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
+          p_unadjusted += Array.new(v, p_value)
+        end
+        p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
+        vars_pair = observed_hash.to_a
+        fdr_hash = Hash.new(0)
+        (0..(p_fdr.size - 1)).each do |i|
+          fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
+        end
+        return fdr_hash
+      end
+    end #end of #fdr
+    # analysis for the nt sequence variants.
+    # @return [Hash] An Hash with information of variant analysis. Key/values of the return object see /docs/variants_structure.pdf
+    def nt_variants
+      return_obj = {}
+      nt_hash = self.dna_hash
+      tcs_number = self.size
+      dl = ViralSeq::TcsCore.detection_limit(tcs_number)
+      fdr_hash = self.fdr
+      pm_cut_off = self.pm
+      con = self.consensus
+      return_obj[:tcs_number] = tcs_number
+      return_obj[:lower_detection_limit] = dl
+      return_obj[:pm_cut_off] = pm_cut_off
+      return_obj[:positions] = []
+      cis = {}
+      (0..(con.size - 1)).each do |p|
+        position_obj = {}
+        position_obj[:position] = p + 1
+        position_obj[:tcs_number] = tcs_number
+        position_obj[:lower_detection_limit] = dl
+        position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
+        nts = []
+        dna_hash.each do |n,s|
+          nts << s[p]
+        end
+        freq_hash = nts.count_freq
+        [:A, :C, :G, :T, :-].each do |k|
+          v = freq_hash[k.to_s]
+          position_obj[k] = {}
+          position_obj[k][:count] = v
+          if v > 0
+            if cis[[v, tcs_number]]
+              ci = cis[[v, tcs_number]]
+            else
+              ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
+              cis[[v, tcs_number]] = ci
+            end
+            position_obj[k][:freq] = ci.mean.round(4)
+            position_obj[k][:freq_ci_low] = ci.lower.round(4)
+            position_obj[k][:freq_ci_high] = ci.upper.round(4)
+            position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
+            position_obj[k][:fdr] = fdr_hash[v]
+          else
+            position_obj[k][:freq] = 0
+            position_obj[k][:freq_ci_low] = 0
+            position_obj[k][:freq_ci_high] = 0
+            position_obj[k][:greater_than_pm] = false
+            position_obj[k][:fdr] = nil
+          end
+        end
+        return_obj[:positions] << position_obj
+      end
+      return_obj
+    end # end of nt_variants
     # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
     # @param path_to_muscle [String], path to MUSCLE executable. If not provided (the default), it will use RubyGem::MuscleBio
@@ -1183,6 +1300,28 @@ module ViralSeq
       return new_sh
     end
+    # QC for each nucleotide sequence comparing with sample consensus for indels
+    # @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
+    def qc_indel
+      con = self.consensus
+      dna_hash = self.dna_hash
+      names_passed = []
+      names_indel = []
+      dna_hash.uniq_hash.each do |seq, names|
+        if seq.compare_with(con) < 4
+          names_passed += names
+        elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
+          names_indel += names
+        else
+          names_passed += names
+        end
+      end
+      return {no_indel: self.sub(names_passed),
+        has_indel: self.sub(names_indel)}
+    end # end of qc_indel
     # trim dna sequences based on the provided reference coordinates.
     # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
     # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array

data/lib/viral_seq/tcs_core.rb CHANGED Viewed

@@ -6,6 +6,10 @@ module ViralSeq
     class << self
       # methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
+      # @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 reference at Zhou et al. JVI 2016.
+      # @param m [Integer] PID abundance
+      # @param error_rate [Float] estimated platform error rate.
+      # @return [Integer] an abundance cut-off (Integer) for offspring Primer IDs.
       def calculate_cut_off(m, error_rate = 0.02)
         n = 0
@@ -280,6 +284,23 @@ module ViralSeq
         abort infor.red.bold
       end
+      # lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
+      # R required.
+      # @param tcs_number [Integer] number of TCS
+      # @return [Float] lower detection limit
+      # @example calculate lower detection limit
+      #   ViralSeq::TcsCore.detection_limit(100)
+      #   => 0.0362
+      def detection_limit(tcs_number)
+        if ViralSeq::DETECT_SEN[tcs_number]
+          return ViralSeq::DETECT_SEN[tcs_number]
+        else
+          dl =  `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
+          dl.to_f
+        end
+      end
       private
       def unzip_r(indir, f)

data/lib/viral_seq/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@
 # version info and history
 module ViralSeq
-  VERSION = "1.2.9"
-  TCS_VERSION = "2.3.8"
+  VERSION = "1.6.0"
+  TCS_VERSION = "2.5.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: viral_seq
 version: !ruby/object:Gem::Version
-  version: 1.2.9
+  version: 1.6.0
 platform: ruby
 authors:
 - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-08-02 00:00:00.000000000 Z
+date: 2022-01-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -172,6 +172,7 @@ files:
 - docs/dr.json
 - docs/sample_miseq_data/hivdr_control/r1.fastq.gz
 - docs/sample_miseq_data/hivdr_control/r2.fastq.gz
+- docs/variants_structure.pdf
 - lib/viral_seq.rb
 - lib/viral_seq/constant.rb
 - lib/viral_seq/enumerable.rb