RubyGems - viral_seq - Versions diffs - 1.0.13 → 1.2.0 - Mend

viral_seq 1.0.13 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/.gitignore +0 -1
data/Gemfile.lock +16 -3
data/README.md +102 -13
data/bin/tcs +51 -10
data/bin/tcs_log +102 -0
data/bin/tcs_sdrm +402 -0
data/docs/assets/img/cover.jpg +0 -0
data/docs/dr.json +67 -0
data/docs/sample_miseq_data/hivdr_control/r1.fastq.gz +0 -0
data/docs/sample_miseq_data/hivdr_control/r2.fastq.gz +0 -0
data/lib/viral_seq.rb +5 -1
data/lib/viral_seq/constant.rb +35 -5
data/lib/viral_seq/hivdr.rb +1 -1
data/lib/viral_seq/muscle.rb +3 -2
data/lib/viral_seq/recency.rb +52 -0
data/lib/viral_seq/sdrm.rb +101 -35
data/lib/viral_seq/seq_hash.rb +25 -5
data/lib/viral_seq/seq_hash_pair.rb +6 -4
data/lib/viral_seq/sequence.rb +1 -84
data/lib/viral_seq/tcs_core.rb +34 -5
data/lib/viral_seq/tcs_dr.rb +71 -0
data/lib/viral_seq/tcs_json.rb +41 -10
data/lib/viral_seq/version.rb +2 -2
data/viral_seq.gemspec +11 -0
metadata +74 -4

data/docs/sample_miseq_data/hivdr_control/r1.fastq.gz ADDED Viewed

Binary file

data/docs/sample_miseq_data/hivdr_control/r2.fastq.gz ADDED Viewed

Binary file

data/lib/viral_seq.rb CHANGED Viewed

@@ -37,6 +37,10 @@ require_relative "viral_seq/string"
 require_relative "viral_seq/version"
 require_relative "viral_seq/tcs_core"
 require_relative "viral_seq/tcs_json"
+require_relative "viral_seq/tcs_dr"
+require_relative "viral_seq/sdrm"
+require_relative "viral_seq/recency"
 require "muscle_bio"
+require "json"
+require "securerandom"

data/lib/viral_seq/constant.rb CHANGED Viewed

@@ -1,11 +1,41 @@
 module ViralSeq
   # array for all amino acid one letter abbreviations
   AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
-  SDRM_HIV_PR_LIST = {}
-  SDRM_HIV_RT_LIST = {}
-  SDRM_HIV_IN_LIST = {}
+  # R script for tcs_sdrm script
+  R_SCRIPT = 'setwd("PATH_TO_FASTA")
+              library(phangorn)
+              library(ape)
+              library(ggplot2)
+              library(scales)
+              library(ggforce)
+              library(cowplot)
+              library(magrittr)
+              library(gridExtra)
+              pdf("OUTPUT_PDF", onefile=T, width=11, height=8.5)
+              fileNames <- list.files()
+              for (fileName in fileNames) {
+              dna <- read.dna(fileName, format="fasta")
+              class(dna)
+              D<- dist.dna(dna, model="raw")
+              pi <- mean(D)
+              dist20 <- quantile(D, prob=c(0.20))
+              alldist <- data.frame(File=fileName, pi, dist20)
+              write.table(alldist,"OUTPUT_CSV",append=TRUE, sep = ",", row.names = FALSE, col.names=FALSE)
+              D2 <- dist.dna(dna, model="TN93")*100
+              def.par <- par(no.readonly = TRUE)
+              par(mfrow=c(1,2))
+              hist<-hist(D, main=fileName, xlab="% Pairwise Distance", ylab="Frequency", col="gray")
+              abline(v=dist20, col="royalblue",lwd=2)
+              abline(v=pi, col="red", lwd=2)
+              legend(x="topright", c("dist20", "pi"), col = c("royalblue", "red"), lwd = c(2,2), cex=0.5)
+              njtree<-NJ(D2)
+              njtreeplot <- plot(njtree, show.tip.label=F, "unrooted", main=fileName)
+              add.scale.bar(cex=0.7, font=2, col="red")
+              }
+              dev.off()'
 end

data/lib/viral_seq/hivdr.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module ViralSeq
-  class SDRM
+  class SeqHash
     # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
     #   works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)

data/lib/viral_seq/muscle.rb CHANGED Viewed

@@ -39,8 +39,9 @@ module ViralSeq
     def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
       temp_dir = Dir.home
-      temp_file = File.join(temp_dir, "_temp_muscle_in")
-      temp_aln = File.join(temp_dir, "_temp_muscle_aln")
+      temp_name = "_"  + SecureRandom.alphanumeric
+      temp_file = File.join(temp_dir, temp_name)
+      temp_aln = File.join(temp_dir, (temp_name + "_aln"))
       name = ">test"
       temp_in = File.open(temp_file,"w")
       temp_in.puts ">ref"

data/lib/viral_seq/recency.rb ADDED Viewed

@@ -0,0 +1,52 @@
+module ViralSeq
+  # recency prediction function based on HIV MPID-NGS
+  # @see https://pubmed.ncbi.nlm.nih.gov/32663847 Ref: Zhou et al. J Infect Dis. 2021
+  module Recency
+    # @param tcs_RT [Integer] number of TCS at the RT region
+    # @param tcs_V1V3 [Integer] number of TCS at the V1V3 region
+    # @param pi_RT [Float] pairwise diversity at the RT region
+    # @param pi_V1V3 [Float] pairwise diversity at the V1V3 region
+    # @param dist20_RT [Float] dist20 at the RT region
+    # @param dist20_V1V3 [Float] dist20 at the V1V3 region
+    # @return [String] determination of the recency
+    def self.define(tcs_RT: nil,
+                     tcs_V1V3: nil,
+                     pi_RT: nil,
+                     dist20_RT: nil,
+                     pi_V1V3: nil,
+                     dist20_V1V3: nil)
+      tcs_RT ||= 0
+      tcs_V1V3 ||= 0
+      if (tcs_RT >= 3 && pi_RT) and (tcs_V1V3 >= 3 && pi_V1V3)
+        if (pi_RT + pi_V1V3) < 0.0103
+            recency = "recent"
+        elsif (pi_RT + pi_V1V3) >= 0.0103 and (dist20_RT + dist20_V1V3) >= 0.006
+            recency = "chronic"
+        else
+            recency = "indeterminant"
+        end
+      elsif (tcs_RT >= 3 && pi_RT) and tcs_V1V3 < 3
+        if pi_RT < 0.0021
+          recency = "recent"
+        elsif pi_RT >= 0.0021 and dist20_RT >= 0.001
+          recency = "chronic"
+        else
+          recency = "indeterminant"
+        end
+      elsif (tcs_V1V3 >= 3 && pi_V1V3)
+        if pi_V1V3 >= 0.0103 and dist20_V1V3 >= 0.006
+          recency = "chronic"
+        else
+          recency = "insufficient data"
+        end
+      else
+        recency = "insufficient data"
+      end
+      return recency
+    end
+  end
+end

data/lib/viral_seq/sdrm.rb CHANGED Viewed

@@ -1,43 +1,109 @@
 module ViralSeq
   class DRMs
-    def initialize (mutation_list = {})
-      @mutation_list = mutation_list
-    end
-    attr_accessor :mutation_list
-  end
+    class << self
-  def self.sdrm_hiv_pr(seq_hash)
-  end
-  def self.sdrm_hiv_rt(seq_hash)
-  end
+      # function to retrieve sdrm positions as a hash
+      # @param options [Symbol] name of the SDRM list to retrieve; supported values are `:hcv_ns5a`, `:nrti`, `:nnrti`, `:hiv_pr`, `:hiv_in`
+      # @return [Hash] Hash of :position_number => [ 'wildtype_codon', ['mutation_codons']]
+      def sdrm_hash(options)
+        sdrm = {}
+        case options
+        when :hcv_ns5a
+          sdrm[28] = ['M',['T']]
+          sdrm[30] = ['L',['H','K','R','Q','A','S','D']]
+          sdrm[31] = ['L',['M','V','F']]
+          sdrm[32] = ['P',['L']]
+          sdrm[44] = ['K',['R']]
+          sdrm[58] = ['H',['D','P','S']]
+          sdrm[64] = ['T',['A','S']]
+          sdrm[77] = ['P',['A','S']]
+          sdrm[78] = ['R',['K']]
+          sdrm[79] = ['T',['A']]
+          sdrm[83] = ['T',['M']]
+          sdrm[85] = ['S',['N','H','Y']]
+          sdrm[92] = ['A',['P','T','K','E']]
+          sdrm[93] = ['Y',['C','F','H','N']]
+          sdrm[107] = ['K',['T','S']]
+          sdrm[121] = ['I',['V']]
+          sdrm[135] = ['T',['A']]
+        when :nrti
+          sdrm[41] = ['M',['L']]
+          sdrm[65] = ['K',['R']]
+          sdrm[67] = ['D',['N','G','E']]
+          sdrm[69] = ['T',['D']]
+          sdrm[70] = ['K',['R','E']]
+          sdrm[74] = ['L',['V','I']]
+          sdrm[75] = ['V',['M','T','A','S']]
+          sdrm[77] = ['F',['L']]
+          sdrm[115] = ['Y',['F']]
+          sdrm[116] = ['F',['Y']]
+          sdrm[151] = ['Q',['M']]
+          sdrm[184] = ['M',['V','I']]
+          sdrm[210] = ['L',['W']]
+          sdrm[215] = ["T",["Y","F","I","C","D","V","E"]]
+          sdrm[219] = ["K",["Q","E","N","R"]]
+        when :nnrti
+          sdrm[100] = ['L',['I']]
+          sdrm[101] = ['K',['E','P']]
+          sdrm[103] = ['K',['N','S']]
+          sdrm[106] = ['V',['M','A']]
+          sdrm[179] = ['V',['F','D']]
+          sdrm[181] = ['Y',['C','I','V']]
+          sdrm[188] = ['Y',['L','H','C']]
+          sdrm[190] = ['G',['A','S','E']]
+          sdrm[225] = ['P',['H']]
+          sdrm[230] = ['M',['L']]
+        when :hiv_pr
+          sdrm[23] = ['L',['I']]
+          sdrm[24] = ['L',['I']]
+          sdrm[30] = ['D',['N']]
+          sdrm[32] = ['V',['I']]
+          sdrm[46] = ['M',['I','L']]
+          sdrm[47] = ['I',['V','A']]
+          sdrm[48] = ['G',['V','M']]
+          sdrm[50] = ['I',['V','L']]
+          sdrm[53] = ['F',['L']]
+          sdrm[54] = ['I',['V','L','M','T','A','S']]
+          sdrm[73] = ['G',['S','T','C','A']]
+          sdrm[76] = ['L',['V']]
+          sdrm[82] = ['V',['A','T','S','F','L','C','M']]
+          sdrm[83] = ['N',['D']]
+          sdrm[84] = ['I',['V','A','C']]
+          sdrm[88] = ['N',['D','S']]
+          sdrm[90] = ['L',['M']]
+        when :hiv_in
+          sdrm[66] = ['T',['A','I','K']]
+          sdrm[74] = ['L',['M']]
+          sdrm[92] = ['E',['Q']]
+          sdrm[95] = ['Q',['K']]
+          sdrm[97] = ['T',['A']]
+          sdrm[121] = ['F',['Y']]
+          sdrm[140] = ['G',['A','S','C']]
+          sdrm[143] = ["Y",["C","H","R"]]
+          sdrm[147] = ['S',['G']]
+          sdrm[148] = ['Q',['H','K','R']]
+          sdrm[155] = ['N',['S','H']]
+        else raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
+        end
+        return sdrm
+      end # end of #sdrm_hash
-  def self.sdrm_hiv_in(seq_hash)
-  end
-  def self.list_from_json(file)
-  end
-  def self.list_from_csv(file)
-  end
-  def self.export_list_hiv_pr(file, format = :json)
-    if foramt == :json
+      # function to export SDRM positions as json object
+      # @param (see #sdrm_hash)
+      # @return [Array] json Array of SDRM positions
+      def sdrm_json(options)
+        sdrm = ViralSeq::DRMs.sdrm_hash(options)
+        json_array = []
+        sdrm.each do |pos, muts|
+          mutation = {}
+          mutation[:position] = pos
+          mutation[:wildtypeCodon] = muts[0]
+          mutation[:mutationCodons] = muts[1]
+          json_array << mutation
+        end
+        return json_array
+      end
     end
   end
-  def self.export_list_hiv_rt(file, format = :json)
-  end
-  def self.export_list_hiv_in(file, format = :json)
-  end
-  def drm_analysis(seq_hash)
-    mutation_list = self.mutation_list
-  end
 end

data/lib/viral_seq/seq_hash.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module ViralSeq
   #     # filter nt sequences with the reference coordinates
   #   filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
   #     # return a new ViralSeq::SeqHash object without stop codons
-  #   filtered_seqhash = filtered_seqhash.a3g[1]
+  #   filtered_seqhash = filtered_seqhash.a3g[:filtered_seq]
   #     # further filter out sequences with A3G hypermutations
   #   filtered_seqhash.pi
   #     # return pairwise diversity π
@@ -187,6 +187,25 @@ module ViralSeq
       return new_seqhash
     end
+    # sample a certain number of sequences from a SeqHash object
+    # @param n [Integer] number of sequences to sample
+    # @return [ViralSeq::SeqHash] sampled SeqHash
+    def sample(n = 1)
+      keys = self.dna_hash.keys
+      sampled_keys = keys.sample(n)
+      sampled_nt = {}
+      sampled_aa = {}
+      sampled_qc = {}
+      sampled_title = self.title + "_sampled_" + n.to_s
+      sampled_keys.each do |k|
+        sampled_nt[k] = self.dna_hash[k]
+        sampled_aa[k] = self.aa_hash[k]
+        sampled_qc[k] = self.qc_hash[k]
+      end
+      return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
+    end
     # write the nt sequences to a FASTA format file
     # @param file [String] path to the FASTA output file
     # @return [NilClass]
@@ -394,7 +413,6 @@ module ViralSeq
             end
           end
         end
         consensus_seq += call_consensus_base(max_base_list)
       end
       return consensus_seq
@@ -583,8 +601,8 @@ module ViralSeq
         temp_dir=File.dirname($0)
       end
-      temp_file = temp_dir + "/_temp_muscle_in"
-      temp_aln = temp_dir + "/_temp_muscle_aln"
+      temp_file = File.join(temp_dir, "_temp_muscle_in")
+      temp_aln = File.join(temp_dir, "_temp_muscle_aln")
       File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
       if path_to_muscle
         unless ViralSeq.check_muscle?(path_to_muscle)
@@ -742,6 +760,7 @@ module ViralSeq
       seq_hash_unique_pass = []
       seq_hash_unique.each do |seq|
+        next if seq.nil?
         loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
         next unless loc # if locator tool fails, skip this seq.
         if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
@@ -808,7 +827,7 @@ module ViralSeq
     end # end of locator
     alias_method :loc, :sequence_locator
-    # Remove squences with residual offspring Primer IDs.
+    # Remove sequences with residual offspring Primer IDs.
     #   Compare PID with sequences which have identical sequences.
     #   PIDs that differ by 1 base will be recognized. If PID1 is x times (cutoff) greater than PID2, PID2 will be discarded.
     #     each sequence tag starting with ">" and the Primer ID sequence
@@ -1155,6 +1174,7 @@ module ViralSeq
         new_sh.aa_hash[k] = aa_hash[k]
         new_sh.qc_hash[k] = qc_hash[k]
       end
+      new_sh.file = self.file
       new_sh.title = self.title + "_" + n.to_s
       return new_sh
     end

data/lib/viral_seq/seq_hash_pair.rb CHANGED Viewed

@@ -110,19 +110,21 @@ module ViralSeq
       raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
       raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
       joined_seq = {}
-      seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
+      seq_pair_hash.each do |seq_name,seq_pair|
         r1_seq = seq_pair[0]
         r2_seq = seq_pair[1]
         if overlap.zero?
           joined_sequence = r1_seq + r2_seq
+        elsif diff.zero?
+          if r1_seq[-overlap..-1] == r2_seq[0,overlap]
+            joined_sequence= r1_seq + r2_seq[overlap..-1]
+          end
         elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
           joined_sequence= r1_seq + r2_seq[overlap..-1]
         else
           next
         end
-        seq_names.each do |seq_name|
-          joined_seq[seq_name] = joined_sequence
-        end
+        joined_seq[seq_name] = joined_sequence if joined_sequence
       end
       joined_seq_hash = ViralSeq::SeqHash.new

data/lib/viral_seq/sequence.rb CHANGED Viewed

@@ -113,7 +113,7 @@ module ViralSeq
     def sdrm(option, start_aa = 1)
       aa_array = self.aa_array
       out_hash = {}
-      sdrm = sdrm_hash(option)
+      sdrm = ViralSeq::DRMs.sdrm_hash(option)
       aa_length = aa_array.size
       end_aa = start_aa + aa_length - 1
       (start_aa..end_aa).each do |position|
@@ -535,88 +535,5 @@ module ViralSeq
       return aa_out
     end # end of #amino_acid_2
-    # sdrm position hash
-    def sdrm_hash(options)
-      sdrm = {}
-      case options
-      when :hcv_ns5a
-        sdrm[28] = ['M',['T']]
-        sdrm[30] = ['L',['H','K','R','Q','A','S','D']]
-        sdrm[31] = ['L',['M','V','F']]
-        sdrm[32] = ['P',['L']]
-        sdrm[44] = ['K',['R']]
-        sdrm[58] = ['H',['D','P','S']]
-        sdrm[64] = ['T',['A','S']]
-        sdrm[77] = ['P',['A','S']]
-        sdrm[78] = ['R',['K']]
-        sdrm[79] = ['T',['A']]
-        sdrm[83] = ['T',['M']]
-        sdrm[85] = ['S',['N','H','Y']]
-        sdrm[92] = ['A',['P','T','K','E']]
-        sdrm[93] = ['Y',['C','F','H','N']]
-        sdrm[107] = ['K',['T','S']]
-        sdrm[121] = ['I',['V']]
-        sdrm[135] = ['T',['A']]
-      when :nrti
-        sdrm[41] = ['M',['L']]
-        sdrm[65] = ['K',['R']]
-        sdrm[67] = ['D',['N','G','E']]
-        sdrm[69] = ['T',['D']]
-        sdrm[70] = ['K',['R','E']]
-        sdrm[74] = ['L',['V','I']]
-        sdrm[75] = ['V',['M','T','A','S']]
-        sdrm[77] = ['F',['L']]
-        sdrm[115] = ['Y',['F']]
-        sdrm[116] = ['F',['Y']]
-        sdrm[151] = ['Q',['M']]
-        sdrm[184] = ['M',['V','I']]
-        sdrm[210] = ['L',['W']]
-        sdrm[215] = ["T",["Y","F","I","C","D","V","E"]]
-        sdrm[219] = ["K",["Q","E","N","R"]]
-      when :nnrti
-        sdrm[100] = ['L',['I']]
-        sdrm[101] = ['K',['E','P']]
-        sdrm[103] = ['K',['N','S']]
-        sdrm[106] = ['V',['M','A']]
-        sdrm[179] = ['V',['F','D']]
-        sdrm[181] = ['Y',['C','I','V']]
-        sdrm[188] = ['Y',['L','H','C']]
-        sdrm[190] = ['G',['A','S','E']]
-        sdrm[225] = ['P',['H']]
-        sdrm[230] = ['M',['L']]
-      when :hiv_pr
-        sdrm[23] = ['L',['I']]
-        sdrm[24] = ['L',['I']]
-        sdrm[30] = ['D',['N']]
-        sdrm[32] = ['V',['I']]
-        sdrm[46] = ['M',['I','L']]
-        sdrm[47] = ['I',['V','A']]
-        sdrm[48] = ['G',['V','M']]
-        sdrm[50] = ['I',['V','L']]
-        sdrm[53] = ['F',['L']]
-        sdrm[54] = ['I',['V','L','M','T','A','S']]
-        sdrm[73] = ['G',['S','T','C','A']]
-        sdrm[76] = ['L',['V']]
-        sdrm[82] = ['V',['A','T','S','F','L','C','M']]
-        sdrm[83] = ['N',['D']]
-        sdrm[84] = ['I',['V','A','C']]
-        sdrm[88] = ['N',['D','S']]
-        sdrm[90] = ['L',['M']]
-      when :hiv_in
-        sdrm[66] = ['T',['A','I','K']]
-        sdrm[74] = ['L',['M']]
-        sdrm[92] = ['E',['Q']]
-        sdrm[95] = ['Q',['K']]
-        sdrm[97] = ['T',['A']]
-        sdrm[121] = ['F',['Y']]
-        sdrm[140] = ['G',['A','S','C']]
-        sdrm[143] = ["Y",["C","H","R"]]
-        sdrm[147] = ['S',['G']]
-        sdrm[148] = ['Q',['H','K','R']]
-        sdrm[155] = ['N',['S','H']]
-      else raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
-      end
-      return sdrm
-    end
   end # end of ViralSeq::Sequence
 end # end of ViralSeq