RubyGems - viral_seq - Versions diffs - 1.6.4 → 1.7.1 - Mend

viral_seq 1.6.4 → 1.7.1

Files changed (12) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +20 -0
data/bin/tcs +44 -8
data/bin/tcs_log +1 -1
data/lib/viral_seq/seq_hash.rb +11 -8
data/lib/viral_seq/sequence.rb +5 -5
data/lib/viral_seq/string.rb +37 -0
data/lib/viral_seq/tcs_core.rb +4 -4
data/lib/viral_seq/version.rb +2 -2
metadata +2 -3
data/rc_swans.svc@longleaf.unc.edu +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 02d26d720fef0501d70b012d9919f932b23a21b1caabbf508a56fffa162c311b
-  data.tar.gz: '0681add2b2fa2ca7dedffeaaf43bd8b0e2b6200dd38633e2eeda58946ced8238'
+  metadata.gz: 4e6d55ab37ecd3b9c5688c99772fc49792a5319bac853ac768367a8b42c0e0b6
+  data.tar.gz: a69e78c80f22848facb41ad4f9d9fb64e6d4e47ff6e18afa3421d64513ce6558
 SHA512:
-  metadata.gz: 301c188d736c9812006d30db8995fa7df683cc63443c1370de3d487dae77b88cfc60c8b674abc93b35f7457ced536a6e374433e5fbe0f423e4a7993ea4240ebc
-  data.tar.gz: 1118ab7b586da98bb2c3533f81c35a02f0efb1978c0b91e15b56218b05342ad83e73f96bd2df53f75e2e6cdd16c591582ebe864562a02d3dd78997678b706233
+  metadata.gz: ae34ac12bd2b86d4c7fc040765b26b94d41cfe239a206b2e84bf55841988826bcfbf685e788b93224ee78e29b1280454059991d644f81cbf24f1b97fff3f2294
+  data.tar.gz: 254993ea2126ca51d0ad5e2b6be2dca90e1b3ed817266e46b5ca46f91d2a69288c0a87c58906ef3da8ad6465e08788c850d6bc72013e30f1ead13e186ba16dfd

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    viral_seq (1.6.2)
+    viral_seq (1.6.5)
       colorize (~> 0.1)
       combine_pdf (~> 1.0, >= 1.0.0)
       muscle_bio (~> 0.5)

data/README.md CHANGED Viewed

@@ -10,6 +10,8 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
 Specifically for Primer ID sequencing and HIV drug resistance analysis.
+CLI tools `tcs`, `tcs_sdrm`, `tcs_log` and `locator` included in the gem.
 #### tcs web app - https://primer-id.org/
 ## Illustration for the Primer ID Sequencing
@@ -22,6 +24,12 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
 [Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
 [Application of Primer ID sequencing in COVID-19 research](https://doi.org/10.1126/scitranslmed.abb5883)
+## Requirements
+Required Ruby Version: >= 2.5
+Required RubyGems version: >= 1.3.6
 ## Install
 ```bash
@@ -179,6 +187,18 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
 ## Updates
+### Version-1.7.1-05122023
+  1. Add a size check for the raw sequences. If the actual size is smaller than the input params, an error message will be sent to the user. If the actual size is greater than the input params, the extra bases will be truncated.
+  2. Now allows mismatches in the primer region sequences. The forward primer region allows 2 nt differences and the cDNA primer region allows 3 nt differences.
+  3. Bug fix.
+  4. TCS version to 2.5.2
+### Version-1.7.0-08242022
+  1. Add warnings if the `tcs` pipeline is executed from source instead of being installed from `gem`.
+  2. Optimized `ViralSeq::SeqHash#a3g` hypermut algorithm. Allows an external reference other than the sample reference.
 ### Version-1.6.4-07182022
   1. Included region "P17" in the default `tcs -d` pipeline setting. `tcs` pipeline updated to version 2.5.1.

data/bin/tcs CHANGED Viewed

@@ -22,20 +22,38 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
+# Install using `gem install viral_seq`
 # Use JSON file as the run param
 # run `tcs -j` to generate param json file.
-require 'viral_seq'
+def gem_installed?(gem_name)
+  found_gem = false
+  begin
+    found_gem = Gem::Specification.find_by_name(gem_name)
+  rescue Gem::LoadError
+    return false
+  else
+    return true
+  end
+end
+if gem_installed?('viral_seq')
+  require 'viral_seq'
+else
+  printf "\n****************************************************\n"
+  printf "**** THIS PACKAGE CANNOT BE RUN FROM SOURCE ********\n"
+  printf "**** PLEASE INSTALL USING `gem install viral_seq` **\n"
+  printf "****************************************************\n\n"
+  exit 1
+end
 require 'json'
 require 'colorize'
 require 'optparse'
 options = {}
-# banner = '-'*50 + "\n" +
-#         '| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |' + "\n" +
-#         '-'*50 + "\n"
 banner = "\n" +
 "████████  ██████ ███████     ██████  ██ ██████  ███████ ██      ██ ███    ██ ███████\n".light_red +
 "   ██    ██      ██          ██   ██ ██ ██   ██ ██      ██      ██ ████   ██ ██\n".light_yellow +
@@ -86,7 +104,7 @@ end.parse!
 if options[:json_generator]
   params = ViralSeq::TcsJson.generate
 elsif options[:dr]
-  params = ViralSeq::TcsDr::PARAMS
+  params = ViralSeq::TcsDr::PARAMS
 elsif (options[:params_json] && File.exist?(options[:params_json]))
   params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
 else
@@ -145,6 +163,24 @@ begin
     $platform_sequencing_length = 300
   end
+  r1_raw_size = r1_fastq_sh.dna_hash.values[0].size
+  r2_raw_size = r2_fastq_sh.dna_hash.values[0].size
+  if r1_raw_size >= $platform_sequencing_length
+    r1_size_diff = r1_raw_size - $platform_sequencing_length
+  else
+    raise StandardError.new "R1 size smaller than the input platform format #{$platform_sequencing_length} bp."
+  end
+  if r2_raw_size >= $platform_sequencing_length
+    r2_size_diff = r2_raw_size - $platform_sequencing_length
+  else
+    raise StandardError.new "R2 size smaller than the input platform format #{$platform_sequencing_length} bp."
+  end
+  r1_truncate_base_number = 2 + r1_size_diff
+  r2_truncate_base_number = 2 + r2_size_diff
   primers = params[:primer_pairs]
   if primers.empty? or primers.nil?
     ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
@@ -217,8 +253,8 @@ begin
       r2_seq = r2_passed_seq[seqtag]
       pid = r2_seq[0, pid_length]
       id[seqtag] = pid
-      bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
-      bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
+      bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-r2_truncate_base_number]
+      bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-r1_truncate_base_number]
     end
     # TCS cut-off

data/bin/tcs_log CHANGED Viewed

@@ -155,7 +155,7 @@ region_colors = {"Other" => "#808080"}
 CSV.foreach(log_file).each_with_index do |row, i|
     next if i == 0 || row[0] == nil
-    lib_name = row[0]
+    lib_name = row[0].to_s
     region = row[1]
     raw_sequences_per_barcode = row[2].to_i

data/lib/viral_seq/seq_hash.rb CHANGED Viewed

@@ -450,7 +450,7 @@ module ViralSeq
     # function to determine if the sequences have APOBEC3g/f hypermutation.
     #   # APOBEC3G/F pattern: GRD -> ARD
     #   # control pattern: G[YN|RC] -> A[YN|RC]
-    #   # use the sample consensus to determine potential a3g sites
+    #   # use the sample consensus to determine potential a3g sites (default) or provide external reference sequences as a `String`
     #   # Two criteria to identify hypermutation
     #   # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
     #   # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
@@ -486,7 +486,7 @@ module ViralSeq
     #   # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
     # @see https://www.hiv.lanl.gov/content/sequence/HYPERMUT/hypermut.html LANL Hypermut
-    def a3g_hypermut
+    def a3g_hypermut(ref = nil)
       # mut_hash number of apobec3g/f mutations per sequence
       mut_hash = {}
       hm_hash = {}
@@ -495,8 +495,10 @@ module ViralSeq
       # total G->A mutations at apobec3g/f positions.
       total = 0
-      # make consensus sequence for the input sequence hash
-      ref = self.consensus
+      unless ref
+        # make consensus sequence for the input sequence hash
+        ref = self.consensus
+      end
       # obtain apobec3g positions and control positions
       apobec = apobec3gf(ref)
@@ -509,7 +511,6 @@ module ViralSeq
         c = 0 # control muts
         d = 0 # potenrial controls
         mut.each do |n|
-          next if v[n] == "-"
           if v[n] == "A"
             a += 1
             b += 1
@@ -521,7 +522,6 @@ module ViralSeq
         total += a
         control.each do |n|
-          next if v[n] == "-"
           if v[n] == "A"
             c += 1
             d += 1
@@ -544,7 +544,7 @@ module ViralSeq
         end
       end
-      if self.dna_hash.size > 20
+      if self.dna_hash.size > 200
         rate = total.to_f/(self.dna_hash.size)
         count_mut = mut_hash.values.count_freq
         maxi_count = count_mut.values.max
@@ -566,10 +566,12 @@ module ViralSeq
           end
         end
       end
       hm_seq_hash = ViralSeq::SeqHash.new
       hm_hash.each do |k,_v|
         hm_seq_hash.dna_hash[k] = self.dna_hash[k]
       end
       hm_seq_hash.title = self.title + "_hypermut"
       hm_seq_hash.file = self.file
       filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
@@ -1356,7 +1358,7 @@ module ViralSeq
     # APOBEC3G/F pattern: GRD -> ARD,
     # control pattern: G[YN|RC] -> A[YN|RC],
     def apobec3gf(seq = '')
-      seq.tr!("-", "")
+      #seq.tr!("-", "")
       seq_length = seq.size
       apobec_position = []
       control_position = []
@@ -1368,6 +1370,7 @@ module ViralSeq
           control_position << n
         end
       end
       return [apobec_position,control_position]
     end # end of #apobec3gf

data/lib/viral_seq/sequence.rb CHANGED Viewed

@@ -180,7 +180,7 @@ module ViralSeq
         l1 = 0
         l2 = 0
-        aln_seq = ViralSeq::Muscle.align(ori_ref, seq, :PPP, path_to_muscle)
+        aln_seq = ViralSeq::Muscle.align(ori_ref, seq, :Super5, path_to_muscle)
         aln_test = aln_seq[1]
         aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
         gap_begin = $1.size
@@ -214,7 +214,7 @@ module ViralSeq
             l2 = l2 + (post_aln - b2)
           end
-          aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
+          aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
           aln_test = aln_seq[1]
           aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
           gap_begin = $1.size
@@ -263,7 +263,7 @@ module ViralSeq
         end
         while repeat == 1
-          aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
+          aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
           aln_test = aln_seq[1]
           aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
           gap_begin = $1.size
@@ -293,7 +293,7 @@ module ViralSeq
         end
         ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
-        aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
+        aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
         aln_test = aln_seq[1]
         ref = aln_seq[0]
@@ -307,7 +307,7 @@ module ViralSeq
         if (ori_ref_l - l2 - 1) >= l1
           ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
-          aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
+          aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
           aln_test = aln_seq[1]
           ref = aln_seq[0]

data/lib/viral_seq/string.rb CHANGED Viewed

@@ -56,6 +56,43 @@ class String
     Regexp.new match
   end
+  # parse the nucleotide sequences as an Array of Array
+  # @return [Array] Array of Array at each position
+  # @example parse a sequence with ambiguities to Array of Array
+  #   "ATRWCG".nt_to_array
+  #   => [["A"], ["T"], ["A", "G"], ["A", "T"], ["C"], ["G"]]
+  def nt_to_array
+    return_array = []
+    self.each_char.each do |base|
+      base_array = base.to_list
+      return_array.append base_array
+    end
+    return return_array
+  end
+  # compare the given nt sequence string with the ref sequence string
+  # @param ref [String] the ref sequence string to compare with
+  # @return [Integer] Number of differences
+  # @example count the differences against a ref sequence with ambiguities
+  #   "ATCACG".nt_diff("ATRWCG")
+  #   => 1
+  def nt_diff(ref)
+    count_diff = 0
+    self_array = self.split("")
+    ref_array = ref.nt_to_array
+    self_array.each_with_index do |nt, i|
+      ref_nt = ref_array[i]
+      unless ref_nt.include? nt
+        count_diff += 1
+      end
+    end
+    return count_diff
+  end
   # parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1
   # @return [Array] parsed nt bases
   # @example parse IUPAC `R`

data/lib/viral_seq/tcs_core.rb CHANGED Viewed

@@ -223,7 +223,7 @@ module ViralSeq
         end
         forward_bio_primer_size = forward_bio_primer.size
         forward_starting_number = forward_n + forward_bio_primer_size
-        forward_primer_ref = forward_bio_primer.nt_parser
+        #forward_primer_ref = forward_bio_primer.nt_parser
         r1_passed_seq = {}
         r1_raw = r1_sh.dna_hash
@@ -232,7 +232,7 @@ module ViralSeq
           seq = r1_raw[name]
           next unless general_filter seq
           primer_region_seq = seq[forward_n, forward_bio_primer_size]
-          if primer_region_seq =~ forward_primer_ref
+          if primer_region_seq.nt_diff(forward_bio_primer) < 3
             new_name = remove_tag name
             r1_passed_seq[new_name] = seq
           end
@@ -255,13 +255,13 @@ module ViralSeq
         cdna_bio_primer = $2
         cdna_bio_primer_size = cdna_bio_primer.size
         reverse_starting_number = pid_length + cdna_bio_primer_size
-        cdna_primer_ref = cdna_bio_primer.nt_parser
+       # cdna_primer_ref = cdna_bio_primer.nt_to_array
         r2_passed_seq = {}
         proc_filter = proc do |name|
           seq = r2_raw[name]
           next unless general_filter seq
           primer_region_seq = seq[pid_length, cdna_bio_primer_size]
-          if primer_region_seq =~ cdna_primer_ref
+          if primer_region_seq.nt_diff(cdna_bio_primer) < 4
             new_name = remove_tag name
             r2_passed_seq[new_name] = seq
           end

data/lib/viral_seq/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@
 # version info and history
 module ViralSeq
-  VERSION = "1.6.4"
-  TCS_VERSION = "2.5.1"
+  VERSION = "1.7.1"
+  TCS_VERSION = "2.5.2"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: viral_seq
 version: !ruby/object:Gem::Version
-  version: 1.6.4
+  version: 1.7.1
 platform: ruby
 authors:
 - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-07-19 00:00:00.000000000 Z
+date: 2023-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -193,7 +193,6 @@ files:
 - lib/viral_seq/tcs_dr.rb
 - lib/viral_seq/tcs_json.rb
 - lib/viral_seq/version.rb
-- rc_swans.svc@longleaf.unc.edu
 - viral_seq.gemspec
 homepage: https://github.com/ViralSeq/viral_seq
 licenses:

data/rc_swans.svc@longleaf.unc.edu DELETED Viewed

Binary file