RubyGems - viral_seq - Versions diffs - 1.2.7 → 1.4.0 - Mend

viral_seq 1.2.7 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 554845dba339d0e06b84c88bc117258516f391bdf58cce015c2669e7b2c6c0d5
-  data.tar.gz: 870280337c90d1f5b9ecbea6e6478d7e2dc22aa70917c6b2ecd94afaa185c1c6
+  metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
+  data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
 SHA512:
-  metadata.gz: 54db76e6fd8333ccebb19dee602378ec8dbe5d196ec7bd675e55f65db80cb06ac2ab51ce1f13ab7ea65c0a50ad49978bd3e9581074c497b298f0912858946fa8
-  data.tar.gz: 03d02329192465a9f278715c8a85e3a910e5c5c7252026980d29e669df823a5bdb4be323eeb56f7c9804b71fa8f1763c5a526227f3764315d6eb8e208934ce81
+  metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
+  data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead

data/Gemfile.lock CHANGED Viewed

@@ -1,12 +1,12 @@
 PATH
   remote: .
   specs:
-    viral_seq (1.1.1)
-      colorize (>= 0.1)
-      combine_pdf (>= 1.0.0)
-      muscle_bio (>= 0.4)
-      prawn (>= 2.3.0)
-      prawn-table (>= 0.2.0)
+    viral_seq (1.3.0)
+      colorize (~> 0.1)
+      combine_pdf (~> 1.0, >= 1.0.0)
+      muscle_bio (~> 0.4)
+      prawn (~> 2.3, >= 2.3.0)
+      prawn-table (~> 0.2, >= 0.2.0)
 GEM
   remote: https://rubygems.org/

data/README.md CHANGED Viewed

@@ -179,10 +179,28 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
 ## Updates
+### Version 1.4.0-10132021
+  1. Added a function to calculate the false detection rate (FDR, a.k.a. Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
+  2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
+### Version 1.3.0-08302021
+  1. Fixed a bug in the `tcs` pipeline.
+### Version 1.2.9-08022021
+  1. Fixed a bug when reading the input primer sequences in lowercase.
+  2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
+### Version 1.2.8-07292021
+  1. Fixed an issue when reading .fastq files containing blank lines.
 ### Version 1.2.7-07152021
-  1. Optimzed the workflow of the `tcs` pipeline on raw data with uneven lengths.
-  `tcs` version to v2.3.5.
+  1. Optimzed the workflow of the `tcs` pipeline on raw data with uneven lengths.
+  `tcs` version to v2.3.6.
 ### Version 1.2.6-07122021

data/bin/tcs CHANGED Viewed

@@ -152,8 +152,8 @@ begin
     primer[:region] ? region = primer[:region] : region = "region"
     summary_json[:primer_set_name] = region
-    cdna_primer = primer[:cdna]
-    forward_primer = primer[:forward]
+    cdna_primer = primer[:cdna].upcase
+    forward_primer = primer[:forward].upcase
     export_raw = primer[:export_raw]
     limit_raw = primer[:limit_raw]
@@ -401,7 +401,11 @@ begin
       when 4
         joined_sh = shp.join2(model: :indiv)
       end
-      return joined_sh
+      if joined_sh
+        return joined_sh
+      else
+        joined_sh = ViralSeq::SeqHash.new
+      end
     end
     if primer[:end_join]

data/bin/tcs_sdrm CHANGED Viewed

@@ -91,12 +91,12 @@ libs.each do |lib|
   point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
   point_mutation_out = File.open(point_mutation_file, "w")
   point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
-                          "number,percentage,95% CI low, 95% CI high, notes"
+                          "number,frequency,95% CI low,95% CI high,fdr,notes"
   linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
   linkage_out = File.open(linkage_file, "w")
   linkage_out.puts "region,TCS,mutation linkage,number," +
-                   "percentage,95% CI low, 95% CI high, notes"
+                   "frequency,95% CI low, 95% CI high, notes"
   aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
   aa_report_out = File.open(aa_report_file, "w")
@@ -132,6 +132,7 @@ libs.each do |lib|
       stop_codon_seqs = stop_codon_check[:with_stop_codon]
       filtered_seqs = stop_codon_check[:without_stop_codon]
       poisson_minority_cutoff = filtered_seqs.pm
+      fdr_hash = filtered_seqs.fdr
       summary_hash[:PR] = [
                             seqs.size.to_s,
                             a3g_seqs.size.to_s,
@@ -142,7 +143,7 @@ libs.each do |lib|
       next if filtered_seqs.size < 3
       filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
-      sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
+      sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
       point_mutation_list += sdrm[0]
       linkage_list += sdrm[1]
       aa_report_list += sdrm[2]
@@ -155,6 +156,7 @@ libs.each do |lib|
       stop_codon_seqs = stop_codon_check[:with_stop_codon]
       filtered_seqs = stop_codon_check[:without_stop_codon]
       poisson_minority_cutoff = filtered_seqs.pm
+      fdr_hash = filtered_seqs.fdr
       summary_hash[:IN] = [
                             seqs.size.to_s,
                             a3g_seqs.size.to_s,
@@ -165,7 +167,7 @@ libs.each do |lib|
       next if filtered_seqs.size < 3
       filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
-      sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
+      sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
       point_mutation_list += sdrm[0]
       linkage_list += sdrm[1]
       aa_report_list += sdrm[2]
@@ -190,6 +192,7 @@ libs.each do |lib|
       reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
       filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
       poisson_minority_cutoff = filtered_seqs.pm
+      fdr_hash = filtered_seqs.fdr
       summary_hash[:RT] = [
                             seqs.size.to_s,
                             hypermut_seq_keys.size.to_s,
@@ -200,7 +203,7 @@ libs.each do |lib|
       next if filtered_seqs.size < 3
       filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
-      sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
+      sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
       point_mutation_list += sdrm[0]
       linkage_list += sdrm[1]
       aa_report_list += sdrm[2]
@@ -346,7 +349,7 @@ libs.each do |lib|
       title: "Surveillance Drug Resistance Mutations",
       file: point_mutation_file,
       newPDF: "",
-      table_width: [65,55,85,80,60,65,85,85,85,45],
+      table_width: [60,50,70,65,65,60,75,70,70,70,45],
       extra_text: "* Mutation below Poisson cut-off for minority mutations"
     },
     {

data/lib/viral_seq/hivdr.rb CHANGED Viewed

@@ -9,10 +9,13 @@ module ViralSeq
     #   IN codon 53-174 (HXB2 4384-4751)
     # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
     #   can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
+    # @param fdr_hash [Hash] hash of events => (false detection rate),
+    #   can be obtained using ViralSeq::SeqHash#fdr
+    #
     # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
     #
     #   # point_mutation_list: two-dimensional array for the following information,
-    #     # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
+    #     # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
     #   # linkage_list: two-dimensional array for the following information,
     #     # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
     #   # report_list: two-dimensional array for the following information,
@@ -20,12 +23,13 @@ module ViralSeq
     # @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
     #   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
     #   p_cut_off = my_seqhash.pm
-    #   pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off)
-    #   puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"; pr_sdrm[0].each {|n| puts n.join(',')}
-    #   => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
-    #   => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
-    #   => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
-    #   => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
+    #   fdr_hash = my_seqhash.fdr
+    #   pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
+    #   puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
+    #   => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
+    #   => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
+    #   => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
+    #   => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
     #
     #   puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
     #   => region,tcs_number,linkage,count,%,CI_low,CI_high,label
@@ -136,7 +140,7 @@ module ViralSeq
     #   => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
     #   => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-    def sdrm_hiv_pr(cutoff = 0)
+    def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
       sequences = self.dna_hash
       region = "PR"
       rf_label = 0
@@ -167,8 +171,9 @@ module ViralSeq
         count_mut_list = mut_list.count_freq
         count_mut_list.each do |m,number|
           ci = ViralSeq::Math::BinomCI.new(number, n_seq)
+          fdr = fdr_hash[number].round(5)
           label = number < cutoff ? "*" : ""
-          point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
+          point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
         end
       end
       point_mutation_list.sort_by! {|record| record[2]}
@@ -229,7 +234,7 @@ module ViralSeq
     # @param (see #sdrm_hiv_pr)
     # @return (see #sdrm_hiv_pr)
-    def sdrm_hiv_rt(cutoff = 0)
+    def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
       sequences = self.dna_hash
       region = "RT"
       rf_label = 1
@@ -280,8 +285,9 @@ module ViralSeq
         count_mut_list = mut_list.count_freq
         count_mut_list.each do |m,number|
           ci = ViralSeq::Math::BinomCI.new(number, n_seq)
+          fdr = fdr_hash[number].round(5)
           label = number < cutoff ? "*" : ""
-          point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
+          point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
         end
       end
@@ -291,8 +297,9 @@ module ViralSeq
         count_mut_list = mut_list.count_freq
         count_mut_list.each do |m,number|
           ci = ViralSeq::Math::BinomCI.new(number, n_seq)
+          fdr = fdr_hash[number].round(5)
           label = number < cutoff ? "*" : ""
-          point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
+          point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
         end
       end
@@ -365,7 +372,7 @@ module ViralSeq
     # @param (see #sdrm_hiv_pr)
     # @return (see #sdrm_hiv_pr)
-    def sdrm_hiv_in(cutoff = 0)
+    def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
       sequences = self.dna_hash
       region = "IN"
       rf_label = 2
@@ -397,8 +404,9 @@ module ViralSeq
         count_mut_list = mut_list.count_freq
         count_mut_list.each do |m,number|
           ci = ViralSeq::Math::BinomCI.new(number, n_seq)
+          fdr = fdr_hash[number].round(5)
           label = number < cutoff ? "*" : ""
-          point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
+          point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
         end
       end
       point_mutation_list.sort_by! {|record| record[2]}

data/lib/viral_seq/math.rb CHANGED Viewed

@@ -31,7 +31,7 @@ module ViralSeq
       def rand
         if (@compute_next_pair = !@compute_next_pair)
           theta = 2 * ::Math::PI * @rng.call
-          scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
+          scale = @sd * ::Math.sqrt(-2 * ::Math.log(1 - @rng.call))
           @g1 = @mean + scale * ::Math.sin(theta)
           @g0 = @mean + scale * ::Math.cos(theta)
         else

data/lib/viral_seq/seq_hash.rb CHANGED Viewed

@@ -116,6 +116,8 @@ module ViralSeq
       File.open(fastq_file,'r') do |file|
         file.readlines.collect do |line|
+          line.tr!("\u0000","")
+          next if line == "\n"
           count +=1
           count_m = count % 4
           if count_m == 1
@@ -590,6 +592,37 @@ module ViralSeq
     alias_method :pm, :poisson_minority_cutoff
+    # calculate false detection rate for minority mutations
+    # Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
+    # @param error_rate [Float] estimated sequencing error rate
+    # @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
+    # @example calculate FDR for mutations that appeared twice in the sample dataset
+    #   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
+    #   fdr_hash = my_seqhash.fdr
+    #   fdr_hash[2].round(5)
+    #   => 0.00726 # means that mutations appearing twice have a 0.007261748 chance of being caused by residual errors.
+    def fdr(error_rate = 0.0001)
+      sequences = self.dna_hash.values
+      if sequences.size == 0
+        return {}
+      else
+        seq_count = self.size
+        observed_hash = variant_for_poisson(sequences)
+        p_unadjusted = []
+        observed_hash.each do |k, v|
+          p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
+          p_unadjusted += Array.new(v, p_value)
+        end
+        p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
+        vars_pair = observed_hash.to_a
+        fdr_hash = Hash.new(0)
+        (0..(p_fdr.size - 1)).each do |i|
+          fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
+        end
+        return fdr_hash
+      end
+    end #end of #fdr
     # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
     # @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio

data/lib/viral_seq/tcs_core.rb CHANGED Viewed

@@ -305,7 +305,8 @@ module ViralSeq
       end
       def general_filter(seq)
-        if seq.size < ($platform_sequencing_length - 1)
+        return false unless seq
+        if seq.size < ($platform_sequencing_length - 10)
           return false
         elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
           return false

data/lib/viral_seq/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@
 # version info and history
 module ViralSeq
-  VERSION = "1.2.7"
-  TCS_VERSION = "2.3.6"
+  VERSION = "1.4.0"
+  TCS_VERSION = "2.3.8"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: viral_seq
 version: !ruby/object:Gem::Version
-  version: 1.2.7
+  version: 1.4.0
 platform: ruby
 authors:
 - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-07-15 00:00:00.000000000 Z
+date: 2021-10-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler