RubyGems - viral_seq - Versions diffs - 1.4.0 → 1.5.0 - Mend

viral_seq 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
-  data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
+  metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
+  data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
 SHA512:
-  metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
-  data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead
+  metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
+  data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6

data/README.md CHANGED Viewed

@@ -179,6 +179,14 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
 ## Updates
+### Version 1.5.0-01042022
+  1. Added a function to calculate detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
+  2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
+  3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
+  4. Added a function for DNA variant analysis. Return a Hash object that can output as a JSON file. `ViralSeq::SeqHash#nt_variants`
+  5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
 ### Version 1.4.0-10132021
   1. Added a function to calculate false detection rate (FDR, aka, Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`

data/bin/tcs CHANGED Viewed

@@ -200,7 +200,7 @@ begin
     summary_json[:paired_raw_sequence] = paired_seq_number
     if paired_seq_number < raw_sequence_number * 0.001
       summary_json[:warnings] <<
-        "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
+        "WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
     end
     common_keys.each do |seqtag|

data/lib/viral_seq/seq_hash.rb CHANGED Viewed

@@ -208,6 +208,31 @@ module ViralSeq
       return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
     end
+    # return a new SeqHash object restricted to the given range of positions on the nt sequences
+    # @param range [Range] range of positions on the nt sequence
+    # @return [ViralSeq::SeqHash] a sub SeqHash object
+    def nt_range(range)
+      dna_hash = self.dna_hash
+      new_hash = {}
+      dna_hash.each do |k,v|
+        new_hash[k] = v[range]
+      end
+      ViralSeq::SeqHash.new(new_hash)
+    end # end of #nt_range
+    # check the size range of the DNA sequences of the SeqHash object
+    # @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
+    def check_nt_size
+      dna_hash = self.dna_hash
+      size_array = []
+      dna_hash.values.each do |v|
+        size_array << v.size
+      end
+      return { max: size_array.max, min: size_array.min }
+    end
     # write the nt sequences to a FASTA format file
     # @param file [String] path to the FASTA output file
     # @return [NilClass]
@@ -624,6 +649,67 @@ module ViralSeq
       end
     end #end of #fdr
+    # analysis for the nt sequence variants.
+    # @return [Hash] a Hash with the results of the variant analysis. For the keys/values of the returned object, see /docs/variants_structure.pdf
+    def nt_variants
+      return_obj = {}
+      nt_hash = self.dna_hash
+      tcs_number = self.size
+      dl = ViralSeq::TcsCore.detection_limit(tcs_number)
+      fdr_hash = self.fdr
+      pm_cut_off = self.pm
+      con = self.consensus
+      return_obj[:tcs_number] = tcs_number
+      return_obj[:lower_detection_limit] = dl
+      return_obj[:pm_cut_off] = pm_cut_off
+      return_obj[:positions] = []
+      cis = {}
+      (0..(con.size - 1)).each do |p|
+        position_obj = {}
+        position_obj[:position] = p + 1
+        position_obj[:tcs_number] = tcs_number
+        position_obj[:lower_detection_limit] = dl
+        position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
+        nts = []
+        dna_hash.each do |n,s|
+          nts << s[p]
+        end
+        freq_hash = nts.count_freq
+        [:A, :C, :G, :T, :-].each do |k|
+          v = freq_hash[k.to_s]
+          position_obj[k] = {}
+          position_obj[k][:count] = v
+          if v > 0
+            if cis[[v, tcs_number]]
+              ci = cis[[v, tcs_number]]
+            else
+              ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
+              cis[[v, tcs_number]] = ci
+            end
+            position_obj[k][:freq] = ci.mean.round(4)
+            position_obj[k][:freq_ci_low] = ci.lower.round(4)
+            position_obj[k][:freq_ci_high] = ci.upper.round(4)
+            position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
+            position_obj[k][:fdr] = fdr_hash[v]
+          else
+            position_obj[k][:freq] = 0
+            position_obj[k][:freq_ci_low] = 0
+            position_obj[k][:freq_ci_high] = 0
+            position_obj[k][:greater_than_pm] = false
+            position_obj[k][:fdr] = nil
+          end
+        end
+        return_obj[:positions] << position_obj
+      end
+      return_obj
+    end # end of nt_variants
     # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
     # @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
     # @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
@@ -1214,6 +1300,28 @@ module ViralSeq
       return new_sh
     end
+    # QC for each nucleotide sequence comparing with sample consensus for indels
+    # @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
+    def qc_indel
+      con = self.consensus
+      dna_hash = self.dna_hash
+      names_passed = []
+      names_indel = []
+      dna_hash.uniq_hash.each do |seq, names|
+        if seq.compare_with(con) < 4
+          names_passed += names
+        elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
+          names_indel += names
+        else
+          names_passed += names
+        end
+      end
+      return {no_indel: self.sub(names_passed),
+        has_indel: self.sub(names_indel)}
+    end # end of qc_indel
     # trim dna sequences based on the provided reference coordinates.
     # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
     # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array

data/lib/viral_seq/tcs_core.rb CHANGED Viewed

@@ -280,6 +280,19 @@ module ViralSeq
         abort infor.red.bold
       end
+      # lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
+      # R required.
+      # @param tcs_number [Integer] number of TCS
+      # @return [Float] lower detection limit
+      # @example calculate lower detection limit
+      #   ViralSeq::TcsCore.detection_limit(100)
+      #   => 0.0362
+      def detection_limit(tcs_number)
+        dl =  `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
+        dl.to_f
+      end
       private
       def unzip_r(indir, f)

data/lib/viral_seq/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@
 # version info and history
 module ViralSeq
-  VERSION = "1.4.0"
-  TCS_VERSION = "2.3.8"
+  VERSION = "1.5.0"
+  TCS_VERSION = "2.4.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: viral_seq
 version: !ruby/object:Gem::Version
-  version: 1.4.0
+  version: 1.5.0
 platform: ruby
 authors:
 - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-10-14 00:00:00.000000000 Z
+date: 2022-01-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler