RubyGems - viral_seq - Versions diffs - 1.0.6 → 1.0.7 - Mend

viral_seq 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -2
data/README.md +9 -2
data/bin/locator +11 -9
data/lib/viral_seq/muscle.rb +2 -2
data/lib/viral_seq/seq_hash.rb +112 -17
data/lib/viral_seq/seq_hash_pair.rb +9 -5
data/lib/viral_seq/version.rb +1 -1
data/viral_seq.gemspec +4 -0
metadata +17 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: df8d50f2dfbf0f2e7e1efcf66c8a91c286c7b5029533b725a4a33219882748bb
-  data.tar.gz: 4061c3875d4629025d1ccc216a54fdb7a011d397408a3ecb15125475e9f262e9
+  metadata.gz: bb326c97b25326286a51ec63583983a20dfebee2513fd8811bc855ec21ac0b5d
+  data.tar.gz: e9870bbaa8c17ba51d53e790ca8189e2dd362911e1b5cfcd4806a3bc68ccf369
 SHA512:
-  metadata.gz: a52087ced9fe258ef5bab4449b90e964ff9a557292dc1ce679aae03a56bd2570fdf1221e7026fec2b1ccb49ad2a9ff076338a397982e47c46877e2cdfb4e6d2e
-  data.tar.gz: 792cb9424fd46d536d0b95cfc90914a8548ee5ea6d1c3efe45cccd1d01c6dbd6b7a7ee0ba1be010bd6cf7a3ea201f4850803c28f16889b5286e1e458a774c8f1
+  metadata.gz: ff6e5727484687db04180a1ef9d3204e9ed02d9b1a98862bdb8796255680aca1e830667429a57db116702793dc55eeb7cc84800c39b27f8e2773186e1a638988
+  data.tar.gz: 86d0b03af6335cc91e38bc54a8c1fa7e2c84d430dc0adb02e4dc3819ebb188a0e8ae1e4c76c71e5066cac51675e0a45f9ee5a9b0bbd2de8b26da4fa04fe95d85

data/Gemfile.lock CHANGED

@@ -1,12 +1,14 @@
 PATH
   remote: .
   specs:
-    viral_seq (1.0.5)
+    viral_seq (1.0.7)
+      colorize (~> 0.1)
       muscle_bio (~> 0.4)
 GEM
   remote: https://rubygems.org/
   specs:
+    colorize (0.8.1)
     diff-lcs (1.3)
     muscle_bio (0.4.0)
     rake (10.5.0)
@@ -34,4 +36,4 @@ DEPENDENCIES
   viral_seq!
 BUNDLED WITH
-   2.0.2
+   2.1.4

data/README.md CHANGED

@@ -14,7 +14,7 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
     #!/usr/bin/env ruby
     require 'viral_seq'
 #### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
     $ locator -i sequence.fasta -o sequence.fasta.csv
@@ -51,6 +51,13 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
 ## Updates
+Version 1.0.7-01282020:
+    1. Several methods added, including
+        ViralSeq::SeqHash#error_table
+        ViralSeq::SeqHash#random_select
+    2. Improved performance for several functions.
 Version 1.0.6-07232019:
     1. Several methods added to ViralSeq::SeqHash, including
@@ -58,7 +65,7 @@ Version 1.0.6-07232019:
         ViralSeq::SeqHash#+
         ViralSeq::SeqHash#write_nt_fa
         ViralSeq::SeqHash#mutation
-    2. Update documentations and rspec samples.
+    2. Update documentations and rspec samples.
 Version 1.0.5-07112019:

data/bin/locator CHANGED

@@ -3,13 +3,14 @@
 require 'viral_seq'
 require 'csv'
 require 'optparse'
+require 'colorize'
 def myparser
   options = {}
   OptionParser.new do |opts|
-    opts.banner = "Usage: locator -i [nt_sequence_fasta_file] -o [locator_info_csv_file] -r [reference_genome_option]"
+    opts.banner = "#{"Usage:".red.bold} locator #{"-i".blue.bold} [nt_sequence_fasta_file] #{"-o".blue.bold} [locator_info_csv_file] #{"-r".blue.bold} [reference_genome_option]"
-    opts.on('-i', '--infile FASTA_FILE', 'nt sequence file in FASTA format') do |i|
+    opts.on('-i', '--infile FASTA_FILE', "#{"nt sequence".blue.bold} file in FASTA format") do |i|
       options[:infile] = i
     end
@@ -17,7 +18,7 @@ def myparser
       options[:outfile] = o
     end
-    opts.on('-r', '--ref_option OPTION', 'reference genome option, choose from `HXB2` (default), `NL43`, `MAC239`') do |o|
+    opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `NL43`, `MAC239`".blue.bold}") do |o|
       options[:ref_option] = o.to_sym
     end
@@ -35,9 +36,9 @@ def myparser
   return options
 end
-puts "\nSequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION}) by Shuntai Zhou"
-puts "See details at https://github.com/ViralSeq/viral_seq\n"
-puts "Resembling Sequence Locator from LANL (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n\n"
+puts "\n" + "Sequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION})".red.bold + " by " + "Shuntai Zhou".blue.bold
+puts "See details at " +  "https://github.com/ViralSeq/viral_seq\n".blue
+puts "Resembling" + " Sequence Locator ".magenta.bold + "from LANL" + " (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n".blue
 ARGV << '-h' if ARGV.size == 0
@@ -47,7 +48,7 @@ begin
   if options[:infile]
     seq_file = options[:infile]
   else
-    raise StandardError.new("Input file sequence file not found")
+    raise StandardError.new("Input file sequence file not found".red.bold)
   end
   if options[:outfile]
@@ -57,14 +58,14 @@ begin
   end
   unless File.exist?(seq_file)
-    raise StandardError.new("Input file sequence file not found")
+    raise StandardError.new("Input file sequence file not found".red.bold)
   end
   seqs = ViralSeq::SeqHash.fa(seq_file)
   opt =  options[:ref_option] ? options[:ref_option] : :HXB2
   unless [:HXB2, :NL43, :MAC239].include? opt
-    puts "Reference option #{opt} not recognized, using `:HXB2` as the reference genome."
+    puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
     opt = :HXB2
   end
@@ -76,6 +77,7 @@ begin
   end
   File.write(csv_file, data)
+  puts "Output file found at #{csv_file.green.bold}"
 rescue StandardError => e
   puts e.message
   puts "\n"

data/lib/viral_seq/muscle.rb CHANGED

@@ -39,8 +39,8 @@ module ViralSeq
     def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
       temp_dir = Dir.home
-      temp_file = temp_dir + "/_temp_muscle_in"
-      temp_aln = temp_dir + "/_temp_muscle_aln"
+      temp_file = File.join(temp_dir, "_temp_muscle_in")
+      temp_aln = File.join(temp_dir, "_temp_muscle_aln")
       name = ">test"
       temp_in = File.open(temp_file,"w")
       temp_in.puts ">ref"

data/lib/viral_seq/seq_hash.rb CHANGED

@@ -248,10 +248,12 @@ module ViralSeq
     def translate(codon_position = 0)
       seqs = self.dna_hash
       @aa_hash = {}
-      seqs.each do |name, seq|
-        s = ViralSeq::Sequence.new(name, seq)
+      seqs.uniq_hash.each do |seq, array_of_name|
+        s = ViralSeq::Sequence.new('name', seq)
         s.translate(codon_position)
-        @aa_hash[name] = s.aa_string
+        array_of_name.each do |name|
+          @aa_hash[name] = s.aa_string
+        end
       end
       return nil
     end # end of #translate
@@ -332,12 +334,13 @@ module ViralSeq
     def stop_codon(codon_position = 0)
       self.translate(codon_position)
       keys = []
-      self.aa_hash.each do |k,v|
-        keys << k if v.include?('*')
+      aa_seqs = self.aa_hash
+      aa_seqs.uniq_hash.each do |seq,array_of_name|
+        keys += array_of_name if seq.include?('*')
       end
       seqhash1 = self.sub(keys)
       seqhash1.title = self.title + "_stop"
-      keys2 = self.aa_hash.keys - keys
+      keys2 = aa_seqs.keys - keys
       seqhash2 = self.sub(keys2)
       return [seqhash1, seqhash2]
     end #end of #stop_codon
@@ -904,11 +907,11 @@ module ViralSeq
     # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
     # @example gap strip for an array of sequences
     #   array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
-    #   array = { AACCGGTT
-    #             A-CCGGTT
-    #             AAC-GGTT
-    #             AACCG-TT
-    #             AACCGGT- }
+    #   array = %w{ AACCGGTT
+    #               A-CCGGTT
+    #               AAC-GGTT
+    #               AACCG-TT
+    #               AACCGGT- }
     #   my_seqhash = ViralSeq::SeqHash.array(array)
     #   puts my_seqhash.gap_strip.dna_hash.values
     #     ACGT
@@ -963,12 +966,11 @@ module ViralSeq
     # @param (see #gap_strip)
     # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
     # @example gap strip for an array of sequences only at the ends
-    #   array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
-    #   array = { AACCGGTT
-    #             A-CCGGTT
-    #             AAC-GGTT
-    #             AACCG-TT
-    #             AACCGGT- }
+    #   array = %w{ AACCGGTT
+    #               A-CCGGTT
+    #               AAC-GGTT
+    #               AACCG-TT
+    #               AACCGGT- }
     #   my_seqhash = ViralSeq::SeqHash.array(array)
     #   puts my_seqhash.gap_strip_ends.dna_hash.values
     #     AACCGGT
@@ -1048,6 +1050,99 @@ module ViralSeq
       return new_seqhash
     end
+    # return a table of frequencies of nucleotides at each position.
+    # @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
+    # @param head [Boolean] if the head of table is included.
+    # @return [Array] a two-dimension array of the frequency table,
+    #  including the following info:
+    #    position on the sequence (starting from 1)
+    #    consensus nucleotide
+    #    total sequence numbers
+    #    frequency of A as a fraction (e.g. 0.2), shows "-" if A is the reference base
+    #    frequency of C as a fraction (e.g. 0.2), shows "-" if C is the reference base
+    #    frequency of G as a fraction (e.g. 0.2), shows "-" if G is the reference base
+    #    frequency of T as a fraction (e.g. 0.2), shows "-" if T is the reference base
+    #
+    # @example error table for an array of sequences
+    #   array = %w{ AACCGGTT
+    #               AGCCGGTT
+    #               AACTGCTT
+    #               AACCGTTA
+    #               AACCGGTA }
+    #   my_seqhash = ViralSeq::SeqHash.array(array)
+    #   my_seqhash.error_table.each {|r| puts r.join(',')}
+    #     position,consensus,total_seq_number,A,C,G,T
+    #     1,A,5,-,,,
+    #     2,A,5,-,,0.2,
+    #     3,C,5,,-,,
+    #     4,C,5,,-,,0.2
+    #     5,G,5,,,-,
+    #     6,G,5,,0.2,-,0.2
+    #     7,T,5,,,,-
+    #     8,T,5,0.4,,,-
+    def error_table(ref = self.consensus, head = true)
+      table = []
+      if head
+        table << %w{
+          position
+          consensus
+          total_seq_number
+          A
+          C
+          G
+          T
+        }
+      end
+      ref_size = ref.size
+      (0..(ref_size - 1)).each do |position|
+        ref_base = ref[position]
+        nts = []
+        self.dna_hash.each do |_k,v|
+          nts << v[position]
+        end
+        freq = nts.count_freq
+        freq2 = {}
+        freq.each do |nt,c|
+          if nt == ref_base
+            freq2[nt] = '-'
+          else
+            freq2[nt] = (c/(self.size).to_f)
+          end
+        end
+        table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
+      end
+      return table
+    end # end of error_table
+    # randomly select n number of sequences from the original SeqHash object
+    # @param n [Integer] number of sequences to randomly select
+    # @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
+    def random_select(n = 100)
+      new_sh = ViralSeq::SeqHash.new
+      dna_hash = self.dna_hash
+      aa_hash = self.aa_hash
+      qc_hash = self.qc_hash
+      keys = dna_hash.keys.sample(n)
+      keys.each do |k|
+        new_sh.dna_hash[k] = dna_hash[k]
+        new_sh.aa_hash[k] = aa_hash[k]
+        new_sh.qc_hash[k] = qc_hash[k]
+      end
+      new_sh.title = self.title + "_" + n.to_s
+      return new_sh
+    end
     # start of private functions

data/lib/viral_seq/seq_hash_pair.rb CHANGED

@@ -7,7 +7,7 @@ module ViralSeq
   # @example join the paired-end sequences with an overlap of 100 bp
   #   my_seqhashpair.join1(100)
   # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
-  #   my_seqhashpair.join1(:indiv)
+  #   my_seqhashpair.join2(model: :indiv)
   class SeqHashPair
@@ -104,17 +104,21 @@ module ViralSeq
       raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
       raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
       joined_seq = {}
-      seq_pair_hash.each do |seq_name, seq_pair|
+      seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
         r1_seq = seq_pair[0]
         r2_seq = seq_pair[1]
         if overlap.zero?
-          joined_seq[seq_name] = r1_seq + r2_seq
+          joined_sequence = r1_seq + r2_seq
         elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
-          joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
+          joined_sequence= r1_seq + r2_seq[overlap..-1]
         else
           next
         end
+        seq_names.each do |seq_name|
+          joined_seq[seq_name] = joined_sequence
+        end
       end
       joined_seq_hash = ViralSeq::SeqHash.new
       joined_seq_hash.dna_hash = joined_seq
       joined_seq_hash.title = self.title + "_joined"
@@ -139,7 +143,7 @@ module ViralSeq
     #   my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
     #   my_seqhashpair.join2.dna_hash
     #   => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
-    #   my_seqhashpair.join2(model :indiv).dna_hash
+    #   my_seqhashpair.join2(model: :indiv).dna_hash
     #   => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
     def join2(model: :con, diff: 0.0)

data/lib/viral_seq/version.rb CHANGED

@@ -2,5 +2,5 @@
 # version info and history
 module ViralSeq
-  VERSION = "1.0.6"
+  VERSION = "1.0.7"
 end

data/viral_seq.gemspec CHANGED

@@ -31,5 +31,9 @@ Gem::Specification.new do |spec|
   # muscle_bio gem required
   spec.add_runtime_dependency "muscle_bio", "~> 0.4"
+  # colorize gem required
+  spec.add_runtime_dependency "colorize", "~> 0.1"
   spec.requirements << 'R required for some functions'
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: viral_seq
 version: !ruby/object:Gem::Version
-  version: 1.0.6
+  version: 1.0.7
 platform: ruby
 authors:
 - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-07-23 00:00:00.000000000 Z
+date: 2020-01-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -67,6 +67,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.4'
+- !ruby/object:Gem::Dependency
+  name: colorize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
 description: |-
   A Ruby Gem with bioinformatics tools for processing viral NGS data.
                             Specifically for Primer-ID sequencing and HIV drug resistance analysis.
@@ -124,7 +138,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements:
 - R required for some functions
-rubygems_version: 3.0.3
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.