RubyGems - viral_seq - Versions diffs - 1.0.7 → 1.0.12 - Mend

viral_seq 1.0.7 → 1.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/Gemfile.lock +3 -3
data/README.md +119 -50
data/bin/locator +20 -0
data/bin/tcs +454 -0
data/lib/viral_seq.rb +4 -1
data/lib/viral_seq/constant.rb +5 -1
data/lib/viral_seq/enumerable.rb +0 -10
data/lib/viral_seq/hash.rb +1 -1
data/lib/viral_seq/hivdr.rb +1 -1
data/lib/viral_seq/sdrm.rb +43 -0
data/lib/viral_seq/seq_hash.rb +61 -25
data/lib/viral_seq/seq_hash_pair.rb +7 -1
data/lib/viral_seq/tcs_core.rb +305 -0
data/lib/viral_seq/tcs_json.rb +178 -0
data/lib/viral_seq/version.rb +2 -1
data/viral_seq.gemspec +1 -1
metadata +10 -5

data/lib/viral_seq.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2019 Shuntai Zhou (shuntai.zhou@gmail.com)
+# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
 require_relative "viral_seq/sequence"
 require_relative "viral_seq/string"
 require_relative "viral_seq/version"
+require_relative "viral_seq/tcs_core"
+require_relative "viral_seq/tcs_json"
 require "muscle_bio"

data/lib/viral_seq/constant.rb CHANGED Viewed

@@ -1,7 +1,11 @@
 module ViralSeq
   # array for all amino acid one letter abbreviations
   AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
+  SDRM_HIV_PR_LIST = {}
+  SDRM_HIV_RT_LIST = {}
+  SDRM_HIV_IN_LIST = {}
 end

data/lib/viral_seq/enumerable.rb CHANGED Viewed

@@ -3,10 +3,6 @@
 #   array = [1,2,3,4,5,6,7,8,9,10]
 #   array.median
 #   => 5.5
-# @example sum
-#   array = [1,2,3,4,5,6,7,8,9,10]
-#   array.sum
-#   => 55
 # @example average number (mean)
 #   array = [1,2,3,4,5,6,7,8,9,10]
 #   array.mean
@@ -45,12 +41,6 @@ module Enumerable
     len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
   end
-  # generate summed value
-  # @return [Numeric] summed value
-  def sum
-     self.inject(0){|accum, i| accum + i }
-  end
   # generate mean number
   # @return [Float] mean value
   def mean

data/lib/viral_seq/hash.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-# addition methods for Class::Hash required for ViralSeq
+# additional methods for Class::Hash required for ViralSeq
 class Hash

data/lib/viral_seq/hivdr.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module ViralSeq
-  class SeqHash
+  class SDRM
     # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
     #   works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)

data/lib/viral_seq/sdrm.rb ADDED Viewed

@@ -0,0 +1,43 @@
+module ViralSeq
+  class DRMs
+    def initialize (mutation_list = {})
+      @mutation_list = mutation_list
+    end
+    attr_accessor :mutation_list
+  end
+  def self.sdrm_hiv_pr(seq_hash)
+  end
+  def self.sdrm_hiv_rt(seq_hash)
+  end
+  def self.sdrm_hiv_in(seq_hash)
+  end
+  def self.list_from_json(file)
+  end
+  def self.list_from_csv(file)
+  end
+  def self.export_list_hiv_pr(file, format = :json)
+    if foramt == :json
+    end
+  end
+  def self.export_list_hiv_rt(file, format = :json)
+  end
+  def self.export_list_hiv_in(file, format = :json)
+  end
+  def drm_analysis(seq_hash)
+    mutation_list = self.mutation_list
+  end
+end

data/lib/viral_seq/seq_hash.rb CHANGED Viewed

@@ -9,7 +9,7 @@ module ViralSeq
   #     # align with MUSCLE
   #   filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
   #     # filter nt sequences with the reference coordinates
-  #   filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
+  #   filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
   #     # return a new ViralSeq::SeqHash object without stop codons
   #   filtered_seqhash = filtered_seqhash.a3g[1]
   #     # further filter out sequences with A3G hypermutations
@@ -130,8 +130,8 @@ module ViralSeq
           end
         end
       end
-      sequence_hash = Hash[*sequence_a]
-      quality_hash = Hash[*quality_a]
+      sequence_hash = Hash[sequence_a.each_slice(2).to_a]
+      quality_hash = Hash[quality_a.each_slice(2).to_a]
       seq_hash = ViralSeq::SeqHash.new
       seq_hash.dna_hash = sequence_hash
@@ -181,6 +181,7 @@ module ViralSeq
       new_seqhash = ViralSeq::SeqHash.new
       new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
       new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
+      new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
       new_seqhash.title = self.title + "_with_" + sh2.title
       new_seqhash.file = self.file + "," + sh2.file
       return new_seqhash
@@ -312,22 +313,22 @@ module ViralSeq
     # screen for sequences with stop codons.
     # @param (see #translate)
-    # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
+    # @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
     #
-    #   # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
-    #   # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
+    #   # :with_stop_codon : ViralSeq::SeqHash object with stop codons
+    #   # :without_stop_codon: ViralSeq::SeqHash object without stop codons
     # @example given a hash of sequences, return a sub-hash with only the sequences that contain stop codons
     #   my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
     #   my_seqhash.dna_hash
     #   => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
-    #   stop_codon_seqhash = my_seqhash.stop_codon[0]
+    #   stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
     #   stop_codon_seqhash.dna_hash
     #   => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
     #   stop_codon_seqhash.aa_hash
     #   => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
     #   stop_codon_seqhash.title
     #   => "my_fasta_file_stop"
-    #   filtered_seqhash = my_seqhash.stop_codon[1]
+    #   filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
     #   filtered_seqhash.aa_hash
     #   {">seq1"=>"IRT", ">seq3"=>"MRT"}
@@ -342,12 +343,15 @@ module ViralSeq
       seqhash1.title = self.title + "_stop"
       keys2 = aa_seqs.keys - keys
       seqhash2 = self.sub(keys2)
-      return [seqhash1, seqhash2]
+      return {
+        with_stop_codon: seqhash1,
+        without_stop_codon: seqhash2
+      }
     end #end of #stop_codon
     # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
-    # @param cutoff [Float] majority cut-off for calling consensus bases. defult at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
+    # @param cutoff [Float] majority cut-off for calling consensus bases. Default is 0.5; a position with 15% "A" and 85% "G" will be called as "G" with a 20% cut-off and as "R" with a 10% cut-off. Using 0 applies the simple majority rule (no cut-off): only the most frequent base(s) at each position are used.
     # @return [String] consensus sequence
     # @example consensus sequence from an array of sequences.
     #   seq_array = %w{ ATTTTTTTTT
@@ -379,11 +383,18 @@ module ViralSeq
         base_count = all_base.count_freq
         max_base_list = []
-        base_count.each do |k,v|
-          if v/seq_size.to_f >= cutoff
-            max_base_list << k
+        if cutoff.zero?
+          max_count = base_count.values.max
+          max_base_hash = base_count.select {|_k,v| v == max_count}
+          max_base_list = max_base_hash.keys
+        else
+          base_count.each do |k,v|
+            if v/seq_size.to_f >= cutoff
+              max_base_list << k
+            end
           end
         end
         consensus_seq += call_consensus_base(max_base_list)
       end
       return consensus_seq
@@ -394,14 +405,14 @@ module ViralSeq
     #   # control pattern: G[YN|RC] -> A[YN|RC]
     #   # use the sample consensus to determine potential a3g sites
     #   # Two criteria to identify hypermutation
-    #   # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
+    #   # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
     #   # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
     #   # note:  criteria 2 only applies on a sequence file containing more than 20 sequences,
     #   #        b/c Poisson model does not do well on small sample size.
-    # @return [Array] three values.
-    #   first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
-    #   second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
-    #   third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
+    # @return [Hash] three pairs.
+    #   :a3g_seq : a ViralSeq::SeqHash object for sequences with hypermutations
+    #   :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
+    #   :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
     #     # sequence tag
     #     # G to A mutation numbers at potential a3g positions
     #     # total potential a3g G positions
@@ -412,17 +423,17 @@ module ViralSeq
     # @example identify apobec3gf mutations from a sequence fasta file
     #   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
     #   hypermut = my_seqhash.a3g
-    #   hypermut[0].dna_hash.keys
+    #   hypermut[:a3g_seq].dna_hash.keys
     #   => [">Seq7", ">Seq14"]
-    #   hypermut[1].dna_hash.keys
+    #   hypermut[:filtered_seq].dna_hash.keys
     #   => [">Seq1", ">Seq2", ">Seq5"]
-    #   hypermut[2]
+    #   hypermut[:stats]
     #   => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
     #
     # @example identify apobec3gf mutations from another sequence fasta file
     #   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
     #   hypermut = my_seqhash.a3g
-    #   hypermut[2]
+    #   hypermut[:stats]
     #   => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
     #   # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
     #   # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
@@ -515,7 +526,10 @@ module ViralSeq
       hm_seq_hash.title = self.title + "_hypermut"
       hm_seq_hash.file = self.file
       filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
-      return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
+      return { a3g_seq: hm_seq_hash,
+               filtered_seq: filtered_seq_hash,
+               stats: hm_hash.values
+              }
     end #end of #a3g_hypermut
     alias_method :a3g, :a3g_hypermut
@@ -535,7 +549,7 @@ module ViralSeq
       if sequences.size == 0
         return 0
       else
-        cut_off = 1
+        cut_off = Float::INFINITY
         l = sequences[0].size
         rate = sequences.size * error_rate
         count_mut = variant_for_poisson(sequences)
@@ -544,7 +558,7 @@ module ViralSeq
         poisson_hash.each do |k,v|
           cal = l * v
-          obs = count_mut[k] ? count_mut[k] : 0
+          obs = count_mut[k] ? count_mut[k] : 1
           if obs >= fold_cutoff * cal
             cut_off = k
             break
@@ -729,6 +743,7 @@ module ViralSeq
       seq_hash_unique.each do |seq|
         loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
+        next unless loc # if locator tool fails, skip this seq.
         if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
           if indel
             seq_hash_unique_pass << seq
@@ -1144,6 +1159,27 @@ module ViralSeq
       return new_sh
     end
+    # trim dna sequences based on the provided reference coordinates.
+    # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
+    # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
+    # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
+    # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
+    # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
+    def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
+      seq_hash = self.dna_hash.dup
+      seq_hash_unique = seq_hash.uniq_hash
+      trimmed_seq_hash = {}
+      seq_hash_unique.each do |seq, names|
+        trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
+        names.each do |name|
+          trimmed_seq_hash[name] = trimmed_seq
+        end
+      end
+      return_seq_hash = self.dup
+      return_seq_hash.dna_hash = trimmed_seq_hash
+      return return_seq_hash
+    end
     # start of private functions
     private

data/lib/viral_seq/seq_hash_pair.rb CHANGED Viewed

@@ -80,6 +80,12 @@ module ViralSeq
       alias_method :fa, :new_from_fasta
     end
+    # the size of nt sequence hash of the SeqHashPair object
+    # @return [Integer] size of nt sequence hash of the SeqHashPair object
+    def size
+      self.dna_hash.size
+    end
     # Pair-end join function for KNOWN overlap size.
     # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
     # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
@@ -211,7 +217,7 @@ module ViralSeq
     # {minimal overlap set to 4. }
     def overlap_matrix(sequence1, sequence2)
       min_overlap = 4
-      max_overlap = [sequence1.size, sequence2.size].max
+      max_overlap = [sequence1.size, sequence2.size].min
       matrix_hash = {}
       (min_overlap..max_overlap).each do |overlap|
         matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])

data/lib/viral_seq/tcs_core.rb ADDED Viewed

@@ -0,0 +1,305 @@
+module ViralSeq
+  # Core functions for `tcs` pipeline
+  class TcsCore
+    class << self
+      # methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
+      def calculate_cut_off(m, error_rate = 0.02)
+        n = 0
+        case error_rate
+        when 0.005...0.015
+          if m <= 10
+            n = 2
+          else
+            n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
+          end
+        when 0...0.005
+          if m <= 10
+            n = 2
+          else
+            n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
+          end
+        else
+          if m <= 10
+            n = 2
+          elsif m <= 8500
+            n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
+          else
+            n = 0.0079 * m + 9.4869
+          end
+        end
+        n = n.round
+        n = 2 if n < 3
+        return n
+      end
+      # identify which file in the directory is R1 file, and which is R2 file based on file names
+      # input as directory (Dir object or a string of path)
+      # by default, .gz files will be unzipped.
+      # return a hash of {r1_file: file1, r2_file: file2}
+      def r1r2(directory, unzip = true)
+        files = []
+        Dir.chdir(directory) { files = Dir.glob "*" }
+        r1_file = ""
+        r2_file = ""
+        files.each do |f|
+          tag = parser_file_name(f)[:tag]
+          if tag.include? "R1"
+            unzip ? r1_file = unzip_r(directory, f) : r1_file = File.join(directory, f)
+          elsif tag.include? "R2"
+            unzip ? r2_file = unzip_r(directory, f) : r2_file = File.join(directory, f)
+          end
+        end
+        return { r1_file: r1_file, r2_file: r2_file }
+      end # end of ViralSeq:TcsCore.r1r2
+      # sort directories containing multiple r1 and r2 files.
+      # use the library name (first string before "_") to separate libraries
+      # out_dir is the Dir object or string of the output directory, by default named as directory + "_sorted"
+      # return a hash as { with_both_r1_r2: [lib1, lib2, ...], missing_r1: [lib1, lib2, ...], missing_r2: [lib1, lib2, ...], error: [lib1, lib2, ...]}
+      def sort_by_lib(directory, out_dir = directory + "_sorted")
+        Dir.mkdir(out_dir) unless File.directory?(out_dir)
+        files = []
+        Dir.chdir(directory) {files = Dir.glob("*")}
+        files.each do |file|
+          path = File.join(directory,file)
+          index = file.split("_")[0]
+          index_dir = File.join(out_dir, index)
+          Dir.mkdir(index_dir) unless File.directory?(index_dir)
+          File.rename(path, File.join(index_dir, file))
+        end
+        return_obj = { with_both_r1_r2: [],
+                       missing_r1: [],
+                       missing_r2: [],
+                       error: []
+                      }
+        libs = []
+        Dir.chdir(out_dir) { libs = Dir.glob('*') }
+        libs.each do |lib|
+          file_check = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
+          if !file_check[:r1_file].empty? and !file_check[:r2_file].empty?
+            return_obj[:with_both_r1_r2] << lib
+          elsif file_check[:r1_file].empty? and !file_check[:r2_file].empty?
+            return_obj[:missing_r1] << lib
+          elsif file_check[:r2_file].empty? and !file_check[:r1_file].empty?
+            return_obj[:missing_r2] << lib
+          else
+            return_obj[:error] << lib
+          end
+        end
+        return return_obj
+      end
+      # check an array of file names for potential errors
+      # input name_array: array of file names
+      # output hash { errors: {...}, all_pass: true/false, passed_names: [...], passed_libs: {...} }
+      # TODO: report an error code for each file name, along with a bool showing whether all names pass
+      def validate_file_name(name_array)
+        errors = {
+                   file_type_error: [] ,
+                   missing_r1_file: [] ,
+                   missing_r2_file: [] ,
+                   extra_r1_r2_file: [],
+                   no_region_tag: [] ,
+                   multiple_region_tag: []
+                 }
+        passed_libs = {}
+        name_with_r1_r2 = []
+        name_array.each do |name|
+          tag = parser_file_name(name)[:tag]
+          if name !~ /\.fastq\Z|\.fastq\.gz\Z/
+            errors[:file_type_error] << name
+          elsif tag.count("R1") == 0 and tag.count("R2") == 0
+            errors[:no_region_tag] << name
+          elsif tag.count("R1") > 0 and tag.count("R2") > 0
+            errors[:multiple_region_tag] << name
+          elsif tag.count("R1") > 1 or tag.count("R2") > 1
+            errors[:multiple_region_tag] << name
+          else
+            name_with_r1_r2 << name
+          end
+        end
+        libs = {}
+        name_with_r1_r2.map do |name|
+          libname = parser_file_name(name)[:libname]
+          libs[libname] ||= []
+          libs[libname] << name
+        end
+        libs.each do |libname, files|
+          count_r1_file = 0
+          count_r2_file = 0
+          files.each do |name|
+            tag = parser_file_name(name)[:tag]
+            if tag.include? "R1"
+              count_r1_file += 1
+            elsif tag.include? "R2"
+              count_r2_file += 1
+            end
+          end
+          if count_r1_file > 1 or count_r2_file > 1
+            errors[:extra_r1_r2_file] += files
+          elsif count_r1_file.zero?
+            errors[:missing_r1_file] += files
+          elsif count_r2_file.zero?
+            errors[:missing_r2_file] += files
+          else
+            passed_libs[libname] = files
+          end
+        end
+        passed_names = []
+        passed_libs.values.each { |names| passed_names += names}
+        if passed_names.size < name_array.size
+          pass = false
+        else
+          pass = true
+        end
+        return { errors: errors, all_pass: pass, passed_names: passed_names, passed_libs: passed_libs }
+      end
+      # filter r1 raw sequences for non-specific primers.
+      # input r1_sh, SeqHash obj.
+      # return a hash { r1_passed_seq: filtered Hash of sequence name and seq pairs, forward_starting_number: number of leading Ns plus the length of the primer after the Ns }
+      def filter_r1(r1_sh, forward_primer)
+        if forward_primer.match(/(N+)(\w+)$/)
+          forward_n = $1.size
+          forward_bio_primer = $2
+        else
+          forward_n = 0
+          forward_bio_primer = forward_primer
+        end
+        forward_bio_primer_size = forward_bio_primer.size
+        forward_starting_number = forward_n + forward_bio_primer_size
+        forward_primer_ref = forward_bio_primer.nt_parser
+        r1_passed_seq = {}
+        r1_raw = r1_sh.dna_hash
+        proc_filter = proc do |name|
+          seq = r1_raw[name]
+          next unless general_filter seq
+          primer_region_seq = seq[forward_n, forward_bio_primer_size]
+          if primer_region_seq =~ forward_primer_ref
+            new_name = remove_tag name
+            r1_passed_seq[new_name] = seq
+          end
+        end
+        r1_raw.keys.map do |name|
+          proc_filter.call name
+        end
+        return { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_starting_number }
+      end # end of filter_r1
+      # filter r2 raw sequences for non-specific primers.
+      # input r2_sh, SeqHash obj.
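+      # return a hash { r2_passed_seq: filtered Hash of sequence name and seq pairs, pid_length: length of PID, reverse_starting_number: PID length plus the length of the primer after the Ns }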
+      def filter_r2(r2_sh, cdna_primer)
+        r2_raw = r2_sh.dna_hash
+        cdna_primer.match(/(N+)(\w+)$/)
+        pid_length = $1.size
+        cdna_bio_primer = $2
+        cdna_bio_primer_size = cdna_bio_primer.size
+        reverse_starting_number = pid_length + cdna_bio_primer_size
+        cdna_primer_ref = cdna_bio_primer.nt_parser
+        r2_passed_seq = {}
+        proc_filter = proc do |name|
+          seq = r2_raw[name]
+          next unless general_filter seq
+          primer_region_seq = seq[pid_length, cdna_bio_primer_size]
+          if primer_region_seq =~ cdna_primer_ref
+            new_name = remove_tag name
+            r2_passed_seq[new_name] = seq
+          end
+        end
+        r2_raw.keys.map do |name|
+          proc_filter.call name
+        end
+        return { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
+      end # end of filter_r2
+      # write a time-stamped error message to the log file handle, close the handle, and abort with the same message
+      def log_and_abort(log, infor)
+        log.puts Time.now.to_s + "\t" + infor
+        log.close
+        abort infor.red.bold
+      end
+      private
+      def unzip_r(indir, f)
+        r_file = File.join(indir, f)
+        if f =~ /.gz/
+          `gzip -d #{r_file}`
+          new_f = f.sub ".gz", ""
+          r_file = File.join(indir, new_f)
+        end
+        return r_file
+      end
+      def parser_file_name(file_name)
+        t = file_name.split(".")[0].split("_")
+        if t.size == 1
+          libname = "lib"
+          tag = [ t[0].upcase ]
+        else
+          libname = t[0]
+          tag = t[1..-1].map(&:upcase)
+        end
+        return {libname: libname, tag: tag}
+      end
+      def general_filter(seq)
+        if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
+          return false
+        elsif seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
+          return false
+        elsif seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
+          return false
+        else
+          return true
+        end
+      end
+      # remove region info tags from the raw MiSeq sequences.
+      def remove_tag(seq_name)
+        if seq_name =~ /\s/
+          new_tag = $`
+        else
+          new_tag = seq_name[0..-3]
+        end
+      end
+    end # end of class << self
+  end # end of TcsCore module
+end # end of main module