RubyGems - viral_seq - Versions diffs - 1.0.14 → 1.1.0 - Mend

viral_seq 1.0.14 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/.gitignore +0 -1
data/README.md +52 -7
data/bin/tcs +13 -4
data/bin/tcs_log +83 -0
data/doc/dr.json +68 -0
data/lib/viral_seq/seq_hash.rb +1 -1
data/lib/viral_seq/seq_hash_pair.rb +6 -4
data/lib/viral_seq/tcs_core.rb +3 -1
data/lib/viral_seq/tcs_json.rb +41 -10
data/lib/viral_seq/version.rb +2 -2
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: '048e85ab67fbb667919d02d4509a15111798b116b3f927c921d203dc8565a1a2'
-  data.tar.gz: 6951e410bd4f9b727a44fab1aa88f9cc263151cf9aed2a9c25ae9d866ed72450
+  metadata.gz: ea453e452e6832e942512cdb94462c33af89ffd8295017806c9aa6ff7ec77ad4
+  data.tar.gz: 2bb89d193e0e84ebe0791882c53e226a0a934ea3b9d1e61f87b8ffff6c22af1b
 SHA512:
-  metadata.gz: 02cc87e245918a5c8f1b16b0db978da66e3bf7e83c6c6140c394c560d31c86ab1845b337a4f53b4b7883ff8e452e8caef0b036ea113c4416d6a29d16f419eb81
-  data.tar.gz: 9f53bd6c46f4a49b5c14b8b8019ffa3e1abcf442bf0a6cc09a7dbc768a474f2afd81d5cf168f38eb8ab5abbd601c60f1f824f753bc77dbcc0d3c0d93568b9ae3
+  metadata.gz: 9dc0403ecaea119d3aa3e832305a0bd4f038fdb71789dcd036080fa89b0e454ee79001b6042df171364e4207a93b2d4d5747336b2fb7f8fb7d83103f5d641134
+  data.tar.gz: 510ccfce7d717b56d55e2477ae01124009d1f53f010635759cf2f69afe0132313e08db9abaae1ec6d8d894961beba1c2d70a637eafa9b57b05f0aac3372cd0ca

data/.gitignore CHANGED Viewed

@@ -2,7 +2,6 @@
 /.yardoc
 /_yardoc/
 /coverage/
-/doc/
 /pkg/
 /spec/reports/
 /tmp/

data/README.md CHANGED Viewed

@@ -2,7 +2,16 @@
 A Ruby Gem containing bioinformatics tools for processing viral NGS data.
-Specifically for Primer-ID sequencing and HIV drug resistance analysis.
+Specifically for Primer ID sequencing and HIV drug resistance analysis.
+## Illustration for the Primer ID Sequencing
+![Primer ID Sequencing](https://storage.googleapis.com/tcs-dr-public/pid.png)
+### Reference readings on the Primer ID sequencing
+[Primer ID JID paper](https://doi.org/10.21769/BioProtoc.3938)
+[Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
 ## Install
@@ -14,19 +23,45 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
 ### Executables
-Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
+### `tcs`
+Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
+Example commands:
 ```bash
-    $ locator -i sequence.fasta -o sequence.fasta.csv
+    $ tcs -p params.json # run TCS pipeline with params.json
+    $ tcs -j # CLI to generate params.json
+    $ tcs -h # print out the help
 ```
+---
+### `tcs_log`
+Use `tcs_log` script to pool run logs and TCS fasta files after one batch of `tcs` jobs.
-Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data.
+Example file structure:
+```
+batch_tcs_jobs/
+      ├── lib1
+      ├── lib2
+      ├── lib3
+      ├── lib4
+      ├── ...
+```
+Example command:
 ```bash
-    $ tcs -p params.json # run TCS pipeline with params.json
-    $ tcs -j # CLI to generate params.json
-    $ tcs -h # print out the help
+    $ tcs_log batch_tcs_jobs
+```
+---
+### `locator`
+Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
+```bash
+    $ locator -i sequence.fasta -o sequence.fasta.csv
 ```
+---
 ## Some Examples
@@ -86,6 +121,16 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
 ## Updates
+### Version 1.1.0-03252021
+    1. Optimized the algorithm of end-join.
+    2. Fixed a bug in the `tcs` pipeline where combined tcs files were sometimes not saved.
+    3. Added `tcs_log` command to pool run logs and tcs files from one batch of tcs jobs.
+    4. Added the preset of MPID-HIVDR params file ***dr.json*** in /doc.
+    5. Added a `platform_format` option to the JSON generator of the `tcs` pipeline.
+    Users can choose from 3 MiSeq platforms for processing their sequencing data.
+    MiSeq 300x7x300 is the default option.
 ### Version 1.0.14-03052021
   1. Add a function `ViralSeq::TcsCore.validate_file_name` to check MiSeq paired-end file names.

data/bin/tcs CHANGED Viewed

@@ -23,7 +23,7 @@
 # THE SOFTWARE.
 # Use JSON file as the run param
-# run tcs_json_generator.rb to generate param json file.
+# run `tcs -j` to generate param json file.
 require 'viral_seq'
 require 'json'
@@ -115,6 +115,12 @@ else
   error_rate = 0.02
 end
+if params[:platform_format]
+  $platform_sequencing_length = params[:platform_format]
+else
+  $platform_sequencing_length = 300
+end
 primers = params[:primer_pairs]
 if primers.empty?
   ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
@@ -273,7 +279,6 @@ primers.each do |primer|
       r1_sub_seq << bio_r1[seq_name]
       r2_sub_seq << bio_r2[seq_name]
     end
     #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
     consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
     r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
@@ -364,6 +369,7 @@ primers.each do |primer|
     shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
     joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
     log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
     summary_json[:combined_tcs] = joined_sh.size
     if export_raw
@@ -433,12 +439,15 @@ primers.each do |primer|
       trim_end = primer[:trim_ref_end]
       trim_ref = primer[:trim_ref].to_sym
       joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
-      joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
       if export_raw
         joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
-        joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
       end
     end
+    joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
+    if export_raw
+      joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
+    end
   end
   File.open(outfile_log, "w") do |f|

data/bin/tcs_log ADDED Viewed

@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+# pool run logs from one batch of tcs jobs
+# file structure:
+#   batch_tcs_jobs/
+#   ├── lib1
+#   ├── lib2
+#   ├── lib3
+#   ├── lib4
+#   ├── ...
+#
+# command example:
+#   $ tcs_log batch_tcs_jobs
+require 'viral_seq'
+require 'pathname'
+require 'json'
+require 'fileutils'
+indir = ARGV[0].chomp
+indir_basename = File.basename(indir)
+indir_dirname = File.dirname(indir)
+tcs_dir = File.join(indir_dirname, (indir_basename + "_tcs"))
+Dir.mkdir(tcs_dir) unless File.directory?(tcs_dir)
+libs = []
+Dir.chdir(indir) {libs = Dir.glob("*")}
+outdir2 = File.join(tcs_dir, "combined_TCS_per_lib")
+outdir3 = File.join(tcs_dir, "TCS_per_region")
+outdir4 = File.join(tcs_dir, "combined_TCS_per_region")
+Dir.mkdir(outdir2) unless File.directory?(outdir2)
+Dir.mkdir(outdir3) unless File.directory?(outdir3)
+Dir.mkdir(outdir4) unless File.directory?(outdir4)
+log_file = File.join(tcs_dir,"log.csv")
+log = File.open(log_file,'w')
+log.puts "lib name,Region,Raw Sequences per barcode,R1 Raw,R2 Raw,Paired Raw,Cutoff,PID Length,Consensus1,Consensus2,Distinct to Raw,Resampling index,Combined TCS,Combined TCS after QC"
+libs.each do |lib|
+  Dir.mkdir(File.join(outdir2, lib)) unless File.directory?(File.join(outdir2, lib))
+  fasta_files = []
+  json_files = []
+  Dir.chdir(File.join(indir, lib)) do
+     fasta_files = Dir.glob("**/*.fasta")
+     json_files = Dir.glob("**/log.json")
+  end
+  fasta_files.each do |f|
+    path_array = Pathname(f).each_filename.to_a
+    region = path_array[0]
+    if path_array[-1] == "combined.fasta"
+      FileUtils.cp(File.join(indir, lib, f), File.join(outdir2, lib, (lib + "_" + region)))
+      Dir.mkdir(File.join(outdir4,region)) unless File.directory?(File.join(outdir4,region))
+      FileUtils.cp(File.join(indir, lib, f), File.join(outdir4, region, (lib + "_" + region)))
+    else
+      Dir.mkdir(File.join(outdir3,region)) unless File.directory?(File.join(outdir3,region))
+      Dir.mkdir(File.join(outdir3,region, lib)) unless File.directory?(File.join(outdir3,region, lib))
+      FileUtils.cp(File.join(indir, lib, f), File.join(outdir3, region, lib, (lib + "_" + region + "_" + path_array[-1])))
+    end
+  end
+  json_files.each do |f|
+    json_log = JSON.parse(File.read(File.join(indir, lib, f)), symbolize_names: true)
+    log.print [lib,
+               json_log[:primer_set_name],
+               json_log[:total_raw_sequence],
+               json_log[:r1_filtered_raw],
+               json_log[:r2_filtered_raw],
+               json_log[:paired_raw_sequence],
+               json_log[:consensus_cutoff],
+               json_log[:length_of_pid],
+               json_log[:total_tcs_with_ambiguities],
+               json_log[:total_tcs],
+               json_log[:distinct_to_raw],
+               json_log[:resampling_param],
+               json_log[:combined_tcs],
+               json_log[:combined_tcs_after_qc],
+             ].join(',') + "\n"
+  end
+end
+log.close

data/doc/dr.json ADDED Viewed

@@ -0,0 +1,68 @@
+{
+  "raw_sequence_dir": "MyExampleDir",
+  "platform_error_rate": 0.02,
+  "primer_pairs": [
+    {
+      "region": "RT",
+      "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCACTATAGGCTGTACTGTCCATTTATC",
+      "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
+      "majority": 0.5,
+      "end_join": true,
+      "end_join_option": 1,
+      "overlap": 0,
+      "TCS_QC": true,
+      "ref_genome": "HXB2",
+      "ref_start": 2648,
+      "ref_end": 3257,
+      "indel": true,
+      "trim": false
+    },
+    {
+      "region": "PR",
+      "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
+      "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
+      "majority": 0.5,
+      "end_join": true,
+      "end_join_option": 3,
+      "TCS_QC": true,
+      "ref_genome": "HXB2",
+      "ref_start": 0,
+      "ref_end": 2591,
+      "indel": true,
+      "trim": true,
+      "trim_ref": "HXB2",
+      "trim_ref_start": 2253,
+      "trim_ref_end": 2549
+    },
+    {
+      "region": "IN",
+      "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNATCGAATACTGCCATTTGTACTGC",
+      "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNAAAAGGAGAAGCCATGCATG",
+      "majority": 0.5,
+      "end_join": true,
+      "end_join_option": 3,
+      "overlap": 171,
+      "TCS_QC": true,
+      "ref_genome": "HXB2",
+      "ref_start": 4384,
+      "ref_end": 4751,
+      "indel": false,
+      "trim": false
+    },
+    {
+      "region": "V1V3",
+      "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
+      "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
+      "majority": 0.5,
+      "end_join": true,
+      "end_join_option": 1,
+      "overlap": 0,
+      "TCS_QC": true,
+      "ref_genome": "HXB2",
+      "ref_start": 6585,
+      "ref_end": 7208,
+      "indel": true,
+      "trim": false
+    }
+  ]
+}

data/lib/viral_seq/seq_hash.rb CHANGED Viewed

@@ -394,7 +394,6 @@ module ViralSeq
             end
           end
         end
         consensus_seq += call_consensus_base(max_base_list)
       end
       return consensus_seq
@@ -742,6 +741,7 @@ module ViralSeq
       seq_hash_unique_pass = []
       seq_hash_unique.each do |seq|
+        next if seq.nil?
         loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
         next unless loc # if locator tool fails, skip this seq.
         if start_nt.include?(loc[0]) && end_nt.include?(loc[1])

data/lib/viral_seq/seq_hash_pair.rb CHANGED Viewed

@@ -110,19 +110,21 @@ module ViralSeq
       raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
       raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
       joined_seq = {}
-      seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
+      seq_pair_hash.each do |seq_name,seq_pair|
         r1_seq = seq_pair[0]
         r2_seq = seq_pair[1]
         if overlap.zero?
           joined_sequence = r1_seq + r2_seq
+        elsif diff.zero?
+          if r1_seq[-overlap..-1] == r2_seq[0,overlap]
+            joined_sequence= r1_seq + r2_seq[overlap..-1]
+          end
         elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
           joined_sequence= r1_seq + r2_seq[overlap..-1]
         else
           next
         end
-        seq_names.each do |seq_name|
-          joined_seq[seq_name] = joined_sequence
-        end
+        joined_seq[seq_name] = joined_sequence if joined_sequence
       end
       joined_seq_hash = ViralSeq::SeqHash.new

data/lib/viral_seq/tcs_core.rb CHANGED Viewed

@@ -305,7 +305,9 @@ module ViralSeq
       end
       def general_filter(seq)
-        if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
+        if seq.size < $platform_sequencing_length
+          return false
+        elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
           return false
         elsif seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
           return false

data/lib/viral_seq/tcs_json.rb CHANGED Viewed

@@ -13,6 +13,22 @@ module ViralSeq
         print '> '
         param[:raw_sequence_dir] = gets.chomp.rstrip
+        puts "Choose MiSeq Platform (1-3):\n1. 150x7x150\n2. 250x7x250\n3. 300x7x300 (default)"
+        print "> "
+        pf_option = gets.chomp.rstrip
+        # while ![1,2,3].include?(pf_option.to_i)
+        #   print "Entered MiSeq Platform #{pf_option.red.bold} not valid (choose 1-3), try again\n> "
+        #   pf_option = gets.chomp.rstrip
+        # end
+        case pf_option.to_i
+        when 1
+          param[:platform_format] = 150
+        when 2
+          param[:platform_format] = 250
+        else
+          param[:platform_format] = 300
+        end
         puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
         print '> '
         input_error = gets.chomp.rstrip.to_f
@@ -52,12 +68,12 @@ module ViralSeq
           if ej =~ /y|yes/i
             data[:end_join] = true
-            print "End-join option? Choose from (1-4):\n
-            1: simple join, no overlap
-            2: known overlap \n
-            3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
-            4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
-            > "
+            puts "End-join option? Choose from (1-4):"
+            puts "1: simple join, no overlap"
+            puts "2: known overlap"
+            puts "3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap"
+            puts "4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap"
+            print "> "
             ej_option = gets.chomp.rstrip
             while ![1,2,3,4].include?(ej_option.to_i)
               puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
@@ -138,7 +154,12 @@ module ViralSeq
         if save_option =~ /y|yes/i
           print "Path to save JSON file:\n> "
           path = gets.chomp.rstrip
-          File.open(path, 'w') {|f| f.puts JSON.pretty_generate(param)}
+          while !validate_path_name(path)
+            print "Entered path no valid, try again.\n".red.bold
+            print "Path to save JSON file:\n> "
+            path = gets.chomp.rstrip
+          end
+          File.open(validate_path_name(path), 'w') {|f| f.puts JSON.pretty_generate(param)}
         end
         print "\nDo you wish to execute tcs pipeline with the input params now? Y/N \n> "
@@ -147,7 +168,7 @@ module ViralSeq
         if rsp =~ /y/i
           return param
         else
-          abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`"
+          abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`".blue
         end
       end
@@ -172,7 +193,17 @@ module ViralSeq
               when 3
                 :MAC239
               end
-      end
-    end
+      end # end of get_ref
+      def validate_path_name(path)
+        if path.empty?
+          return false
+        elsif File.directory? path
+          return File.join(path, 'params.json')
+        elsif File.directory?(File.dirname(path))
+          return path
+        end
+      end # end of validate_path_name
+    end # end of class << self
   end # end TcsJson
 end # end main module

data/lib/viral_seq/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@
 # version info and history
 module ViralSeq
-  VERSION = "1.0.14"
-  TCS_VERSION = "2.1.1"
+  VERSION = "1.1.0"
+  TCS_VERSION = "2.2.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: viral_seq
 version: !ruby/object:Gem::Version
-  version: 1.0.14
+  version: 1.1.0
 platform: ruby
 authors:
 - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-03-06 00:00:00.000000000 Z
+date: 2021-03-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -90,6 +90,7 @@ email:
 executables:
 - locator
 - tcs
+- tcs_log
 extensions: []
 extra_rdoc_files: []
 files:
@@ -104,6 +105,8 @@ files:
 - Rakefile
 - bin/locator
 - bin/tcs
+- bin/tcs_log
+- doc/dr.json
 - lib/viral_seq.rb
 - lib/viral_seq/constant.rb
 - lib/viral_seq/enumerable.rb