RubyGems - miga-base - Versions diffs - 0.3.1.7 → 0.3.2.0 - Mend

miga-base 0.3.1.7 → 0.3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

checksums.yaml +4 -4
data/actions/ncbi_get.rb +8 -0
data/lib/miga/common.rb +9 -215
data/lib/miga/common/base.rb +49 -0
data/lib/miga/common/format.rb +135 -0
data/lib/miga/common/path.rb +49 -0
data/lib/miga/daemon.rb +3 -60
data/lib/miga/daemon/base.rb +69 -0
data/lib/miga/dataset.rb +3 -3
data/lib/miga/dataset/result.rb +5 -5
data/lib/miga/result.rb +5 -0
data/lib/miga/version.rb +7 -5
data/scripts/distances.bash +2 -19
data/scripts/taxonomy.bash +2 -21
data/test/common_test.rb +9 -0
data/utils/distance/base.rb +6 -0
data/utils/distance/commands.rb +82 -0
data/utils/distance/database.rb +86 -0
data/utils/distance/pipeline.rb +98 -0
data/utils/distance/runner.rb +104 -0
data/utils/distance/temporal.rb +37 -0
data/utils/distances.rb +9 -0
data/utils/enveomics/Docs/recplot2.md +233 -0
data/utils/enveomics/Makefile +1 -1
data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
data/utils/enveomics/Manifest/categories.json +11 -1
data/utils/enveomics/Manifest/examples.json +2 -2
data/utils/enveomics/README.md +2 -0
data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
data/utils/enveomics/Scripts/SRA.download.bash +28 -21
data/utils/enveomics/Scripts/Table.barplot.R +1 -0
data/utils/enveomics/Scripts/aai.rb +4 -2
data/utils/enveomics/build_enveomics_r.bash +5 -5
data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
data/utils/enveomics/enveomics.R/README.md +26 -17
data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
data/utils/requirements.txt +1 -1
metadata +28 -4
data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18

data/utils/enveomics/Makefile CHANGED

@@ -7,7 +7,7 @@ include globals.mk
 TEST=Tests
 enveomics_r=enveomics.R
-enveomics_r_v=enveomics.R_1.1.6
+enveomics_r_v=enveomics.R_1.3
 .PHONY: test install install-scripts install-r uninstall install-deps
 test: $(enveomics_r_v).tar.gz

data/utils/enveomics/Manifest/Tasks/blasttab.json CHANGED

@@ -623,6 +623,7 @@
         { "r_package": "optparse" },
         { "r_package": "enveomics.R" }
       ],
+      "see_also": [ "RecPlot2.compareIdentities.R" ],
       "options": [
         {
           "opt": "--prefix",
@@ -644,6 +645,12 @@
           "default": 300,
           "description": ["Breaks in the identity histogram."]
         },
+        {
+          "opt": "--id-free-range",
+          "description": ["Indicates that the range should be freely set from",
+            "the observed values. Otherwise, 70-100% is included in the",
+            "identity histogram (default)."]
+        },
         {
           "opt": "--id-metric",
           "arg": "select",
@@ -684,6 +691,13 @@
           "description": ["Color of peaks, mandatory for peak-finding (e.g.,",
             "darkred)."]
         },
+        {
+          "opt": "--peaks-method",
+          "arg": "select",
+          "values": ["emauto","em","mower"],
+          "default": "emauto",
+          "description": "Method to detect peaks."
+        },
         {
           "name": "R Object Output",
           "arg": "out_file",
@@ -707,6 +721,58 @@
           "description": "Height of the plot in inches (7 by default)."
         }
       ]
+    },
+    {
+      "task": "RecPlot2.compareIdentities.R",
+      "description": ["Calculates the difference between identity",
+        "distributions of two recruitment plots."],
+      "help_arg": "--help",
+      "requires": [
+        { "r_package": "optparse" },
+        { "r_package": "enveomics.R" }
+      ],
+      "see_also": [ "BlastTab.recplot2.R" ],
+      "options": [
+        {
+          "opt": "--method",
+          "arg": "string",
+          "default": "hellinger",
+          "description": ["Distance method to use. This should be (an",
+            "unambiguous abbreviation of) one of:",
+            "'hellinger' (Hellinger, 1909, doi:10.1515/crll.1909.136.210),",
+            "'bhattacharyya' (Bhattacharyya, 1943, Bull. Calcutta Math. Soc.",
+            "35), 'kl' or 'kullback–leibler' (Kullback & Leibler, 1951,",
+            "doi:10.1214/aoms/1177729694), or 'euclidean'."]
+        },
+        {
+          "opt": "--pseudocounts",
+          "arg": "float",
+          "default": 0.0,
+          "description": ["Smoothing parameter for Laplace smoothing. Use 0",
+            "for no smoothing, or 1 for add-one smoothing."]
+        },
+        {
+          "opt": "--max-deviation",
+          "arg": "float",
+          "description": ["Maximum mean deviation between identity breaks",
+            "tolerated (as percent identity). Difference in number of",
+            "id.breaks is never tolerated."]
+        },
+        {
+          "arg": "in_file",
+          "name": "RecPlot A",
+          "mandatory": true,
+          "description": ["File containing the R object `rp` for the first",
+            "recruitment plot."]
+        },
+        {
+          "arg": "in_file",
+          "name": "RecPlot B",
+          "mandatory": true,
+          "description": ["File containing the R object `rp` for the second",
+            "recruitment plot."]
+        }
+      ]
     }
   ]
 }

data/utils/enveomics/Manifest/Tasks/fasta.json CHANGED

@@ -158,12 +158,19 @@
       "warn": ["Please note that this script will check for the consistency of",
         "the names (assuming a pair of related reads contains the same name",
         "varying only in a trailing slash (/) followed by a digit. If you want",
-        "to turn this feature off just set the $eval_T variable to zero. If",
+        "to turn this feature off just set the checking period to zero. If",
         "you want to decrease the sampling period (to speed the script up) or",
-        "increase it (to make it more sensitive to errors) just change $eval_T",
-        "accordingly."],
+        "increase it (to make it more sensitive to errors) just change the",
+        "checking period accordingly."],
       "help_arg": "",
       "options": [
+        {
+          "name": "Checking period",
+          "opt": "-T",
+          "arg": "integer",
+          "default": 1000,
+          "description": "Sampling period for names evaluation."
+        },
         {
           "arg": "out_file",
           "mandatory": true,

data/utils/enveomics/Manifest/Tasks/fastq.json CHANGED

@@ -45,10 +45,10 @@
       "warn": ["Note that this script will check for the consistency of the",
         "names (assuming a pair of related reads contains the same name",
         "varying only in a trailing slash (/) followed by a digit. If you want",
-        "to turn this feature off just set the -T option to zero. If you want",
-        "to decrease the sampling period (to speed the script up) or increase",
-        "it (to make it more sensitive to errors) just change -T option",
-        "accordingly."],
+        "to turn this feature off just set the checking period to zero. If you",
+        "want to decrease the sampling period (to speed the script up) or",
+        "increase it (to make it more sensitive to errors) just change the",
+        "checking period accordingly."],
       "see_also": ["FastQ.split.pl","FastA.interpose.pl"],
       "help_arg": "",
       "options": [

data/utils/enveomics/Manifest/Tasks/mapping.json CHANGED

@@ -8,7 +8,8 @@
         "missing from the file. If you produce your BedGraph file with",
         "bedtools genomecov and want to consider zero-coverage position, be",
         "sure to use -bga (not -bg)."],
-      "see_also": ["BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"],
+      "see_also": ["BedGraph.window.rb",
+        "BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"],
       "help_arg": "--help",
       "options": [
         {
@@ -23,6 +24,42 @@
           "default": 0.5,
           "description": ["Central range to consider, between 0 and 1. By",
             "default: inter-quartile range (0.5)."]
+        },
+        {
+          "opt": "--per-seq",
+          "description": ["Calculate averages per reference sequence, not",
+            "total. Assumes a sorted BedGraph file."]
+        },
+        {
+          "opt": "--length",
+          "description": "Add sequence length to the output."
+        }
+      ]
+    },
+    {
+      "task": "BedGraph.window.rb",
+      "description": ["Estimates the sequencing depth per window from a",
+        "BedGraph file."],
+      "warn": ["This script doesn't consider zero-coverage positions if",
+        "missing from the file. If you produce your BedGraph file with",
+        "bedtools genomecov and want to consider zero-coverage position, be",
+        "sure to use -bga (not -bg)."],
+      "see_also": ["BedGraph.tad.rb",
+        "BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"],
+      "help_arg": "--help",
+      "options": [
+        {
+          "opt": "--input",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "Input BedGraph file."
+        },
+        {
+          "name": "Window size",
+          "opt": "--win",
+          "arg": "float",
+          "default": 1000,
+          "description": "Window size, in base pairs."
         }
       ]
     }

data/utils/enveomics/Manifest/categories.json CHANGED

@@ -3,6 +3,7 @@
     "Sequence similarity search": {
       "Statistics": [
         "BedGraph.tad.rb",
+        "BedGraph.window.rb",
         "BlastPairwise.AAsubs.pl",
         "BlastTab.advance.bash",
         "BlastTab.recplot2.R",
@@ -10,7 +11,8 @@
         "BlastTab.seqdepth_nomedian.pl",
         "BlastTab.seqdepth_ZIP.pl",
         "BlastTab.sumPerHit.pl",
-        "FastQ.test-error.rb"
+        "FastQ.test-error.rb",
+        "RecPlot2.compareIdentities.R"
       ],
       "Manipulation": [
         "BlastTab.addlen.rb",
@@ -134,6 +136,14 @@
       "Clustering": [
         "ogs.mcl.rb",
         "clust.rand.rb"
+      ],
+      "Read recruitments": [
+        "BedGraph.tad.rb",
+        "BedGraph.window.rb",
+        "BlastTab.catsbj.pl",
+        "BlastTab.pairedHits.rb",
+        "BlastTab.recplot2.R",
+        "RecPlot2.compareIdentities.R"
       ]
     }
   }

data/utils/enveomics/Manifest/examples.json CHANGED

@@ -56,8 +56,8 @@
       "task": "BlastTab.recplot2.R",
       "description": ["Generates recruitment plots for a comparison",
         "between a virome containing HIV and the HIV-1 genome."],
-      "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,"NA",
-        "hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
+      "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,null,
+        null,null,"hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
     },
     {
       "_": "== Examples of functional annotations ==",

data/utils/enveomics/README.md CHANGED

@@ -29,6 +29,8 @@ http://www.perlfoundation.org/artistic_license_2_0.
 Most scripts in this repository are self-documented.  However,
 more extensive documentation (and some discussion) can be found at the
 [documentation website](http://enve-omics.ce.gatech.edu/enveomics/docs).
+Additional documentation for recruitment plots can be found
+[here](Docs/recplot2.md).
 ## Citation

data/utils/enveomics/Scripts/Aln.cat.rb CHANGED

@@ -129,6 +129,7 @@ begin
       puts ">#{key}", a[key].join("").gsub(/(.{1,60})/, "\\1\n")
       a.delete(key)
    end
+   $stderr.puts "  #{lengths.inject(:+)} columns." unless o[:q]
    unless o[:coords].nil?
       $stderr.puts "Generating coordinates." unless o[:q]
       coords = File.open(o[:coords],"w")

data/utils/enveomics/Scripts/BedGraph.tad.rb CHANGED

@@ -1,9 +1,9 @@
 #!/usr/bin/env ruby
-require "optparse"
+require 'optparse'
-o = {range:0.5}
-ARGV << "-h" if ARGV.empty?
+o = {range: 0.5, perseq: false, length: false}
+ARGV << '-h' if ARGV.empty?
 OptionParser.new do |opt|
   opt.banner = "
   Estimates the truncated average sequencing depth (TAD) from a BedGraph file.
@@ -13,20 +13,26 @@ OptionParser.new do |opt|
   want to consider zero-coverage position, be sure to use -bga (not -bg).
   Usage: #{$0} [options]"
-  opt.separator ""
-  opt.on("-i", "--input PATH",
-    "Input BedGraph file (mandatory)."){ |v| o[:i]=v }
-  opt.on("-r", "--range FLOAT",
-    "Central range to consider, between 0 and 1.",
+  opt.separator ''
+  opt.on('-i', '--input PATH',
+    'Input BedGraph file (mandatory).'){ |v| o[:i]=v }
+  opt.on('-r', '--range FLOAT',
+    'Central range to consider, between 0 and 1.',
     "By default: #{o[:range]} (inter-quartile range)."
     ){ |v| o[:range]=v.to_f }
-  opt.on("-h", "--help", "Display this screen.") do
+  opt.on('-s', '--per-seq',
+    'Calculate averages per reference sequence, not total.',
+    'Assumes a sorted BedGraph file.'
+    ){ |v| o[:perseq] = v }
+  opt.on('-l', '--length',
+    'Add sequence length to the output.'){ |v| o[:length] = v }
+  opt.on('-h', '--help', 'Display this screen.') do
     puts opt
     exit
   end
-  opt.separator ""
+  opt.separator ''
 end.parse!
-abort "-i is mandatory." if o[:i].nil?
+abort '-i is mandatory.' if o[:i].nil?
 def pad(d, idx, r)
   idx.each do |i|
@@ -39,33 +45,49 @@ def pad(d, idx, r)
   d
 end
+def report(sq, d, ln, o)
+  # Estimate padding ranges
+  pad = (1.0-o[:range])/2.0
+  r = (pad*ln).round
+  # Pad
+  d = pad(d, d.each_index.to_a, r+0)
+  d = pad(d, d.each_index.to_a.reverse, r+0)
+  # Average
+  y = [0.0]
+  unless d.compact.empty?
+    s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0,:+)
+    y[0] = s.to_f/d.compact.inject(:+)
+  end
+  # Report
+  y.unshift(sq) if o[:perseq]
+  y << ln if o[:length]
+  puts y.join("\t")
+end
 # Read BedGraph
-d = []
+d  = []
 ln = 0
+pre_sq = nil
 File.open(o[:i], "r") do |ifh|
   ifh.each_line do |i|
     next if i =~ /^#/
-    r = i.chomp.split("\t")[1 .. -1].map{ |j| j.to_i }
+    r  = i.chomp.split("\t")
+    sq = r.shift
+    if o[:perseq] and !pre_sq.nil? and pre_sq!=sq
+      report(pre_sq, d, ln, o)
+      d  = []
+      ln = 0
+    end
+    r.map! { |j| j.to_i }
     l = r[1]-r[0]
     d[ r[2] ] ||= 0
-    d[ r[2] ] += l
+    d[ r[2] ]  += l
     ln += l
+    pre_sq = sq
   end
 end
-# Estimate padding ranges
-pad = (1.0-o[:range])/2.0
-r = (pad*ln).round
-# Pad
-d = pad(d, d.each_index.to_a, r+0)
-d = pad(d, d.each_index.to_a.reverse, r+0)
-# Average
-if d.compact.empty?
-  p 0.0
-else
-  s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0,:+)
-  p s.to_f/d.compact.inject(:+)
-end
+report(pre_sq, d, ln, o)

data/utils/enveomics/Scripts/BedGraph.window.rb ADDED

@@ -0,0 +1,71 @@
+#!/usr/bin/env ruby
+require 'optparse'
+o = {win: 1000}
+ARGV << '-h' if ARGV.empty?
+OptionParser.new do |opt|
+  opt.banner = "
+  Estimates the sequencing depth per windows from a BedGraph file.
+  IMPORTANT: This script doesn't consider zero-coverage positions if missing
+  from the file. If you produce your BedGraph file with bedtools genomecov and
+  want to consider zero-coverage position, be sure to use -bga (not -bg).
+  Usage: #{$0} [options]"
+  opt.separator ''
+  opt.on('-i', '--input PATH',
+    'Input BedGraph file (mandatory).'){ |v| o[:i]=v }
+  opt.on('-w', '--win INT',
+    'Window size, in base pairs.', "By default: #{o[:win]}."
+    ){ |v| o[:win]=v.to_i }
+  opt.on('-h', '--help', 'Display this screen.') do
+    puts opt
+    exit
+  end
+  opt.separator ''
+end.parse!
+abort '-i is mandatory.' if o[:i].nil?
+def report(d, a, b, seqs)
+  # Average
+  y = 0.0
+  unless d.compact.empty?
+    s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0,:+)
+    y = s.to_f/d.compact.inject(:+)
+  end
+  # Report
+  puts [a, b, y, seqs.keys.join(",")].join("\t")
+end
+# Read BedGraph
+d  = []
+ln = 0
+a = 1
+seqs = {}
+b = o[:win]
+File.open(o[:i], "r") do |ifh|
+  ifh.each_line do |i|
+    next if i =~ /^#/
+    r  = i.chomp.split("\t")
+    sq = r.shift
+    seqs[sq] = 1
+    r.map!{ |j| j.to_i }
+    l = r[1]-r[0]
+    d[ r[2] ] ||= 0
+    d[ r[2] ]  += l
+    ln += l
+    while ln >= b
+      d[ r[2] ] -= (ln-b)
+      report(d, a, b, seqs)
+      seqs = {}
+      seqs[ sq ] = 1 if ln > b
+      d = []
+      d[ r[2] ] = (ln-b)
+      a = b + 1
+      b = a + o[:win] - 1
+    end
+  end
+end

data/utils/enveomics/Scripts/BlastTab.recplot2.R CHANGED

@@ -19,11 +19,12 @@ opt <- enve.cliopts(enve.recplot2,
   o_desc=list(pos.breaks="Breaks in the positions histogram.",
     id.breaks="Breaks in the identity histogram.",
     id.summary="Function summarizing the identity bins. By default: sum.",
-    peaks.col="Color of peaks, mandatory for peak-finding (e.g., darkred)."),
+    peaks.col="Color of peaks, mandatory for peak-finding (e.g., darkred).",
+    peaks.method="Method to detect peaks; one of emauto, em, or mower."),
   p_desc=paste("","Produce recruitment plot objects provided that",
     "BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
   ignore=c("plot"),
-  defaults=c(id.metric="identity", peaks.col=NA))
+  defaults=c(id.metric="identity", peaks.col=NA, peaks.method="emauto"))
 #= Run it!
 if(length(opt$args)>1){
@@ -35,6 +36,10 @@ if(length(opt$args)>1){
 }
 pc <- opt$options[["peaks.col"]]
 if(!is.na(pc) && pc=="NA") opt$options[["peaks.col"]] <- NA
+if(!is.null(opt$options[["peaks.method"]])){
+  opt$options[["peaks.opts"]] <- list(method=opt$options[["peaks.method"]])
+  opt$options[["peaks.method"]] <- NULL
+}
 rp <- do.call("enve.recplot2", opt$options)
 save(rp, file=opt$args[1])
 if(length(opt$args)>1) dev.off()