RubyGems - miga-base - Versions diffs - 0.3.1.6 → 0.3.1.7 - Mend

miga-base 0.3.1.6 → 0.3.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/actions/ncbi_get.rb +57 -42
data/lib/miga/result/base.rb +7 -0
data/lib/miga/result/dates.rb +42 -0
data/lib/miga/result.rb +4 -0
data/lib/miga/version.rb +1 -1
data/scripts/essential_genes.bash +5 -4
data/utils/enveomics/Makefile +1 -1
data/utils/enveomics/Manifest/Tasks/aasubs.json +75 -75
data/utils/enveomics/Manifest/Tasks/blasttab.json +194 -185
data/utils/enveomics/Manifest/Tasks/distances.json +130 -130
data/utils/enveomics/Manifest/Tasks/fasta.json +51 -3
data/utils/enveomics/Manifest/Tasks/fastq.json +161 -126
data/utils/enveomics/Manifest/Tasks/graphics.json +111 -111
data/utils/enveomics/Manifest/Tasks/mapping.json +30 -0
data/utils/enveomics/Manifest/Tasks/ogs.json +308 -265
data/utils/enveomics/Manifest/Tasks/other.json +451 -449
data/utils/enveomics/Manifest/Tasks/remote.json +1 -1
data/utils/enveomics/Manifest/Tasks/sequence-identity.json +18 -10
data/utils/enveomics/Manifest/Tasks/tables.json +250 -250
data/utils/enveomics/Manifest/Tasks/trees.json +52 -52
data/utils/enveomics/Manifest/Tasks/variants.json +4 -4
data/utils/enveomics/Manifest/categories.json +12 -4
data/utils/enveomics/Manifest/examples.json +1 -1
data/utils/enveomics/Scripts/BedGraph.tad.rb +71 -0
data/utils/enveomics/Scripts/BlastTab.recplot2.R +23 -22
data/utils/enveomics/Scripts/FastA.split.rb +79 -0
data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
data/utils/enveomics/Scripts/JPlace.to_iToL.rb +272 -258
data/utils/enveomics/Scripts/aai.rb +13 -6
data/utils/enveomics/Scripts/ani.rb +2 -2
data/utils/enveomics/Scripts/clust.rand.rb +102 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +12 -14
data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +2 -2
data/utils/enveomics/Scripts/rbm.rb +23 -14
data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
data/utils/enveomics/enveomics.R/R/barplot.R +2 -2
metadata +9 -2

data/utils/enveomics/Manifest/Tasks/trees.json CHANGED Viewed

@@ -10,58 +10,58 @@
       "help_arg": "--help",
       "options": [
         {
-	  "name": "Input tree",
-	  "opt": "--t",
-	  "arg": "in_file",
-	  "mandatory": true,
-	  "description": "A tree to prune in Newick format."
-	},
-	{
-	  "opt": "--dist-quantile",
-	  "arg": "float",
-	  "default": 0.25,
-	  "description": "The quantile of edge lengths."
-	},
-	{
-	  "opt": "--min_dist",
-	  "arg": "float",
-	  "description": ["The minimum distance to allow between two tips. If",
-	    "not set, dist.quantile is used instead to calculate it."]
-	},
-	{
-	  "opt": "--quiet",
-	  "description": ["Boolean indicating if the function must run without",
-	    "output."]
-	},
-	{
-	  "opt": "--max_iters",
-	  "arg": "integer",
-	  "default": 1000,
-	  "description": "Maximum number of iterations."
-	},
-	{
-	  "opt": "--min_nodes_random",
-	  "arg": "integer",
-	  "default": 40000,
-	  "description": ["Minimum number of nodes to trigger 'tip-pairs'",
-	    "nodes sampling. This sampling is less reproducible and more",
-	    "computationally expensive, but it's the only solution if the",
-	    "cophenetic matrix exceeds 2^31-1 entries; above that, it cannot",
-	    "be represented in R."]
-	},
-	{
-	  "opt": "--random_nodes_frx",
-	  "arg": "float",
-	  "default": 1.0,
-	  "description": ["Fraction of the nodes to be sampled if more than",
-	    "'Min nodes random'."]
-	},
-	{
-	  "arg": "out_file",
-	  "mandatory": true,
-	  "description": ["Output file in Newick format containing the pruned",
-	    "tree."]
-	}
+          "name": "Input tree",
+          "opt": "--t",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "A tree to prune in Newick format."
+        },
+        {
+          "opt": "--dist-quantile",
+          "arg": "float",
+          "default": 0.25,
+          "description": "The quantile of edge lengths."
+        },
+        {
+          "opt": "--min_dist",
+          "arg": "float",
+          "description": ["The minimum distance to allow between two tips. If",
+            "not set, dist.quantile is used instead to calculate it."]
+        },
+        {
+          "opt": "--quiet",
+          "description": ["Boolean indicating if the function must run without",
+            "output."]
+        },
+        {
+          "opt": "--max_iters",
+          "arg": "integer",
+          "default": 1000,
+          "description": "Maximum number of iterations."
+        },
+        {
+          "opt": "--min_nodes_random",
+          "arg": "integer",
+          "default": 40000,
+          "description": ["Minimum number of nodes to trigger 'tip-pairs'",
+            "nodes sampling. This sampling is less reproducible and more",
+            "computationally expensive, but it's the only solution if the",
+            "cophenetic matrix exceeds 2^31-1 entries; above that, it cannot",
+            "be represented in R."]
+        },
+        {
+          "opt": "--random_nodes_frx",
+          "arg": "float",
+          "default": 1.0,
+          "description": ["Fraction of the nodes to be sampled if more than",
+            "'Min nodes random'."]
+        },
+        {
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["Output file in Newick format containing the pruned",
+            "tree."]
+        }
       ]
     }
   ]

data/utils/enveomics/Manifest/Tasks/variants.json CHANGED Viewed

@@ -58,10 +58,10 @@
           "description": "Minimum information content (in bits, from 0 to 1).",
           "default": 0.0
         },
-	{
-	  "opt": "--indels",
-	  "description": "Process indels."
-	}
+        {
+          "opt": "--indels",
+          "description": "Process indels."
+        }
       ]
     },
     {

data/utils/enveomics/Manifest/categories.json CHANGED Viewed

@@ -2,13 +2,15 @@
   "categories": {
     "Sequence similarity search": {
       "Statistics": [
+        "BedGraph.tad.rb",
         "BlastPairwise.AAsubs.pl",
         "BlastTab.advance.bash",
         "BlastTab.recplot2.R",
         "BlastTab.seqdepth.pl",
         "BlastTab.seqdepth_nomedian.pl",
         "BlastTab.seqdepth_ZIP.pl",
-        "BlastTab.sumPerHit.pl"
+        "BlastTab.sumPerHit.pl",
+        "FastQ.test-error.rb"
       ],
       "Manipulation": [
         "BlastTab.addlen.rb",
@@ -33,7 +35,8 @@
         "FastA.gc.pl",
         "FastA.length.pl",
         "FastA.N50.pl",
-        "FastA.qlen.pl"
+        "FastA.qlen.pl",
+        "FastQ.test-error.rb"
       ],
       "Manipulation": [
         "FastA.filter.pl",
@@ -41,11 +44,12 @@
         "FastA.filterN.pl",
         "FastA.fragment.rb",
         "FastA.interpose.pl",
-	"FastA.per_file.pl",
+        "FastA.per_file.pl",
         "FastA.rename.pl",
         "FastA.revcom.pl",
         "FastA.slider.pl",
         "FastA.split.pl",
+        "FastA.split.rb",
         "FastA.subsample.pl",
         "FastA.tag.rb",
         "FastA.wrap.rb",
@@ -86,7 +90,7 @@
         "Table.df2dist.R",
         "Table.filter.pl",
         "Table.merge.pl",
-	"Table.replace.rb",
+        "Table.replace.rb",
         "Table.round.rb",
         "Table.split.pl"
       ],
@@ -126,6 +130,10 @@
         "Aln.cat.rb",
         "Aln.convert.pl",
         "BlastPairwise.AAsubs.pl"
+      ],
+      "Clustering": [
+        "ogs.mcl.rb",
+        "clust.rand.rb"
       ]
     }
   }

data/utils/enveomics/Manifest/examples.json CHANGED Viewed

@@ -56,7 +56,7 @@
       "task": "BlastTab.recplot2.R",
       "description": ["Generates recruitment plots for a comparison",
         "between a virome containing HIV and the HIV-1 genome."],
-      "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,
+      "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,"NA",
         "hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
     },
     {

data/utils/enveomics/Scripts/BedGraph.tad.rb ADDED Viewed

@@ -0,0 +1,71 @@
+#!/usr/bin/env ruby
+require "optparse"
+o = {range:0.5}
+ARGV << "-h" if ARGV.empty?
+OptionParser.new do |opt|
+  opt.banner = "
+  Estimates the truncated average sequencing depth (TAD) from a BedGraph file.
+  IMPORTANT: This script doesn't consider zero-coverage positions if missing
+  from the file. If you produce your BedGraph file with bedtools genomecov and
+  want to consider zero-coverage position, be sure to use -bga (not -bg).
+  Usage: #{$0} [options]"
+  opt.separator ""
+  opt.on("-i", "--input PATH",
+    "Input BedGraph file (mandatory)."){ |v| o[:i]=v }
+  opt.on("-r", "--range FLOAT",
+    "Central range to consider, between 0 and 1.",
+    "By default: #{o[:range]} (inter-quartile range)."
+    ){ |v| o[:range]=v.to_f }
+  opt.on("-h", "--help", "Display this screen.") do
+    puts opt
+    exit
+  end
+  opt.separator ""
+end.parse!
+abort "-i is mandatory." if o[:i].nil?
+def pad(d, idx, r)
+  idx.each do |i|
+    next if d[i].nil?
+    d[i] -= r
+    break unless d[i] < 0
+    r = -d[i]
+    d[i] = nil
+  end
+  d
+end
+# Read BedGraph
+d = []
+ln = 0
+File.open(o[:i], "r") do |ifh|
+  ifh.each_line do |i|
+    next if i =~ /^#/
+    r = i.chomp.split("\t")[1 .. -1].map{ |j| j.to_i }
+    l = r[1]-r[0]
+    d[ r[2] ] ||= 0
+    d[ r[2] ] += l
+    ln += l
+  end
+end
+# Estimate padding ranges
+pad = (1.0-o[:range])/2.0
+r = (pad*ln).round
+# Pad
+d = pad(d, d.each_index.to_a, r+0)
+d = pad(d, d.each_index.to_a.reverse, r+0)
+# Average
+if d.compact.empty?
+  p 0.0
+else
+  s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0,:+)
+  p s.to_f/d.compact.inject(:+)
+end

data/utils/enveomics/Scripts/BlastTab.recplot2.R CHANGED Viewed

@@ -1,40 +1,41 @@
 #!/usr/bin/env Rscript
-#
 # @author  Luis M. Rodriguez-R
-# @update  Jan-05-2016
 # @license artistic license 2.0
-#
 #= Load stuff
 suppressPackageStartupMessages(library(enveomics.R))
-args <- commandArgs(trailingOnly = F)
+args <- commandArgs(trailingOnly = FALSE)
 enveomics_R <- file.path(dirname(
-   sub("^--file=", "", args[grep("^--file=", args)])),
-   "lib", "enveomics.R")
+  sub("^--file=", "", args[grep("^--file=", args)])),
+  "lib", "enveomics.R")
 #= Generate interface
 opt <- enve.cliopts(enve.recplot2,
-   file.path(enveomics_R, "man", "enve.recplot2.Rd"),
-   positional_arguments=c(1,4),
-   usage="usage: %prog [options] output.Rdata [output.pdf [width height]]",
-   mandatory=c("prefix"),
-   o_desc=list(pos.breaks="Breaks in the positions histogram.",
-     id.breaks="Breaks in the identity histogram.",
-     id.summary="Function summarizing the identity bins. By default: sum."),
-   p_desc=paste("","Produce recruitment plot objects provided that",
-     "BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
-   ignore=c("plot"),
-   defaults=c(id.metric="identity"))
+  file.path(enveomics_R, "man", "enve.recplot2.Rd"),
+  positional_arguments=c(1,4),
+  usage="usage: %prog [options] output.Rdata [output.pdf [width height]]",
+  mandatory=c("prefix"),
+  o_desc=list(pos.breaks="Breaks in the positions histogram.",
+    id.breaks="Breaks in the identity histogram.",
+    id.summary="Function summarizing the identity bins. By default: sum.",
+    peaks.col="Color of peaks, mandatory for peak-finding (e.g., darkred)."),
+  p_desc=paste("","Produce recruitment plot objects provided that",
+    "BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
+  ignore=c("plot"),
+  defaults=c(id.metric="identity", peaks.col=NA))
 #= Run it!
 if(length(opt$args)>1){
-   args = as.list(opt$args[-1])
-   for(i in 2:3) if(length(args)>=i) args[[i]] <- as.numeric(args[[i]])
-   do.call("pdf", args)
+  args = as.list(opt$args[-1])
+  for(i in 2:3) if(length(args)>=i) args[[i]] <- as.numeric(args[[i]])
+  do.call("pdf", args)
 }else{
-   opt$options[["plot"]] <- FALSE
+  opt$options[["plot"]] <- FALSE
 }
+pc <- opt$options[["peaks.col"]]
+if(!is.na(pc) && pc=="NA") opt$options[["peaks.col"]] <- NA
 rp <- do.call("enve.recplot2", opt$options)
 save(rp, file=opt$args[1])
 if(length(opt$args)>1) dev.off()

data/utils/enveomics/Scripts/FastA.split.rb ADDED Viewed

@@ -0,0 +1,79 @@
+#!/usr/bin/env ruby
+#
+# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
+# @license Artistic-2.0
+#
+require "optparse"
+o = {q:false, n:12, lett:false, dc:false, z:false, out:"%s.%s.fa"}
+ARGV << "-h" if ARGV.size==0
+OptionParser.new do |opt|
+  opt.banner = "
+  Evenly splits a multi-FastA file into multiple multi-FastA files.
+  Usage: #{$0} [options]"
+  opt.separator ""
+  opt.separator "Mandatory"
+  opt.on("-i", "--input PATH", "Input FastA file."){ |v| o[:i] = v}
+  opt.on("-p", "--prefix PATH", "Prefix of output FastA files."){ |v| o[:p] = v}
+  opt.separator ""
+  opt.separator "Options"
+  opt.on("-n", "--number INT",
+    "Number of output files to produce. By default: #{o[:n]}."
+    ){ |v| o[:n] = v.to_i }
+  opt.on("-z", "--zero-padded",
+    "Use zero-padded numbers as output index."){ o[:lett]=false; o[:z]=true }
+  opt.on("-l", "--lowercase-letters",
+    "Use lowercase letters as output index."){ o[:lett]=true ; o[:dc]=true }
+    opt.on("-u", "--uppercase-letters",
+    "Use uppercase letters as output index."){ o[:lett]=true }
+  opt.on("-o", "--out STR",
+    "Format of output filenames, where %s are replaced by prefix and index.",
+    "By default: #{o[:out]}."){ |v| o[:out] = v }
+  opt.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = TRUE }
+  opt.on("-h", "--help", "Display this screen.") do
+    puts opt
+    exit
+  end
+  opt.separator ""
+end.parse!
+abort "-i is mandatory." if o[:i].nil?
+abort "-p is mandatory." if o[:p].nil?
+ofh = []
+idx = if o[:lett]
+  k = Math::log(o[:n], 26).ceil
+  r = o[:dc] ? ["a","z"] : ["A","Z"]
+  ((r[0]*k) .. (r[1]*k)).first(o[:n])
+elsif o[:z]
+  k = Math::log(o[:n], 10).ceil
+  (1 .. o[:n]).map{ |i| "%0#{k}d" % i }
+else
+  (1 .. o[:n]).map{ |i| i.to_s }
+end
+idx.each do |i|
+  fn = o[:out] % [o[:p], i]
+  ofh << File.open(fn, "w")
+end
+i = -1
+seq = ""
+File.open(o[:i], "r") do |ifh|
+  ifh.each_line do |ln|
+    next if ln =~ /^;/
+    if ln =~ /^>/
+      ofh[i % o[:n]].print seq
+      i += 1
+      seq = ""
+    end
+    seq << ln
+  end
+  ofh[i % o[:n]].print seq
+end
+ofh.each{ |i| i.close }
+$stderr.puts "Sequences: #{i+1}.", "Files: #{o[:n]}." unless o[:q]

data/utils/enveomics/Scripts/FastQ.test-error.rb ADDED Viewed

@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+require 'optparse'
+o = {q:false, key:2}
+ARGV << '-h' if ARGV.empty?
+OptionParser.new do |opts|
+   opts.banner = "
+Compares the estimated error of sequencing reads (Q-score) with
+observed mismatches (identity against a know reference sequence).
+Usage: #{$0} [options]"
+  opts.separator ""
+  opts.separator "Mandatory"
+  opts.on("-f", "--fastq FILE",
+       "Path to the FastQ file containing the sequences."){ |v| o[:fastq] = v }
+  opts.on("-b", "--blast FILE",
+       "Path to the tabular BLAST file mapping reads to reference sequences."
+       ){ |v| o[:blast] = v }
+  opts.on("-o", "--out FILE",
+      "Path to the output tab-delimited file to create."){ |v| o[:out] = v }
+  opts.separator ""
+  opts.separator "Other Options"
+  opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = TRUE }
+  opts.on("-h", "--help", "Display this screen") do
+    puts opts
+    exit
+  end
+  opts.separator ""
+end.parse!
+abort "-f is mandatory" if o[:fastq].nil?
+abort "-b is mandatory" if o[:blast].nil?
+abort "-o is mandatory" if o[:out].nil?
+# Read the Q scores and estimate expected mismatches
+mm = {} # <- Hash with read IDs as key, and arrays as values:
+        #    [ expected mismatches, variance of mismatches, length ]
+$stderr.puts "Reading FastQ file" unless o[:q]
+File.open(o[:fastq], "r") do |fh|
+  id = nil
+  fh.each_line do |ln|
+    case $.%4
+    when 1
+      ln =~ /^@(\S+)/ or raise "Unexpected defline format: #{ln}"
+      id = $1
+      $stderr.print " #{mm.size} reads...\r" unless o[:q]
+    when 0
+      ln.chomp!
+      # I'm assuming ALWAYS Phred+33!!!
+      p = ln.split('').map{ |i| (i.ord - 33).to_f }.map{ |q| 10.0**(-q/10.0) }
+      mu = p.inject(:+)
+      var = p.map{ |i| i*(1.0-i) }.inject(:+)
+      mm[id] = [mu, var, p.size]
+    end
+  end
+  $stderr.puts " Found: #{mm.size} reads." unless o[:q]
+end
+ofh = File.open(o[:out], "w")
+ofh.puts %w[id obs_subs obs_id aln_len obs_ins obs_del obs_gap mu var len].join("\t")
+# Read Identities and compare against expectation
+$stderr.puts "Reading Tabular BLAST file" unless o[:q]
+File.open(o[:blast], "r") do |fh|
+  k = 0
+  fh.each_line do |ln|
+    r = ln.chomp.split("\t")
+    id = r[0]
+    next if mm[id].nil?
+    k += 1
+    $stderr.print " #{k} alignments...\r" unless o[:q]
+    obs_m = r[4].to_i + (r[6].to_i - 1) + (mm[id][2] - r[7].to_i)
+    obs_del = r[3].to_i - (r[7].to_i - r[6].to_i).abs
+    obs_ins = r[3].to_i - (r[9].to_i - r[8].to_i).abs
+    ofh.puts ([id, obs_m, r[2], r[7].to_i - r[6].to_i + 1,
+          obs_ins, obs_del, r[5]] + mm[id]).join("\t")
+  end
+  $stderr.puts " Found #{k} alignments." unless o[:q]
+end
+ofh.close