RubyGems - miga-base - Versions diffs - 0.3.9.0 → 0.3.9.1 - Mend

miga-base 0.3.9.0 → 0.3.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

checksums.yaml +4 -4
data/actions/add.rb +33 -33
data/actions/edit.rb +33 -0
data/actions/new.rb +17 -18
data/actions/next_step.rb +33 -0
data/actions/run.rb +15 -12
data/bin/miga +43 -37
data/lib/miga/daemon.rb +2 -2
data/lib/miga/project/result.rb +16 -1
data/lib/miga/version.rb +2 -2
data/scripts/aai_distances.bash +1 -3
data/scripts/ani_distances.bash +1 -3
data/scripts/assembly.bash +1 -3
data/scripts/cds.bash +1 -3
data/scripts/clade_finding.bash +1 -3
data/scripts/d.bash +13 -0
data/scripts/distances.bash +1 -3
data/scripts/essential_genes.bash +1 -3
data/scripts/haai_distances.bash +1 -3
data/scripts/miga.bash +12 -9
data/scripts/mytaxa.bash +1 -3
data/scripts/mytaxa_scan.bash +1 -3
data/scripts/ogs.bash +36 -33
data/scripts/p.bash +23 -0
data/scripts/project_stats.bash +1 -3
data/scripts/read_quality.bash +1 -3
data/scripts/ssu.bash +1 -3
data/scripts/stats.bash +1 -3
data/scripts/subclades.bash +1 -3
data/scripts/taxonomy.bash +1 -3
data/scripts/trimmed_fasta.bash +1 -3
data/scripts/trimmed_reads.bash +1 -3
data/test/daemon_test.rb +3 -3
data/utils/distance/runner.rb +1 -1
data/utils/enveomics/Docs/recplot2.md +13 -2
data/utils/enveomics/Examples/aai-matrix.bash +3 -3
data/utils/enveomics/Examples/ani-matrix.bash +3 -3
data/utils/enveomics/Makefile +2 -2
data/utils/enveomics/Manifest/Tasks/blasttab.json +12 -4
data/utils/enveomics/Manifest/Tasks/fasta.json +135 -0
data/utils/enveomics/Manifest/Tasks/other.json +49 -0
data/utils/enveomics/Manifest/categories.json +4 -0
data/utils/enveomics/Manifest/examples.json +1 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
data/utils/enveomics/Scripts/BlastTab.catsbj.pl +63 -65
data/utils/enveomics/Scripts/BlastTab.recplot2.R +4 -2
data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
data/utils/enveomics/Scripts/aai.rb +4 -3
data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
data/utils/enveomics/enveomics.R/DESCRIPTION +1 -2
data/utils/enveomics/enveomics.R/NAMESPACE +3 -3
data/utils/enveomics/enveomics.R/R/recplot.R +2 -3
data/utils/enveomics/enveomics.R/R/recplot2.R +221 -160
data/utils/enveomics/enveomics.R/R/utils.R +19 -1
data/utils/enveomics/enveomics.R/README.md +11 -0
data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +2 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -5
data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +10 -8
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +1 -1
data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +26 -0
data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +22 -0
data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +13 -7
data/utils/enveomics/enveomics.R/man/z$-methods.Rd +3 -4
data/utils/subclade/runner.rb +4 -0
metadata +14 -3

data/utils/enveomics/enveomics.R/R/recplot2.R CHANGED Viewed

@@ -9,6 +9,7 @@ setClass("enve.RecPlot2",
    id.counts='numeric',		##<< Counts per ID bin.
    id.breaks='numeric',		##<< Breaks of identity bins.
    pos.breaks='numeric',	##<< Breaks of position bins.
+   pos.names='character',       ##<< Names of the position bins.
    seq.breaks='numeric',	##<< Breaks of input sequences.
    peaks='list',                ##<< Peaks identified in the recplot.
    ### Limits of the subject sequences after concatenation.
@@ -70,10 +71,17 @@ plot.enve.RecPlot2 <- function
       ###   3: identity histogram,
       ###   4: Populations histogram (histogram of sequencing depths),
       ###   5: Color scale for the counts matrix (vertical),
-      ###   6: Color scale of the counts
-      ### matrix (horizontal). Only panels indicated here will be plotted. To
-      ### plot only one panel simply set this to the number of the panel you
-      ### want to plot.
+      ###   6: Color scale of the counts matrix (horizontal).
+      ### Only panels indicated here will be plotted. To plot only one panel
+      ### simply set this to the number of the panel you want to plot.
+      panel.fun=list(),
+      ### List of functions to be executed after drawing each panel. Use the
+      ### indices in `layout` (as characters) as keys. Functions for indices
+      ### missing in `layout` are ignored. For example, to add a vertical line
+      ### at the 3Mbp mark in both the position histogram and the counts matrix:
+      ### `list('1'=function() abline(v=3), '2'=function() abline(v=3))`.
+      ### Note that the X-axis in both panels is in Mbp by default. To change
+      ### this behavior, set `pos.units` accordingly.
       widths=c(1,7,2),
       ### Relative widths of the columns of `layout`.
       heights=c(1,2),
@@ -166,7 +174,7 @@ plot.enve.RecPlot2 <- function
 		  list(maxColorValue=256, alpha=52)));
    }
-   # Counts matrix
+   # [1] Counts matrix
    if(any(layout==1)){
       par(mar=mar[['1']]);
       plot(1, t='n', bty='l',
@@ -182,9 +190,10 @@ plot.enve.RecPlot2 <- function
       image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
 	 bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
 	 length.out=1+length(palette)), add=TRUE);
+      if(exists('1',panel.fun)) panel.fun[['1']]();
    }
-   # Position histogram
+   # [2] Position histogram
    if(any(layout==2)){
       par(mar=mar[['2']]);
       if(any(layout==1)){
@@ -213,9 +222,10 @@ plot.enve.RecPlot2 <- function
       if(any(pos.counts.in==0))  rect(pos.breaks[c(pos.counts.in==0,FALSE)],
 	       seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
 	       seqdepth.lim[1]*3/2, col=in.col,  border=NA);
+      if(exists('2',panel.fun)) panel.fun[['2']]();
    }
-   # Identity histogram
+   # [3] Identity histogram
    if(any(layout==3)){
       par(mar=mar[['3']]);
       if(any(layout==1)){
@@ -248,9 +258,10 @@ plot.enve.RecPlot2 <- function
 	 plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
 	 text(1,1,labels='Insufficient data', srt=90)
       }
+      if(exists('3',panel.fun)) panel.fun[['3']]();
    }
-   # Populations histogram
+   # [4] Populations histogram
    peaks <- NA;
    if(any(layout==4)){
       par(mar=mar[['4']]);
@@ -308,9 +319,10 @@ plot.enve.RecPlot2 <- function
                     dpt,'X (', frx, '%', err, ')', sep=''))
 	 }
       }
+      if(exists('4',panel.fun)) panel.fun[['4']]();
    }
-   # Color scale
+   # [5] Color scale of the counts matrix (vertical)
    count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
       length.out=1+length(palette))
    if(any(layout==5)){
@@ -319,13 +331,17 @@ plot.enve.RecPlot2 <- function
 	 ylim=range(count.bins), yaxs='i', ylab='')
       rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
 	 border=NA)
+      if(exists('5',panel.fun)) panel.fun[['5']]();
    }
+   # [6] Color scale of the counts matrix (horizontal)
    if(any(layout==6)){
       par(mar=mar[['6']]);
       plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
 	 xlim=range(count.bins), xaxs='i',xlab='');
       rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
 	 border=NA);
+      if(exists('6',panel.fun)) panel.fun[['6']]();
    }
    par(mar=ori.mar);
@@ -337,113 +353,129 @@ plot.enve.RecPlot2 <- function
 #==============> Define core functions
 enve.recplot2 <- function(
-   ### Produces recruitment plots provided that BlastTab.catsbj.pl has
-   ### been previously executed.
-      prefix,
-      ### Path to the prefix of the BlastTab.catsbj.pl output files. At
-      ### least the files .rec and .lim must exist with this prefix.
-      plot=TRUE,
-      ### Should the object be plotted?
-      pos.breaks=1e3,
-      ### Breaks in the positions histogram. It can also be a vector of break
-      ### points, and values outside the range are ignored. If zero (0), it
-      ### uses the sequence breaks as defined in the .lim file, which means
-      ### one bin per contig (or gene, if the mapping is agains genes).
-      id.breaks=300,
-      ### Breaks in the identity histogram. It can also be a vector of break
-      ### points, and values outside the range are ignored.
-      id.free.range=FALSE,
-      ### Indicates that the range should be freely set from the observed
-      ### values. Otherwise, 70-100% is included in the identity histogram
-      ### (default).
-      id.metric=c('identity', 'corrected identity', 'bit score'),
-      ### Metric of identity to be used (Y-axis). Corrected identity is only
-      ### supported if the original BLAST file included sequence lengths.
-      id.summary=sum,
-      ### Function summarizing the identity bins. Other recommended options
-      ### include: `median` to estimate the median instead of total bins, and
-      ### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
-      id.cutoff=95,
-      ### Cutoff of identity metric above which the hits are considered
-      ### 'in-group'. The 95% identity corresponds to the expectation of
-      ### ANI<95% within species.
-      threads=2,
-      ### Number of threads to use.
-      verbose=TRUE,
-      ### Indicates if the function should report the advance.
-      ...
-      ### Any additional parameters supported by `plot.enve.RecPlot2`.
-   ){
-   # Settings
-   id.metric <- match.arg(id.metric);
-   #Read files
-   if(verbose) cat("Reading files.\n")
-   rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
-      quote="");
-   lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
-      quote="", as.is=TRUE);
-   # Build matrix
-   if(verbose) cat("Building counts matrix.\n")
-   if(id.metric=="corrected identity" & ncol(rec)<6){
-      stop("Requesting corr. identity, but .rec file doesn't have 6th column")
-   }
-   rec.idcol <- ifelse(id.metric=="identity", 3,
-      ifelse(id.metric=="corrected identity", 6, 4));
-   if(length(pos.breaks)==1){
-      if(pos.breaks>0){
-         pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
-      }else{
-         pos.breaks <- c(lim[1,2], lim[,3])
-      }
-   }
-   if(length(id.breaks)==1){
-      id.range.v <- rec[,rec.idcol]
-      if(!id.free.range) id.range.v <- c(id.range.v,70,100)
-      id.range.v <- range(id.range.v)
-      id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
-   }
-   # Run in parallel
-   if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
-   cl		<- makeCluster(threads)
-   rec.l	<- list()
-   thl		<- ceiling(nrow(rec)/threads)
-   for(i in 0:(threads-1)){
-      rec.l[[i+1]] <- list(rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
-			verbose=ifelse(i==0, verbose, FALSE))
-   }
-   counts.l	<- clusterApply(cl, rec.l, enve.recplot2.__counts,
-			pos.breaks=pos.breaks, id.breaks=id.breaks,
-			rec.idcol=rec.idcol)
-   counts	<- counts.l[[1]]
-   if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
-   stopCluster(cl)
-   # Estimate 1D histograms
-   if(verbose) cat("Building histograms.\n")
-   id.mids	<- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
-   id.ingroup	<- (id.mids > id.cutoff);
-   id.counts	<- apply(counts, 2, id.summary);
-   pos.counts.in   <- apply(counts[,id.ingroup], 1, sum);
-   pos.counts.out  <- apply(counts[,!id.ingroup], 1, sum);
+  ### Produces recruitment plots provided that BlastTab.catsbj.pl has
+  ### been previously executed.
+    prefix,
+    ### Path to the prefix of the BlastTab.catsbj.pl output files. At
+    ### least the files .rec and .lim must exist with this prefix.
+    plot=TRUE,
+    ### Should the object be plotted?
+    pos.breaks=1e3,
+    ### Breaks in the positions histogram. It can also be a vector of break
+    ### points, and values outside the range are ignored. If zero (0), it
+    ### uses the sequence breaks as defined in the .lim file, which means
+    ### one bin per contig (or gene, if the mapping is against genes). Ignored
+    ### if `pos.breaks.tsv` is passed.
+    pos.breaks.tsv=NA,
+    ### Path to a list of (absolute) coordinates to use as position breaks.
+    ### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
+    ### must contain at least one column: coordinates of the break positions of
+    ### each position bin. If it has a second column, this is used as the name
+    ### of the position bin that ends at the given coordinate (the first row is
+    ### ignored). Any additional columns are currently ignored. If NA,
+    ### position bins are determined by `pos.breaks`.
+    id.breaks=300,
+    ### Breaks in the identity histogram. It can also be a vector of break
+    ### points, and values outside the range are ignored.
+    id.free.range=FALSE,
+    ### Indicates that the range should be freely set from the observed
+    ### values. Otherwise, 70-100% is included in the identity histogram
+    ### (default).
+    id.metric=c('identity', 'corrected identity', 'bit score'),
+    ### Metric of identity to be used (Y-axis). Corrected identity is only
+    ### supported if the original BLAST file included sequence lengths.
+    id.summary=sum,
+    ### Function summarizing the identity bins. Other recommended options
+    ### include: `median` to estimate the median instead of total bins, and
+    ### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
+    id.cutoff=95,
+    ### Cutoff of identity metric above which the hits are considered
+    ### 'in-group'. The 95% identity corresponds to the expectation of
+    ### ANI<95% within species.
+    threads=2,
+    ### Number of threads to use.
+    verbose=TRUE,
+    ### Indicates if the function should report the advance.
+    ...
+    ### Any additional parameters supported by `plot.enve.RecPlot2`.
+  ){
+  # Settings
+  id.metric <- match.arg(id.metric);
-   # Plot and return
-   recplot <- new('enve.RecPlot2',
-      counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
-      pos.counts.out=pos.counts.out,
-      id.breaks=id.breaks, pos.breaks=pos.breaks,
-      seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
-      id.ingroup=id.ingroup,id.metric=id.metric,
-      call=match.call());
-   if(plot){
-      if(verbose) cat("Plotting.\n")
-      peaks <- plot(recplot, ...);
-      attr(recplot, "peaks") <- peaks
-   }
-   return(recplot);
-   ### Returns an object of class `enve.RecPlot2`.
+  #Read files
+  if(verbose) cat("Reading files.\n")
+  rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
+        quote="");
+  lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
+        quote="", as.is=TRUE);
+  # Build matrix
+  if(verbose) cat("Building counts matrix.\n")
+  if(id.metric=="corrected identity" & ncol(rec)<6){
+    stop("Requesting corr. identity, but .rec file doesn't have 6th column")
+  }
+  rec.idcol <- ifelse(id.metric=="identity", 3,
+        ifelse(id.metric=="corrected identity", 6, 4))
+  pos.names <- as.character(NULL)
+  if(!is.na(pos.breaks.tsv)){
+    tmp <- read.table(pos.breaks.tsv, sep='\t', header=FALSE, as.is=TRUE)
+    pos.breaks <- as.numeric(tmp[,1])
+    if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
+  }else if(length(pos.breaks)==1){
+    if(pos.breaks>0){
+      pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
+    }else{
+      pos.breaks <- c(lim[1,2], lim[,3])
+      pos.names  <- lim[,1]
+    }
+  }
+  if(length(id.breaks)==1){
+    id.range.v <- rec[,rec.idcol]
+    if(!id.free.range) id.range.v <- c(id.range.v,70,100)
+    id.range.v <- range(id.range.v)
+    id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
+  }
+  # Run in parallel
+  if(nrow(rec) < 200) threads <- 1 # It isn't worth the overhead
+  cl    <- makeCluster(threads)
+  rec.l <- list()
+  thl   <- ceiling(nrow(rec)/threads)
+  for(i in 0:(threads-1)){
+    rec.l[[i+1]] <- list(
+          rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
+          verbose=ifelse(i==0, verbose, FALSE))
+  }
+  counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
+                pos.breaks=pos.breaks, id.breaks=id.breaks,
+                rec.idcol=rec.idcol)
+  counts   <- counts.l[[1]]
+  if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
+  stopCluster(cl)
+  # Estimate 1D histograms
+  if(verbose) cat("Building histograms.\n")
+  id.mids	<- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
+  id.ingroup	<- (id.mids > id.cutoff);
+  id.counts	<- apply(counts, 2, id.summary);
+  pos.counts.in   <- apply(counts[,id.ingroup], 1, sum);
+  pos.counts.out  <- apply(counts[,!id.ingroup], 1, sum);
+  # Plot and return
+  recplot <- new('enve.RecPlot2',
+    counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
+    pos.counts.out=pos.counts.out,
+    id.breaks=id.breaks, pos.breaks=pos.breaks, pos.names=pos.names,
+    seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
+    id.ingroup=id.ingroup,id.metric=id.metric,
+    call=match.call());
+  if(plot){
+    if(verbose) cat("Plotting.\n")
+    peaks <- plot(recplot, ...);
+    attr(recplot, "peaks") <- peaks
+  }
+  return(recplot);
+  ### Returns an object of class `enve.RecPlot2`.
 }
 enve.recplot2.findPeaks <- function(
@@ -502,9 +534,11 @@ enve.recplot2.findPeaks.emauto <- function(
     stop('Invalid criterion ', criterion)
   }
   for(comp in components){
+    if(verbose) cat('Testing:',comp,'\n')
     best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
           verbose, ...)
   }
+  if(length(best[['peaks']])==0) return(list())
   seqdepths.r <- signif(log(sapply(best[['peaks']],
         function(x) x$seq.depth)), merge.tol)
@@ -609,7 +643,7 @@ enve.recplot2.findPeaks.mower <- function(
       ### Range of quantiles to be used in the estimation of a peak's
       ### parameters.
       mlv.opts=list(method='parzen'),
-      ### Options passed to `mlv` to estimate the mode.
+      ### Ignored. For backwards compatibility.
       fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
 	 start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
       ### Options passed to `fitdist` to estimate the standard deviation if
@@ -758,53 +792,75 @@ enve.recplot2.changeCutoff <- function
    return(rp)
 }
-enve.recplot2.extractWindows <- function
-   ### Extract windows significantly below (or above) the peak in sequencing
-   ### depth.
-      (rp,
-      ### Recruitment plot, a enve.RecPlot2 object.
-      peak,
-      ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
-      ### list of enve.RecPlot2.Peak objects, in which case the core peak is
-      ### used (see `enve.recplot2.corePeak`).
-      lower.tail=TRUE,
-      ### If FALSE, it returns windows significantly above the peak in
-      ### sequencing depth.
-      significance=0.05,
-      ### Significance threshold (alpha) to select windows.
-      seq.names=FALSE
-      ### Returns subject sequence names instead of a vector of Booleans. If
-      ### the recruitment plot was generated with pos.breaks=0 it returns a
-      ### vector of characters (the sequence identifiers), otherwise it returns
-      ### a data.frame with a name column and two columns of coordinates.
-      ){
-   # Determine the threshold
+enve.recplot2.windowDepthThreshold <- function
+  ### Identifies the threshold below which windows should be identified as
+  ### variable or absent.
+    (rp,
+    ### Recruitment plot, an `enve.RecPlot2` object.
+    peak,
+    ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
+    ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
+    ### used (see `enve.recplot2.corePeak`).
+    lower.tail=TRUE,
+    ### If FALSE, it returns windows significantly above the peak in
+    ### sequencing depth.
+    significance=0.05
+    ### Significance threshold (alpha) to select windows.
+    ){
    if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
    par <- peak$param.hat
    par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
    thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
    if(peak$log) thr <- exp(thr)
-   # Select windows past the threshold
-   seqdepth.in <- enve.recplot2.seqdepth(rp)
-   if(lower.tail){
-      sel <- seqdepth.in < thr
-   }else{
-      sel <- seqdepth.in > thr
-   }
-   # seq.names=FALSE
-   if(!seq.names) return(sel)
-   # seq.names=TRUE and pos.breaks=0
-   if(length(rp$pos.breaks)==length(rp$seq.breaks) &&
-         rp$pos.breaks==rp$seq.breaks)
-           return(rp$seq.names[sel])
-   # seq.names=TRUE and pos.breaks!=0
-   return(enve.recplot2.coordinates(rp,sel))
-   ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
-   ### it returns a vector of characters if the object was built with
-   ### `pos.breaks=0` or a data.frame with four columns otherwise: name.from,
-   ### name.to, pos.from, and pos.to (see `enve.recplot2.coordinates`).
+   return(thr)
+   ### Returns a float, always in units of sequencing depth (linear scale):
+   ### log-scale estimates (`peak$log`) are transformed back with `exp`.
+}
+enve.recplot2.extractWindows <- function
+  ### Extract windows significantly below (or above) the peak in sequencing
+  ### depth.
+    (rp,
+    ### Recruitment plot, an `enve.RecPlot2` object.
+    peak,
+    ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
+    ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
+    ### used (see `enve.recplot2.corePeak`).
+    lower.tail=TRUE,
+    ### If FALSE, it returns windows significantly above the peak in
+    ### sequencing depth.
+    significance=0.05,
+    ### Significance threshold (alpha) to select windows.
+    seq.names=FALSE
+    ### Returns subject sequence names instead of a vector of Booleans. If
+    ### the recruitment plot was generated with named position bins (e.g., using
+    ### `pos.breaks=0` or a two-column `pos.breaks.tsv`), it returns a vector of
+    ### characters (the sequence identifiers), otherwise it returns a data.frame
+    ### with a name column and two columns of coordinates.
+  ){
+  # Determine the threshold
+  thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
+  # Select windows past the threshold
+  seqdepth.in <- enve.recplot2.seqdepth(rp)
+  if(lower.tail){
+    sel <- seqdepth.in < thr
+  }else{
+    sel <- seqdepth.in > thr
+  }
+  # seq.names=FALSE
+  if(!seq.names) return(sel)
+  # seq.names=TRUE and pos.names defined
+  if(length(rp$pos.names) != 0) return(rp$pos.names[sel])
+  # seq.names=TRUE and pos.names undefined
+  return(enve.recplot2.coordinates(rp,sel))
+  ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
+  ### it returns a vector of characters if the object has `pos.names` defined,
+  ### or a data.frame with four columns otherwise:
+  ### name.from, name.to, pos.from, and pos.to
+  ### (see `enve.recplot2.coordinates`).
 }
 enve.recplot2.compareIdentities <- function
@@ -931,7 +987,11 @@ enve.recplot2.seqdepth <- function
     ){
   if(!inherits(x, "enve.RecPlot2"))
     stop("'x' must inherit from class `enve.RecPlot2`")
-  pos.cnts.in <- x$pos.counts.in
+  if(low.identity){
+    pos.cnts.in <- x$pos.counts.out
+  }else{
+    pos.cnts.in <- x$pos.counts.in
+  }
   pos.breaks  <- x$pos.breaks
   pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
   seqdepth.in <- pos.cnts.in/pos.binsize
@@ -987,6 +1047,7 @@ enve.recplot2.findPeaks.__emauto_one <- function
   ### Internal ancillary function (see `enve.recplot2.findPeaks.emauto`).
     (x, comp, do_crit, best, verbose, ...){
   peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
+  if(length(peaks)==0) return(best)
   k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
   crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
   if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
@@ -1049,7 +1110,7 @@ enve.recplot2.findPeaks.__mow_one <- function
    # Find peak
    o <- mlv.opts; o$x = lsd1;
-   mode1 <- do.call(mlv, o)$M;
+   mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
    if(verbose) cat('Anchoring at mode =',mode1,'\n')
    param.hat <- fitdist.opts$start; last.hat <- param.hat;
    lim <- NA;

data/utils/enveomics/enveomics.R/R/utils.R CHANGED Viewed

@@ -7,10 +7,28 @@ enve.col.alpha <- function
     ### such as 'darkred' or '#009988'.
     alpha=1/2
     ### Alpha value to add to the color, from 0 to 1.
-  ){
+    ){
   return(
     apply(col2rgb(col), 2,
       function(x) do.call(rgb, as.list(c(x[1:3]/256, alpha))) ) )
   ### Returns a color or a vector of colors in hex notation including alpha.
 }
+enve.truncate <- function
+  ### Removes the `n` highest and lowest values from a vector, and applies a
+  ### summary function. The value of `n` is determined such that the central
+  ### range is used, corresponding to the `f` fraction of values.
+    (x,
+    ### A vector of numbers.
+    f=0.95,
+    ### The fraction of values to retain.
+    FUN=mean
+    ### Summary function to apply to the vectors. To obtain the truncated
+    ### vector itself, use `c`.
+    ){
+  n <- round(length(x)*(1-f)/2)
+  y <- sort(x)[ -c(seq(1, n), seq(length(x)+1-n, length(x))) ]
+  return(FUN(y))
+  ### Returns the summary (`FUN`) of the truncated vector.
+}

data/utils/enveomics/enveomics.R/README.md CHANGED Viewed

@@ -32,6 +32,7 @@ And open help messages using any of the following commands:
 ?enve.recplot2.changeCutoff
 ?enve.recplot2.findPeaks
 ?enve.recplot2.corePeak
+?enve.recplot2.windowDepthThreshold
 ?enve.recplot2.extractWindows
 ?enve.recplot2.coordinates
 ?enve.recplot2.seqdepth
@@ -41,6 +42,7 @@ And open help messages using any of the following commands:
 ?enve.tribs.test
 ?enve.growthcurve
 ?enve.col.alpha
+?enve.truncate
 ```
 You can run some examples using these libraries in the
@@ -50,6 +52,15 @@ For additional information on recruitment plots, see the
 [Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
 ## Changelog
+* 1.4.4: Removes the modeest library as a requirement, and changes the mower
+  peak-finder initialization to the median (instead of the mode).
+* 1.4.2: Solved bug #36.
+* 1.4.0: New option `pos.breaks.tsv` for `enve.recplot2`.
+* 1.3.4: Gracefully handles and plots recruitment plots with insufficient data
+  to find peaks.
+* 1.3.3: New function `enve.recplot2.windowDepthThreshold`.
+* 1.3.2: New option `panel.fun` for `plot.enve.RecPlot2`.
+* 1.3.1: New function `enve.truncate`.
 * 1.3: Several bug fixes and new utilities for recruitment plots (recplot2).
 * 1.1.0: New function enve.growthcurve and related class enve.GrowthCurve
   with S3 methods plot and summary.

data/utils/enveomics/enveomics.R/man/enve.recplot.Rd CHANGED Viewed

@@ -37,7 +37,7 @@ If non-zero, requires the stats package.}
   \item{main}{Title of the plot.}
   \item{contig.col}{Color of the Contig boundaries. Set to NA to ignore Contig boundaries.}
   \item{ret.recplot}{Indicates if the matrix of the recruitment plot is to be returned.}
-  \item{ret.hist}{Indicates if the vectors of the identity and position histograms are to be returned.}
+  \item{ret.hist}{Ignored, for backwards compatibility.}
   \item{ret.mode}{Indicates if the mode of the identity is to be computed. It requires the modeest
 package.}
   \item{id.cutoff}{Minimum identity to consider an alignment as "top". By default, it is 0.95 for the
@@ -59,7 +59,7 @@ id.mean: Mean identity.
 id.median: Median identity.
-id.mode (if ret.mode=TRUE): Mode of the identity.
+id.mode (if ret.mode=TRUE): Mode of the identity. Deprecated.
 id.hist (if ret.hist=TRUE): Values of the identity histogram.

data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd CHANGED Viewed

@@ -17,6 +17,7 @@ be produced by `enve.recplot2` and supports S4 method plot.}
     \item{\code{id.counts}:}{(\code{numeric}) Counts per ID bin.}
     \item{\code{id.breaks}:}{(\code{numeric}) Breaks of identity bins.}
     \item{\code{pos.breaks}:}{(\code{numeric}) Breaks of position bins.}
+    \item{\code{pos.names}:}{(\code{character}) Names of the position bins.}
     \item{\code{seq.breaks}:}{(\code{numeric}) Breaks of input sequences.}
     \item{\code{peaks}:}{(\code{list}) Peaks identified in the recplot.
 Limits of the subject sequences after concatenation.}

data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd CHANGED Viewed

@@ -3,10 +3,10 @@
 \title{enve recplot2}
 \description{Produces recruitment plots provided that BlastTab.catsbj.pl has
 been previously executed.}
-\usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000, id.breaks = 300,
-    id.free.range = FALSE, id.metric = c("identity", "corrected identity",
-        "bit score"), id.summary = sum, id.cutoff = 95, threads = 2,
-    verbose = TRUE, ...)}
+\usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000, pos.breaks.tsv = NA,
+    id.breaks = 300, id.free.range = FALSE, id.metric = c("identity",
+        "corrected identity", "bit score"), id.summary = sum,
+    id.cutoff = 95, threads = 2, verbose = TRUE, ...)}
 \arguments{
   \item{prefix}{Path to the prefix of the BlastTab.catsbj.pl output files. At
 least the files .rec and .lim must exist with this prefix.}
@@ -14,7 +14,15 @@ least the files .rec and .lim must exist with this prefix.}
   \item{pos.breaks}{Breaks in the positions histogram. It can also be a vector of break
 points, and values outside the range are ignored. If zero (0), it
 uses the sequence breaks as defined in the .lim file, which means
-one bin per contig (or gene, if the mapping is agains genes).}
+one bin per contig (or gene, if the mapping is against genes). Ignored
+if `pos.breaks.tsv` is passed.}
+  \item{pos.breaks.tsv}{Path to a list of (absolute) coordinates to use as position breaks.
+This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
+must contain at least one column: coordinates of the break positions of
+each position bin. If it has a second column, this is used as the name
+of the position bin that ends at the given coordinate (the first row is
+ignored). Any additional columns are currently ignored. If NA,
+position bins are determined by `pos.breaks`.}
   \item{id.breaks}{Breaks in the identity histogram. It can also be a vector of break
 points, and values outside the range are ignored.}
   \item{id.free.range}{Indicates that the range should be freely set from the observed