miga-base 0.3.9.0 → 0.3.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/actions/add.rb +33 -33
  3. data/actions/edit.rb +33 -0
  4. data/actions/new.rb +17 -18
  5. data/actions/next_step.rb +33 -0
  6. data/actions/run.rb +15 -12
  7. data/bin/miga +43 -37
  8. data/lib/miga/daemon.rb +2 -2
  9. data/lib/miga/project/result.rb +16 -1
  10. data/lib/miga/version.rb +2 -2
  11. data/scripts/aai_distances.bash +1 -3
  12. data/scripts/ani_distances.bash +1 -3
  13. data/scripts/assembly.bash +1 -3
  14. data/scripts/cds.bash +1 -3
  15. data/scripts/clade_finding.bash +1 -3
  16. data/scripts/d.bash +13 -0
  17. data/scripts/distances.bash +1 -3
  18. data/scripts/essential_genes.bash +1 -3
  19. data/scripts/haai_distances.bash +1 -3
  20. data/scripts/miga.bash +12 -9
  21. data/scripts/mytaxa.bash +1 -3
  22. data/scripts/mytaxa_scan.bash +1 -3
  23. data/scripts/ogs.bash +36 -33
  24. data/scripts/p.bash +23 -0
  25. data/scripts/project_stats.bash +1 -3
  26. data/scripts/read_quality.bash +1 -3
  27. data/scripts/ssu.bash +1 -3
  28. data/scripts/stats.bash +1 -3
  29. data/scripts/subclades.bash +1 -3
  30. data/scripts/taxonomy.bash +1 -3
  31. data/scripts/trimmed_fasta.bash +1 -3
  32. data/scripts/trimmed_reads.bash +1 -3
  33. data/test/daemon_test.rb +3 -3
  34. data/utils/distance/runner.rb +1 -1
  35. data/utils/enveomics/Docs/recplot2.md +13 -2
  36. data/utils/enveomics/Examples/aai-matrix.bash +3 -3
  37. data/utils/enveomics/Examples/ani-matrix.bash +3 -3
  38. data/utils/enveomics/Makefile +2 -2
  39. data/utils/enveomics/Manifest/Tasks/blasttab.json +12 -4
  40. data/utils/enveomics/Manifest/Tasks/fasta.json +135 -0
  41. data/utils/enveomics/Manifest/Tasks/other.json +49 -0
  42. data/utils/enveomics/Manifest/categories.json +4 -0
  43. data/utils/enveomics/Manifest/examples.json +1 -1
  44. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
  45. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
  46. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
  47. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
  48. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +63 -65
  49. data/utils/enveomics/Scripts/BlastTab.recplot2.R +4 -2
  50. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  51. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  52. data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
  53. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  54. data/utils/enveomics/Scripts/aai.rb +4 -3
  55. data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
  56. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -2
  57. data/utils/enveomics/enveomics.R/NAMESPACE +3 -3
  58. data/utils/enveomics/enveomics.R/R/recplot.R +2 -3
  59. data/utils/enveomics/enveomics.R/R/recplot2.R +221 -160
  60. data/utils/enveomics/enveomics.R/R/utils.R +19 -1
  61. data/utils/enveomics/enveomics.R/README.md +11 -0
  62. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +2 -2
  63. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -0
  64. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -5
  65. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +10 -8
  66. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +1 -1
  67. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +26 -0
  68. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +22 -0
  69. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +13 -7
  70. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +3 -4
  71. data/utils/subclade/runner.rb +4 -0
  72. metadata +14 -3
@@ -9,6 +9,7 @@ setClass("enve.RecPlot2",
9
9
  id.counts='numeric', ##<< Counts per ID bin.
10
10
  id.breaks='numeric', ##<< Breaks of identity bins.
11
11
  pos.breaks='numeric', ##<< Breaks of position bins.
12
+ pos.names='character', ##<< Names of the position bins.
12
13
  seq.breaks='numeric', ##<< Breaks of input sequences.
13
14
  peaks='list', ##<< Peaks identified in the recplot.
14
15
  ### Limits of the subject sequences after concatenation.
@@ -70,10 +71,17 @@ plot.enve.RecPlot2 <- function
70
71
  ### 3: identity histogram,
71
72
  ### 4: Populations histogram (histogram of sequencing depths),
72
73
  ### 5: Color scale for the counts matrix (vertical),
73
- ### 6: Color scale of the counts
74
- ### matrix (horizontal). Only panels indicated here will be plotted. To
75
- ### plot only one panel simply set this to the number of the panel you
76
- ### want to plot.
74
+ ### 6: Color scale of the counts matrix (horizontal).
75
+ ### Only panels indicated here will be plotted. To plot only one panel
76
+ ### simply set this to the number of the panel you want to plot.
77
+ panel.fun=list(),
78
+ ### List of functions to be executed after drawing each panel. Use the
79
+ ### indices in `layout` (as characters) as keys. Functions for indices
80
+ ### missing in `layout` are ignored. For example, to add a vertical line
81
+ ### at the 3Mbp mark in both the position histogram and the counts matrix:
82
+ ### `list('1'=function() abline(v=3), '2'=function() abline(v=3))`.
83
+ ### Note that the X-axis in both panels is in Mbp by default. To change
84
+ ### this behavior, set `pos.units` accordingly.
77
85
  widths=c(1,7,2),
78
86
  ### Relative widths of the columns of `layout`.
79
87
  heights=c(1,2),
@@ -166,7 +174,7 @@ plot.enve.RecPlot2 <- function
166
174
  list(maxColorValue=256, alpha=52)));
167
175
  }
168
176
 
169
- # Counts matrix
177
+ # [1] Counts matrix
170
178
  if(any(layout==1)){
171
179
  par(mar=mar[['1']]);
172
180
  plot(1, t='n', bty='l',
@@ -182,9 +190,10 @@ plot.enve.RecPlot2 <- function
182
190
  image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
183
191
  bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
184
192
  length.out=1+length(palette)), add=TRUE);
193
+ if(exists('1',panel.fun)) panel.fun[['1']]();
185
194
  }
186
195
 
187
- # Position histogram
196
+ # [2] Position histogram
188
197
  if(any(layout==2)){
189
198
  par(mar=mar[['2']]);
190
199
  if(any(layout==1)){
@@ -213,9 +222,10 @@ plot.enve.RecPlot2 <- function
213
222
  if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
214
223
  seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
215
224
  seqdepth.lim[1]*3/2, col=in.col, border=NA);
225
+ if(exists('2',panel.fun)) panel.fun[['2']]();
216
226
  }
217
227
 
218
- # Identity histogram
228
+ # [3] Identity histogram
219
229
  if(any(layout==3)){
220
230
  par(mar=mar[['3']]);
221
231
  if(any(layout==1)){
@@ -248,9 +258,10 @@ plot.enve.RecPlot2 <- function
248
258
  plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
249
259
  text(1,1,labels='Insufficient data', srt=90)
250
260
  }
261
+ if(exists('3',panel.fun)) panel.fun[['3']]();
251
262
  }
252
263
 
253
- # Populations histogram
264
+ # [4] Populations histogram
254
265
  peaks <- NA;
255
266
  if(any(layout==4)){
256
267
  par(mar=mar[['4']]);
@@ -308,9 +319,10 @@ plot.enve.RecPlot2 <- function
308
319
  dpt,'X (', frx, '%', err, ')', sep=''))
309
320
  }
310
321
  }
322
+ if(exists('4',panel.fun)) panel.fun[['4']]();
311
323
  }
312
324
 
313
- # Color scale
325
+ # [5] Color scale of the counts matrix (vertical)
314
326
  count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
315
327
  length.out=1+length(palette))
316
328
  if(any(layout==5)){
@@ -319,13 +331,17 @@ plot.enve.RecPlot2 <- function
319
331
  ylim=range(count.bins), yaxs='i', ylab='')
320
332
  rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
321
333
  border=NA)
334
+ if(exists('5',panel.fun)) panel.fun[['5']]();
322
335
  }
336
+
337
+ # [6] Color scale of the counts matrix (horizontal)
323
338
  if(any(layout==6)){
324
339
  par(mar=mar[['6']]);
325
340
  plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
326
341
  xlim=range(count.bins), xaxs='i',xlab='');
327
342
  rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
328
343
  border=NA);
344
+ if(exists('6',panel.fun)) panel.fun[['6']]();
329
345
  }
330
346
 
331
347
  par(mar=ori.mar);
@@ -337,113 +353,129 @@ plot.enve.RecPlot2 <- function
337
353
 
338
354
  #==============> Define core functions
339
355
  enve.recplot2 <- function(
340
- ### Produces recruitment plots provided that BlastTab.catsbj.pl has
341
- ### been previously executed.
342
- prefix,
343
- ### Path to the prefix of the BlastTab.catsbj.pl output files. At
344
- ### least the files .rec and .lim must exist with this prefix.
345
- plot=TRUE,
346
- ### Should the object be plotted?
347
- pos.breaks=1e3,
348
- ### Breaks in the positions histogram. It can also be a vector of break
349
- ### points, and values outside the range are ignored. If zero (0), it
350
- ### uses the sequence breaks as defined in the .lim file, which means
351
- ### one bin per contig (or gene, if the mapping is agains genes).
352
- id.breaks=300,
353
- ### Breaks in the identity histogram. It can also be a vector of break
354
- ### points, and values outside the range are ignored.
355
- id.free.range=FALSE,
356
- ### Indicates that the range should be freely set from the observed
357
- ### values. Otherwise, 70-100% is included in the identity histogram
358
- ### (default).
359
- id.metric=c('identity', 'corrected identity', 'bit score'),
360
- ### Metric of identity to be used (Y-axis). Corrected identity is only
361
- ### supported if the original BLAST file included sequence lengths.
362
- id.summary=sum,
363
- ### Function summarizing the identity bins. Other recommended options
364
- ### include: `median` to estimate the median instead of total bins, and
365
- ### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
366
- id.cutoff=95,
367
- ### Cutoff of identity metric above which the hits are considered
368
- ### 'in-group'. The 95% identity corresponds to the expectation of
369
- ### ANI<95% within species.
370
- threads=2,
371
- ### Number of threads to use.
372
- verbose=TRUE,
373
- ### Indicates if the function should report the advance.
374
- ...
375
- ### Any additional parameters supported by `plot.enve.RecPlot2`.
376
- ){
377
- # Settings
378
- id.metric <- match.arg(id.metric);
379
-
380
- #Read files
381
- if(verbose) cat("Reading files.\n")
382
- rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
383
- quote="");
384
- lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
385
- quote="", as.is=TRUE);
386
-
387
- # Build matrix
388
- if(verbose) cat("Building counts matrix.\n")
389
- if(id.metric=="corrected identity" & ncol(rec)<6){
390
- stop("Requesting corr. identity, but .rec file doesn't have 6th column")
391
- }
392
- rec.idcol <- ifelse(id.metric=="identity", 3,
393
- ifelse(id.metric=="corrected identity", 6, 4));
394
- if(length(pos.breaks)==1){
395
- if(pos.breaks>0){
396
- pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
397
- }else{
398
- pos.breaks <- c(lim[1,2], lim[,3])
399
- }
400
- }
401
- if(length(id.breaks)==1){
402
- id.range.v <- rec[,rec.idcol]
403
- if(!id.free.range) id.range.v <- c(id.range.v,70,100)
404
- id.range.v <- range(id.range.v)
405
- id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
406
- }
407
-
408
- # Run in parallel
409
- if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
410
- cl <- makeCluster(threads)
411
- rec.l <- list()
412
- thl <- ceiling(nrow(rec)/threads)
413
- for(i in 0:(threads-1)){
414
- rec.l[[i+1]] <- list(rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
415
- verbose=ifelse(i==0, verbose, FALSE))
416
- }
417
- counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
418
- pos.breaks=pos.breaks, id.breaks=id.breaks,
419
- rec.idcol=rec.idcol)
420
- counts <- counts.l[[1]]
421
- if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
422
- stopCluster(cl)
423
-
424
- # Estimate 1D histograms
425
- if(verbose) cat("Building histograms.\n")
426
- id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
427
- id.ingroup <- (id.mids > id.cutoff);
428
- id.counts <- apply(counts, 2, id.summary);
429
- pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
430
- pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
356
+ ### Produces recruitment plots provided that BlastTab.catsbj.pl has
357
+ ### been previously executed.
358
+ prefix,
359
+ ### Path to the prefix of the BlastTab.catsbj.pl output files. At
360
+ ### least the files .rec and .lim must exist with this prefix.
361
+ plot=TRUE,
362
+ ### Should the object be plotted?
363
+ pos.breaks=1e3,
364
+ ### Breaks in the positions histogram. It can also be a vector of break
365
+ ### points, and values outside the range are ignored. If zero (0), it
366
+ ### uses the sequence breaks as defined in the .lim file, which means
367
+ ### one bin per contig (or gene, if the mapping is against genes). Ignored
368
+ ### if `pos.breaks.tsv` is passed.
369
+ pos.breaks.tsv=NA,
370
+ ### Path to a list of (absolute) coordinates to use as position breaks.
371
+ ### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
372
+ ### must contain at least one column: coordinates of the break positions of
373
+ ### each position bin. If it has a second column, this is used as the name
374
+ ### of the position bin that ends at the given coordinate (the first row is
375
+ ### ignored). Any additional columns are currently ignored. If NA,
376
+ ### position bins are determined by `pos.breaks`.
377
+ id.breaks=300,
378
+ ### Breaks in the identity histogram. It can also be a vector of break
379
+ ### points, and values outside the range are ignored.
380
+ id.free.range=FALSE,
381
+ ### Indicates that the range should be freely set from the observed
382
+ ### values. Otherwise, 70-100% is included in the identity histogram
383
+ ### (default).
384
+ id.metric=c('identity', 'corrected identity', 'bit score'),
385
+ ### Metric of identity to be used (Y-axis). Corrected identity is only
386
+ ### supported if the original BLAST file included sequence lengths.
387
+ id.summary=sum,
388
+ ### Function summarizing the identity bins. Other recommended options
389
+ ### include: `median` to estimate the median instead of total bins, and
390
+ ### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
391
+ id.cutoff=95,
392
+ ### Cutoff of identity metric above which the hits are considered
393
+ ### 'in-group'. The 95% identity corresponds to the expectation of
394
+ ### ANI<95% within species.
395
+ threads=2,
396
+ ### Number of threads to use.
397
+ verbose=TRUE,
398
+ ### Indicates if the function should report the advance.
399
+ ...
400
+ ### Any additional parameters supported by `plot.enve.RecPlot2`.
401
+ ){
402
+ # Settings
403
+ id.metric <- match.arg(id.metric);
431
404
 
432
- # Plot and return
433
- recplot <- new('enve.RecPlot2',
434
- counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
435
- pos.counts.out=pos.counts.out,
436
- id.breaks=id.breaks, pos.breaks=pos.breaks,
437
- seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
438
- id.ingroup=id.ingroup,id.metric=id.metric,
439
- call=match.call());
440
- if(plot){
441
- if(verbose) cat("Plotting.\n")
442
- peaks <- plot(recplot, ...);
443
- attr(recplot, "peaks") <- peaks
444
- }
445
- return(recplot);
446
- ### Returns an object of class `enve.RecPlot2`.
405
+ #Read files
406
+ if(verbose) cat("Reading files.\n")
407
+ rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
408
+ quote="");
409
+ lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
410
+ quote="", as.is=TRUE);
411
+
412
+ # Build matrix
413
+ if(verbose) cat("Building counts matrix.\n")
414
+ if(id.metric=="corrected identity" & ncol(rec)<6){
415
+ stop("Requesting corr. identity, but .rec file doesn't have 6th column")
416
+ }
417
+ rec.idcol <- ifelse(id.metric=="identity", 3,
418
+ ifelse(id.metric=="corrected identity", 6, 4))
419
+ pos.names <- as.character(NULL)
420
+ if(!is.na(pos.breaks.tsv)){
421
+ tmp <- read.table(pos.breaks.tsv, sep='\t', header=FALSE, as.is=TRUE)
422
+ pos.breaks <- as.numeric(tmp[,1])
423
+ if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
424
+ }else if(length(pos.breaks)==1){
425
+ if(pos.breaks>0){
426
+ pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
427
+ }else{
428
+ pos.breaks <- c(lim[1,2], lim[,3])
429
+ pos.names <- lim[,1]
430
+ }
431
+ }
432
+ if(length(id.breaks)==1){
433
+ id.range.v <- rec[,rec.idcol]
434
+ if(!id.free.range) id.range.v <- c(id.range.v,70,100)
435
+ id.range.v <- range(id.range.v)
436
+ id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
437
+ }
438
+
439
+ # Run in parallel
440
+ if(nrow(rec) < 200) threads <- 1 # It isn't worth the overhead
441
+ cl <- makeCluster(threads)
442
+ rec.l <- list()
443
+ thl <- ceiling(nrow(rec)/threads)
444
+ for(i in 0:(threads-1)){
445
+ rec.l[[i+1]] <- list(
446
+ rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
447
+ verbose=ifelse(i==0, verbose, FALSE))
448
+ }
449
+ counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
450
+ pos.breaks=pos.breaks, id.breaks=id.breaks,
451
+ rec.idcol=rec.idcol)
452
+ counts <- counts.l[[1]]
453
+ if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
454
+ stopCluster(cl)
455
+
456
+ # Estimate 1D histograms
457
+ if(verbose) cat("Building histograms.\n")
458
+ id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
459
+ id.ingroup <- (id.mids > id.cutoff);
460
+ id.counts <- apply(counts, 2, id.summary);
461
+ pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
462
+ pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
463
+
464
+ # Plot and return
465
+ recplot <- new('enve.RecPlot2',
466
+ counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
467
+ pos.counts.out=pos.counts.out,
468
+ id.breaks=id.breaks, pos.breaks=pos.breaks, pos.names=pos.names,
469
+ seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
470
+ id.ingroup=id.ingroup,id.metric=id.metric,
471
+ call=match.call());
472
+ if(plot){
473
+ if(verbose) cat("Plotting.\n")
474
+ peaks <- plot(recplot, ...);
475
+ attr(recplot, "peaks") <- peaks
476
+ }
477
+ return(recplot);
478
+ ### Returns an object of class `enve.RecPlot2`.
447
479
  }
448
480
 
449
481
  enve.recplot2.findPeaks <- function(
@@ -502,9 +534,11 @@ enve.recplot2.findPeaks.emauto <- function(
502
534
  stop('Invalid criterion ', criterion)
503
535
  }
504
536
  for(comp in components){
537
+ if(verbose) cat('Testing:',comp,'\n')
505
538
  best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
506
539
  verbose, ...)
507
540
  }
541
+ if(length(best[['peaks']])==0) return(list())
508
542
 
509
543
  seqdepths.r <- signif(log(sapply(best[['peaks']],
510
544
  function(x) x$seq.depth)), merge.tol)
@@ -609,7 +643,7 @@ enve.recplot2.findPeaks.mower <- function(
609
643
  ### Range of quantiles to be used in the estimation of a peak's
610
644
  ### parameters.
611
645
  mlv.opts=list(method='parzen'),
612
- ### Options passed to `mlv` to estimate the mode.
646
+ ### Ignored. For backwards compatibility.
613
647
  fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
614
648
  start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
615
649
  ### Options passed to `fitdist` to estimate the standard deviation if
@@ -758,53 +792,75 @@ enve.recplot2.changeCutoff <- function
758
792
  return(rp)
759
793
  }
760
794
 
761
- enve.recplot2.extractWindows <- function
762
- ### Extract windows significantly below (or above) the peak in sequencing
763
- ### depth.
764
- (rp,
765
- ### Recruitment plot, a enve.RecPlot2 object.
766
- peak,
767
- ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
768
- ### list of enve.RecPlot2.Peak objects, in which case the core peak is
769
- ### used (see `enve.recplot2.corePeak`).
770
- lower.tail=TRUE,
771
- ### If FALSE, it returns windows significantly above the peak in
772
- ### sequencing depth.
773
- significance=0.05,
774
- ### Significance threshold (alpha) to select windows.
775
- seq.names=FALSE
776
- ### Returns subject sequence names instead of a vector of Booleans. If
777
- ### the recruitment plot was generated with pos.breaks=0 it returns a
778
- ### vector of characters (the sequence identifiers), otherwise it returns
779
- ### a data.frame with a name column and two columns of coordinates.
780
- ){
781
- # Determine the threshold
795
+ enve.recplot2.windowDepthThreshold <- function
796
+ ### Identifies the threshold below which windows should be identified as
797
+ ### variable or absent.
798
+ (rp,
799
+ ### Recruitment plot, an `enve.RecPlot2` object.
800
+ peak,
801
+ ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
802
+ ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
803
+ ### used (see `enve.recplot2.corePeak`).
804
+ lower.tail=TRUE,
805
+ ### If FALSE, it returns the threshold for windows significantly above
806
+ ### the peak in sequencing depth.
807
+ significance=0.05
808
+ ### Significance threshold (alpha) to select windows.
809
+ ){
782
810
  if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
783
811
  par <- peak$param.hat
784
812
  par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
785
813
  thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
786
814
  if(peak$log) thr <- exp(thr)
787
-
788
- # Select windows past the threshold
789
- seqdepth.in <- enve.recplot2.seqdepth(rp)
790
- if(lower.tail){
791
- sel <- seqdepth.in < thr
792
- }else{
793
- sel <- seqdepth.in > thr
794
- }
795
-
796
- # seq.names=FALSE
797
- if(!seq.names) return(sel)
798
- # seq.names=TRUE and pos.breaks=0
799
- if(length(rp$pos.breaks)==length(rp$seq.breaks) &&
800
- rp$pos.breaks==rp$seq.breaks)
801
- return(rp$seq.names[sel])
802
- # seq.names=TRUE and pos.breaks!=0
803
- return(enve.recplot2.coordinates(rp,sel))
804
- ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
805
- ### it returns a vector of characters if the object was built with
806
- ### `pos.breaks=0` or a data.frame with four columns otherwise: name.from,
807
- ### name.to, pos.from, and pos.to (see `enve.recplot2.coordinates`).
815
+
816
+ return(thr)
817
+ ### Returns a float. The threshold is expressed in linear depth units: if
818
+ ### the peak was estimated in log scale (`peak$log`), it is exponentiated.
819
+ }
820
+
821
+ enve.recplot2.extractWindows <- function
822
+ ### Extract windows significantly below (or above) the peak in sequencing
823
+ ### depth.
824
+ (rp,
825
+ ### Recruitment plot, an `enve.RecPlot2` object.
826
+ peak,
827
+ ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
828
+ ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
829
+ ### used (see `enve.recplot2.corePeak`).
830
+ lower.tail=TRUE,
831
+ ### If FALSE, it returns windows significantly above the peak in
832
+ ### sequencing depth.
833
+ significance=0.05,
834
+ ### Significance threshold (alpha) to select windows.
835
+ seq.names=FALSE
836
+ ### Returns subject sequence names instead of a vector of Booleans. If
837
+ ### the recruitment plot was generated with named position bins (e.g., using
838
+ ### `pos.breaks`=0 or a two-column `pos.breaks.tsv`), it returns a vector of
839
+ ### characters (the sequence identifiers), otherwise it returns a data.frame
840
+ ### with a name column and two columns of coordinates.
841
+ ){
842
+ # Determine the threshold
843
+ thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
844
+
845
+ # Select windows past the threshold
846
+ seqdepth.in <- enve.recplot2.seqdepth(rp)
847
+ if(lower.tail){
848
+ sel <- seqdepth.in < thr
849
+ }else{
850
+ sel <- seqdepth.in > thr
851
+ }
852
+
853
+ # seq.names=FALSE
854
+ if(!seq.names) return(sel)
855
+ # seq.names=TRUE and pos.names defined
856
+ if(length(rp$pos.names) != 0) return(rp$pos.names[sel])
857
+ # seq.names=TRUE and pos.names undefined
858
+ return(enve.recplot2.coordinates(rp,sel))
859
+ ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
860
+ ### it returns a vector of characters if the object has `pos.names` defined,
861
+ ### or a data.frame with four columns otherwise:
862
+ ### name.from, name.to, pos.from, and pos.to
863
+ ### (see `enve.recplot2.coordinates`).
808
864
  }
809
865
 
810
866
  enve.recplot2.compareIdentities <- function
@@ -931,7 +987,11 @@ enve.recplot2.seqdepth <- function
931
987
  ){
932
988
  if(!inherits(x, "enve.RecPlot2"))
933
989
  stop("'x' must inherit from class `enve.RecPlot2`")
934
- pos.cnts.in <- x$pos.counts.in
990
+ if(low.identity){
991
+ pos.cnts.in <- x$pos.counts.out
992
+ }else{
993
+ pos.cnts.in <- x$pos.counts.in
994
+ }
935
995
  pos.breaks <- x$pos.breaks
936
996
  pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
937
997
  seqdepth.in <- pos.cnts.in/pos.binsize
@@ -987,6 +1047,7 @@ enve.recplot2.findPeaks.__emauto_one <- function
987
1047
  ### Internal ancillary function (see `enve.recplot2.findPeaks.emauto`).
988
1048
  (x, comp, do_crit, best, verbose, ...){
989
1049
  peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
1050
+ if(length(peaks)==0) return(best)
990
1051
  k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
991
1052
  crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
992
1053
  if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
@@ -1049,7 +1110,7 @@ enve.recplot2.findPeaks.__mow_one <- function
1049
1110
 
1050
1111
  # Find peak
1051
1112
  o <- mlv.opts; o$x = lsd1;
1052
- mode1 <- do.call(mlv, o)$M;
1113
+ mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1053
1114
  if(verbose) cat('Anchoring at mode =',mode1,'\n')
1054
1115
  param.hat <- fitdist.opts$start; last.hat <- param.hat;
1055
1116
  lim <- NA;
@@ -7,10 +7,28 @@ enve.col.alpha <- function
7
7
  ### such as 'darkred' or '#009988'.
8
8
  alpha=1/2
9
9
  ### Alpha value to add to the color, from 0 to 1.
10
- ){
10
+ ){
11
11
  return(
12
12
  apply(col2rgb(col), 2,
13
13
  function(x) do.call(rgb, as.list(c(x[1:3]/256, alpha))) ) )
14
14
  ### Returns a color or a vector of colors in hex notation including alpha.
15
15
  }
16
16
 
17
+ enve.truncate <- function
18
+ ### Removes the `n` highest and lowest values from a vector, and applies a
19
+ ### summary function. The value of `n` is determined such that the central
20
+ ### range is used, corresponding to the `f` fraction of values.
21
+ (x,
22
+ ### A vector of numbers.
23
+ f=0.95,
24
+ ### The fraction of values to retain.
25
+ FUN=mean
26
+ ### Summary function to apply to the vectors. To obtain the truncated
27
+ ### vector itself, use `c`.
28
+ ){
29
+ n <- round(length(x)*(1-f)/2)
30
+ y <- sort(x)[ -c(seq(1, n), seq(length(x)+1-n, length(x))) ]
31
+ return(FUN(y))
32
+ ### Returns the summary (`FUN`) of the truncated vector.
33
+ }
34
+
@@ -32,6 +32,7 @@ And open help messages using any of the following commands:
32
32
  ?enve.recplot2.changeCutoff
33
33
  ?enve.recplot2.findPeaks
34
34
  ?enve.recplot2.corePeak
35
+ ?enve.recplot2.windowDepthThreshold
35
36
  ?enve.recplot2.extractWindows
36
37
  ?enve.recplot2.coordinates
37
38
  ?enve.recplot2.seqdepth
@@ -41,6 +42,7 @@ And open help messages using any of the following commands:
41
42
  ?enve.tribs.test
42
43
  ?enve.growthcurve
43
44
  ?enve.col.alpha
45
+ ?enve.truncate
44
46
  ```
45
47
 
46
48
  You can run some examples using these libraries in the
@@ -50,6 +52,15 @@ For additional information on recruitment plots, see the
50
52
  [Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
51
53
 
52
54
  ## Changelog
55
+ * 1.4.4: Removes modeest library as requirement, and switches the mower
56
+ peak-finder initialization to the median (instead of the mode).
57
+ * 1.4.2: Solved bug #36.
58
+ * 1.4.0: New option `pos.breaks.tsv` for `enve.recplot2`.
59
+ * 1.3.4: Gracefully handles and plots recruitment plots with insufficient data
60
+ to find peaks.
61
+ * 1.3.3: New function `enve.recplot2.windowDepthThreshold`.
62
+ * 1.3.2: New option `panel.fun` for `plot.enve.RecPlot2`.
63
+ * 1.3.1: New function enve.truncate.
53
64
  * 1.3: Several bug fixes and new utilities for recruitment plots (recplot2).
54
65
  * 1.1.0: New function enve.growthcurve and related class enve.GrowthCurve
55
66
  with S3 methods plot and summary.
@@ -37,7 +37,7 @@ If non-zero, requires the stats package.}
37
37
  \item{main}{Title of the plot.}
38
38
  \item{contig.col}{Color of the Contig boundaries. Set to NA to ignore Contig boundaries.}
39
39
  \item{ret.recplot}{Indicates if the matrix of the recruitment plot is to be returned.}
40
- \item{ret.hist}{Indicates if the vectors of the identity and position histograms are to be returned.}
40
+ \item{ret.hist}{Ignored, for backwards compatibility.}
41
41
  \item{ret.mode}{Indicates if the mode of the identity is to be computed. It requires the modeest
42
42
  package.}
43
43
  \item{id.cutoff}{Minimum identity to consider an alignment as "top". By default, it is 0.95 for the
@@ -59,7 +59,7 @@ id.mean: Mean identity.
59
59
 
60
60
  id.median: Median identity.
61
61
 
62
- id.mode (if ret.mode=TRUE): Mode of the identity.
62
+ id.mode (if ret.mode=TRUE): Mode of the identity. Deprecated.
63
63
 
64
64
  id.hist (if ret.hist=TRUE): Values of the identity histogram.
65
65
 
@@ -17,6 +17,7 @@ be produced by `enve.recplot2` and supports S4 method plot.}
17
17
  \item{\code{id.counts}:}{(\code{numeric}) Counts per ID bin.}
18
18
  \item{\code{id.breaks}:}{(\code{numeric}) Breaks of identity bins.}
19
19
  \item{\code{pos.breaks}:}{(\code{numeric}) Breaks of position bins.}
20
+ \item{\code{pos.names}:}{(\code{character}) Names of the position bins.}
20
21
  \item{\code{seq.breaks}:}{(\code{numeric}) Breaks of input sequences.}
21
22
  \item{\code{peaks}:}{(\code{list}) Peaks identified in the recplot.
22
23
  Limits of the subject sequences after concatenation.}
@@ -3,10 +3,10 @@
3
3
  \title{enve recplot2}
4
4
  \description{Produces recruitment plots provided that BlastTab.catsbj.pl has
5
5
  been previously executed.}
6
- \usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000, id.breaks = 300,
7
- id.free.range = FALSE, id.metric = c("identity", "corrected identity",
8
- "bit score"), id.summary = sum, id.cutoff = 95, threads = 2,
9
- verbose = TRUE, ...)}
6
+ \usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000, pos.breaks.tsv = NA,
7
+ id.breaks = 300, id.free.range = FALSE, id.metric = c("identity",
8
+ "corrected identity", "bit score"), id.summary = sum,
9
+ id.cutoff = 95, threads = 2, verbose = TRUE, ...)}
10
10
  \arguments{
11
11
  \item{prefix}{Path to the prefix of the BlastTab.catsbj.pl output files. At
12
12
  least the files .rec and .lim must exist with this prefix.}
@@ -14,7 +14,15 @@ least the files .rec and .lim must exist with this prefix.}
14
14
  \item{pos.breaks}{Breaks in the positions histogram. It can also be a vector of break
15
15
  points, and values outside the range are ignored. If zero (0), it
16
16
  uses the sequence breaks as defined in the .lim file, which means
17
- one bin per contig (or gene, if the mapping is agains genes).}
17
+ one bin per contig (or gene, if the mapping is against genes). Ignored
18
+ if `pos.breaks.tsv` is passed.}
19
+ \item{pos.breaks.tsv}{Path to a list of (absolute) coordinates to use as position breaks.
20
+ This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
21
+ must contain at least one column: coordinates of the break positions of
22
+ each position bin. If it has a second column, this is used as the name
23
+ of the position bin that ends at the given coordinate (the first row is
24
+ ignored). Any additional columns are currently ignored. If NA,
25
+ position bins are determined by `pos.breaks`.}
18
26
  \item{id.breaks}{Breaks in the identity histogram. It can also be a vector of break
19
27
  points, and values outside the range are ignored.}
20
28
  \item{id.free.range}{Indicates that the range should be freely set from the observed