miga-base 0.3.9.0 → 0.3.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/actions/add.rb +33 -33
- data/actions/edit.rb +33 -0
- data/actions/new.rb +17 -18
- data/actions/next_step.rb +33 -0
- data/actions/run.rb +15 -12
- data/bin/miga +43 -37
- data/lib/miga/daemon.rb +2 -2
- data/lib/miga/project/result.rb +16 -1
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +1 -3
- data/scripts/ani_distances.bash +1 -3
- data/scripts/assembly.bash +1 -3
- data/scripts/cds.bash +1 -3
- data/scripts/clade_finding.bash +1 -3
- data/scripts/d.bash +13 -0
- data/scripts/distances.bash +1 -3
- data/scripts/essential_genes.bash +1 -3
- data/scripts/haai_distances.bash +1 -3
- data/scripts/miga.bash +12 -9
- data/scripts/mytaxa.bash +1 -3
- data/scripts/mytaxa_scan.bash +1 -3
- data/scripts/ogs.bash +36 -33
- data/scripts/p.bash +23 -0
- data/scripts/project_stats.bash +1 -3
- data/scripts/read_quality.bash +1 -3
- data/scripts/ssu.bash +1 -3
- data/scripts/stats.bash +1 -3
- data/scripts/subclades.bash +1 -3
- data/scripts/taxonomy.bash +1 -3
- data/scripts/trimmed_fasta.bash +1 -3
- data/scripts/trimmed_reads.bash +1 -3
- data/test/daemon_test.rb +3 -3
- data/utils/distance/runner.rb +1 -1
- data/utils/enveomics/Docs/recplot2.md +13 -2
- data/utils/enveomics/Examples/aai-matrix.bash +3 -3
- data/utils/enveomics/Examples/ani-matrix.bash +3 -3
- data/utils/enveomics/Makefile +2 -2
- data/utils/enveomics/Manifest/Tasks/blasttab.json +12 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +135 -0
- data/utils/enveomics/Manifest/Tasks/other.json +49 -0
- data/utils/enveomics/Manifest/categories.json +4 -0
- data/utils/enveomics/Manifest/examples.json +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +63 -65
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +4 -2
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/aai.rb +4 -3
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +3 -3
- data/utils/enveomics/enveomics.R/R/recplot.R +2 -3
- data/utils/enveomics/enveomics.R/R/recplot2.R +221 -160
- data/utils/enveomics/enveomics.R/R/utils.R +19 -1
- data/utils/enveomics/enveomics.R/README.md +11 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +2 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +10 -8
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +13 -7
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +3 -4
- data/utils/subclade/runner.rb +4 -0
- metadata +14 -3
@@ -9,6 +9,7 @@ setClass("enve.RecPlot2",
|
|
9
9
|
id.counts='numeric', ##<< Counts per ID bin.
|
10
10
|
id.breaks='numeric', ##<< Breaks of identity bins.
|
11
11
|
pos.breaks='numeric', ##<< Breaks of position bins.
|
12
|
+
pos.names='character', ##<< Names of the position bins.
|
12
13
|
seq.breaks='numeric', ##<< Breaks of input sequences.
|
13
14
|
peaks='list', ##<< Peaks identified in the recplot.
|
14
15
|
### Limits of the subject sequences after concatenation.
|
@@ -70,10 +71,17 @@ plot.enve.RecPlot2 <- function
|
|
70
71
|
### 3: identity histogram,
|
71
72
|
### 4: Populations histogram (histogram of sequencing depths),
|
72
73
|
### 5: Color scale for the counts matrix (vertical),
|
73
|
-
### 6: Color scale of the counts
|
74
|
-
###
|
75
|
-
###
|
76
|
-
|
74
|
+
### 6: Color scale of the counts matrix (horizontal)
|
75
|
+
### Only panels indicated here will be plotted. To plot only one panel
|
76
|
+
### simply set this to the number of the panel you want to plot.
|
77
|
+
panel.fun=list(),
|
78
|
+
### List of functions to be executed after drawing each panel. Use the
|
79
|
+
### indices in `layout` (as characters) as keys. Functions for indices
|
80
|
+
### missing in `layout` are ignored. For example, to add a vertical line
|
81
|
+
### at the 3Mbp mark in both the position histogram and the counts matrix:
|
82
|
+
### `list('1'=function() abline(v=3), '2'=function() abline(v=3))`.
|
83
|
+
### Note that the X-axis in both panels is in Mbp by default. To change
|
84
|
+
### this behavior, set `pos.units` accordingly.
|
77
85
|
widths=c(1,7,2),
|
78
86
|
### Relative widths of the columns of `layout`.
|
79
87
|
heights=c(1,2),
|
@@ -166,7 +174,7 @@ plot.enve.RecPlot2 <- function
|
|
166
174
|
list(maxColorValue=256, alpha=52)));
|
167
175
|
}
|
168
176
|
|
169
|
-
# Counts matrix
|
177
|
+
# [1] Counts matrix
|
170
178
|
if(any(layout==1)){
|
171
179
|
par(mar=mar[['1']]);
|
172
180
|
plot(1, t='n', bty='l',
|
@@ -182,9 +190,10 @@ plot.enve.RecPlot2 <- function
|
|
182
190
|
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
183
191
|
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
184
192
|
length.out=1+length(palette)), add=TRUE);
|
193
|
+
if(exists('1',panel.fun)) panel.fun[['1']]();
|
185
194
|
}
|
186
195
|
|
187
|
-
# Position histogram
|
196
|
+
# [2] Position histogram
|
188
197
|
if(any(layout==2)){
|
189
198
|
par(mar=mar[['2']]);
|
190
199
|
if(any(layout==1)){
|
@@ -213,9 +222,10 @@ plot.enve.RecPlot2 <- function
|
|
213
222
|
if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
|
214
223
|
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
|
215
224
|
seqdepth.lim[1]*3/2, col=in.col, border=NA);
|
225
|
+
if(exists('2',panel.fun)) panel.fun[['2']]();
|
216
226
|
}
|
217
227
|
|
218
|
-
# Identity histogram
|
228
|
+
# [3] Identity histogram
|
219
229
|
if(any(layout==3)){
|
220
230
|
par(mar=mar[['3']]);
|
221
231
|
if(any(layout==1)){
|
@@ -248,9 +258,10 @@ plot.enve.RecPlot2 <- function
|
|
248
258
|
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
249
259
|
text(1,1,labels='Insufficient data', srt=90)
|
250
260
|
}
|
261
|
+
if(exists('3',panel.fun)) panel.fun[['3']]();
|
251
262
|
}
|
252
263
|
|
253
|
-
# Populations histogram
|
264
|
+
# [4] Populations histogram
|
254
265
|
peaks <- NA;
|
255
266
|
if(any(layout==4)){
|
256
267
|
par(mar=mar[['4']]);
|
@@ -308,9 +319,10 @@ plot.enve.RecPlot2 <- function
|
|
308
319
|
dpt,'X (', frx, '%', err, ')', sep=''))
|
309
320
|
}
|
310
321
|
}
|
322
|
+
if(exists('4',panel.fun)) panel.fun[['4']]();
|
311
323
|
}
|
312
324
|
|
313
|
-
# Color scale
|
325
|
+
# [5] Color scale of the counts matrix (vertical)
|
314
326
|
count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
|
315
327
|
length.out=1+length(palette))
|
316
328
|
if(any(layout==5)){
|
@@ -319,13 +331,17 @@ plot.enve.RecPlot2 <- function
|
|
319
331
|
ylim=range(count.bins), yaxs='i', ylab='')
|
320
332
|
rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
|
321
333
|
border=NA)
|
334
|
+
if(exists('5',panel.fun)) panel.fun[['5']]();
|
322
335
|
}
|
336
|
+
|
337
|
+
# [6] Color scale of the counts matrix (horizontal)
|
323
338
|
if(any(layout==6)){
|
324
339
|
par(mar=mar[['6']]);
|
325
340
|
plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
|
326
341
|
xlim=range(count.bins), xaxs='i',xlab='');
|
327
342
|
rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
|
328
343
|
border=NA);
|
344
|
+
if(exists('6',panel.fun)) panel.fun[['6']]();
|
329
345
|
}
|
330
346
|
|
331
347
|
par(mar=ori.mar);
|
@@ -337,113 +353,129 @@ plot.enve.RecPlot2 <- function
|
|
337
353
|
|
338
354
|
#==============> Define core functions
|
339
355
|
enve.recplot2 <- function(
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
if(verbose) cat("Building counts matrix.\n")
|
389
|
-
if(id.metric=="corrected identity" & ncol(rec)<6){
|
390
|
-
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
391
|
-
}
|
392
|
-
rec.idcol <- ifelse(id.metric=="identity", 3,
|
393
|
-
ifelse(id.metric=="corrected identity", 6, 4));
|
394
|
-
if(length(pos.breaks)==1){
|
395
|
-
if(pos.breaks>0){
|
396
|
-
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
|
397
|
-
}else{
|
398
|
-
pos.breaks <- c(lim[1,2], lim[,3])
|
399
|
-
}
|
400
|
-
}
|
401
|
-
if(length(id.breaks)==1){
|
402
|
-
id.range.v <- rec[,rec.idcol]
|
403
|
-
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
404
|
-
id.range.v <- range(id.range.v)
|
405
|
-
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
|
406
|
-
}
|
407
|
-
|
408
|
-
# Run in parallel
|
409
|
-
if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
|
410
|
-
cl <- makeCluster(threads)
|
411
|
-
rec.l <- list()
|
412
|
-
thl <- ceiling(nrow(rec)/threads)
|
413
|
-
for(i in 0:(threads-1)){
|
414
|
-
rec.l[[i+1]] <- list(rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
|
415
|
-
verbose=ifelse(i==0, verbose, FALSE))
|
416
|
-
}
|
417
|
-
counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
|
418
|
-
pos.breaks=pos.breaks, id.breaks=id.breaks,
|
419
|
-
rec.idcol=rec.idcol)
|
420
|
-
counts <- counts.l[[1]]
|
421
|
-
if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
|
422
|
-
stopCluster(cl)
|
423
|
-
|
424
|
-
# Estimate 1D histograms
|
425
|
-
if(verbose) cat("Building histograms.\n")
|
426
|
-
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
|
427
|
-
id.ingroup <- (id.mids > id.cutoff);
|
428
|
-
id.counts <- apply(counts, 2, id.summary);
|
429
|
-
pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
|
430
|
-
pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
|
356
|
+
### Produces recruitment plots provided that BlastTab.catsbj.pl has
|
357
|
+
### been previously executed.
|
358
|
+
prefix,
|
359
|
+
### Path to the prefix of the BlastTab.catsbj.pl output files. At
|
360
|
+
### least the files .rec and .lim must exist with this prefix.
|
361
|
+
plot=TRUE,
|
362
|
+
### Should the object be plotted?
|
363
|
+
pos.breaks=1e3,
|
364
|
+
### Breaks in the positions histogram. It can also be a vector of break
|
365
|
+
### points, and values outside the range are ignored. If zero (0), it
|
366
|
+
### uses the sequence breaks as defined in the .lim file, which means
|
367
|
+
### one bin per contig (or gene, if the mapping is against genes). Ignored
|
368
|
+
### if `pos.breaks.tsv` is passed.
|
369
|
+
pos.breaks.tsv=NA,
|
370
|
+
### Path to a list of (absolute) coordinates to use as position breaks.
|
371
|
+
### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
|
372
|
+
### must contain at least one column: coordinates of the break positions of
|
373
|
+
### each position bin. If it has a second column, this is used as the name
|
374
|
+
### of the position bin that ends at the given coordinate (the first row is
|
375
|
+
### ignored). Any additional columns are currently ignored. If NA,
|
376
|
+
### position bins are determined by `pos.breaks`.
|
377
|
+
id.breaks=300,
|
378
|
+
### Breaks in the identity histogram. It can also be a vector of break
|
379
|
+
### points, and values outside the range are ignored.
|
380
|
+
id.free.range=FALSE,
|
381
|
+
### Indicates that the range should be freely set from the observed
|
382
|
+
### values. Otherwise, 70-100% is included in the identity histogram
|
383
|
+
### (default).
|
384
|
+
id.metric=c('identity', 'corrected identity', 'bit score'),
|
385
|
+
### Metric of identity to be used (Y-axis). Corrected identity is only
|
386
|
+
### supported if the original BLAST file included sequence lengths.
|
387
|
+
id.summary=sum,
|
388
|
+
### Function summarizing the identity bins. Other recommended options
|
389
|
+
### include: `median` to estimate the median instead of total bins, and
|
390
|
+
### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
|
391
|
+
id.cutoff=95,
|
392
|
+
### Cutoff of identity metric above which the hits are considered
|
393
|
+
### 'in-group'. The 95% identity corresponds to the expectation of
|
394
|
+
### ANI>95% within species.
|
395
|
+
threads=2,
|
396
|
+
### Number of threads to use.
|
397
|
+
verbose=TRUE,
|
398
|
+
### Indicates if the function should report the advance.
|
399
|
+
...
|
400
|
+
### Any additional parameters supported by `plot.enve.RecPlot2`.
|
401
|
+
){
|
402
|
+
# Settings
|
403
|
+
id.metric <- match.arg(id.metric);
|
431
404
|
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
405
|
+
#Read files
|
406
|
+
if(verbose) cat("Reading files.\n")
|
407
|
+
rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
|
408
|
+
quote="");
|
409
|
+
lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
|
410
|
+
quote="", as.is=TRUE);
|
411
|
+
|
412
|
+
# Build matrix
|
413
|
+
if(verbose) cat("Building counts matrix.\n")
|
414
|
+
if(id.metric=="corrected identity" & ncol(rec)<6){
|
415
|
+
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
416
|
+
}
|
417
|
+
rec.idcol <- ifelse(id.metric=="identity", 3,
|
418
|
+
ifelse(id.metric=="corrected identity", 6, 4))
|
419
|
+
pos.names <- as.character(NULL)
|
420
|
+
if(!is.na(pos.breaks.tsv)){
|
421
|
+
tmp <- read.table(pos.breaks.tsv, sep='\t', header=FALSE, as.is=TRUE)
|
422
|
+
pos.breaks <- as.numeric(tmp[,1])
|
423
|
+
if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
|
424
|
+
}else if(length(pos.breaks)==1){
|
425
|
+
if(pos.breaks>0){
|
426
|
+
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
|
427
|
+
}else{
|
428
|
+
pos.breaks <- c(lim[1,2], lim[,3])
|
429
|
+
pos.names <- lim[,1]
|
430
|
+
}
|
431
|
+
}
|
432
|
+
if(length(id.breaks)==1){
|
433
|
+
id.range.v <- rec[,rec.idcol]
|
434
|
+
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
435
|
+
id.range.v <- range(id.range.v)
|
436
|
+
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
|
437
|
+
}
|
438
|
+
|
439
|
+
# Run in parallel
|
440
|
+
if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
|
441
|
+
cl <- makeCluster(threads)
|
442
|
+
rec.l <- list()
|
443
|
+
thl <- ceiling(nrow(rec)/threads)
|
444
|
+
for(i in 0:(threads-1)){
|
445
|
+
rec.l[[i+1]] <- list(
|
446
|
+
rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
|
447
|
+
verbose=ifelse(i==0, verbose, FALSE))
|
448
|
+
}
|
449
|
+
counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
|
450
|
+
pos.breaks=pos.breaks, id.breaks=id.breaks,
|
451
|
+
rec.idcol=rec.idcol)
|
452
|
+
counts <- counts.l[[1]]
|
453
|
+
if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
|
454
|
+
stopCluster(cl)
|
455
|
+
|
456
|
+
# Estimate 1D histograms
|
457
|
+
if(verbose) cat("Building histograms.\n")
|
458
|
+
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
|
459
|
+
id.ingroup <- (id.mids > id.cutoff);
|
460
|
+
id.counts <- apply(counts, 2, id.summary);
|
461
|
+
pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
|
462
|
+
pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
|
463
|
+
|
464
|
+
# Plot and return
|
465
|
+
recplot <- new('enve.RecPlot2',
|
466
|
+
counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
|
467
|
+
pos.counts.out=pos.counts.out,
|
468
|
+
id.breaks=id.breaks, pos.breaks=pos.breaks, pos.names=pos.names,
|
469
|
+
seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
|
470
|
+
id.ingroup=id.ingroup,id.metric=id.metric,
|
471
|
+
call=match.call());
|
472
|
+
if(plot){
|
473
|
+
if(verbose) cat("Plotting.\n")
|
474
|
+
peaks <- plot(recplot, ...);
|
475
|
+
attr(recplot, "peaks") <- peaks
|
476
|
+
}
|
477
|
+
return(recplot);
|
478
|
+
### Returns an object of class `enve.RecPlot2`.
|
447
479
|
}
|
448
480
|
|
449
481
|
enve.recplot2.findPeaks <- function(
|
@@ -502,9 +534,11 @@ enve.recplot2.findPeaks.emauto <- function(
|
|
502
534
|
stop('Invalid criterion ', criterion)
|
503
535
|
}
|
504
536
|
for(comp in components){
|
537
|
+
if(verbose) cat('Testing:',comp,'\n')
|
505
538
|
best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
|
506
539
|
verbose, ...)
|
507
540
|
}
|
541
|
+
if(length(best[['peaks']])==0) return(list())
|
508
542
|
|
509
543
|
seqdepths.r <- signif(log(sapply(best[['peaks']],
|
510
544
|
function(x) x$seq.depth)), merge.tol)
|
@@ -609,7 +643,7 @@ enve.recplot2.findPeaks.mower <- function(
|
|
609
643
|
### Range of quantiles to be used in the estimation of a peak's
|
610
644
|
### parameters.
|
611
645
|
mlv.opts=list(method='parzen'),
|
612
|
-
###
|
646
|
+
### Ignored. For backwards compatibility.
|
613
647
|
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
614
648
|
start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
|
615
649
|
### Options passed to `fitdist` to estimate the standard deviation if
|
@@ -758,53 +792,75 @@ enve.recplot2.changeCutoff <- function
|
|
758
792
|
return(rp)
|
759
793
|
}
|
760
794
|
|
761
|
-
enve.recplot2.
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
### Returns subject sequence names instead of a vector of Booleans. If
|
777
|
-
### the recruitment plot was generated with pos.breaks=0 it returns a
|
778
|
-
### vector of characters (the sequence identifiers), otherwise it returns
|
779
|
-
### a data.frame with a name column and two columns of coordinates.
|
780
|
-
){
|
781
|
-
# Determine the threshold
|
795
|
+
enve.recplot2.windowDepthThreshold <- function
|
796
|
+
### Identifies the threshold below which windows should be identified as
|
797
|
+
### variable or absent.
|
798
|
+
(rp,
|
799
|
+
### Recruitment plot, an `enve.RecPlot2` object.
|
800
|
+
peak,
|
801
|
+
### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
|
802
|
+
### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
|
803
|
+
### used (see `enve.recplot2.corePeak`).
|
804
|
+
lower.tail=TRUE,
|
805
|
+
### If FALSE, it returns windows significantly above the peak in
|
806
|
+
### sequencing depth.
|
807
|
+
significance=0.05
|
808
|
+
### Significance threshold (alpha) to select windows.
|
809
|
+
){
|
782
810
|
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
783
811
|
par <- peak$param.hat
|
784
812
|
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
785
813
|
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
786
814
|
if(peak$log) thr <- exp(thr)
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
815
|
+
|
816
|
+
return(thr)
|
817
|
+
### Returns a float. The units are depth if the peaks were estimated in
|
818
|
+
### linear scale, or log-depth otherwise (`peak$log`).
|
819
|
+
}
|
820
|
+
|
821
|
+
enve.recplot2.extractWindows <- function
|
822
|
+
### Extract windows significantly below (or above) the peak in sequencing
|
823
|
+
### depth.
|
824
|
+
(rp,
|
825
|
+
### Recruitment plot, an `enve.RecPlot2` object.
|
826
|
+
peak,
|
827
|
+
### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
|
828
|
+
### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
|
829
|
+
### used (see `enve.recplot2.corePeak`).
|
830
|
+
lower.tail=TRUE,
|
831
|
+
### If FALSE, it returns windows significantly above the peak in
|
832
|
+
### sequencing depth.
|
833
|
+
significance=0.05,
|
834
|
+
### Significance threshold (alpha) to select windows.
|
835
|
+
seq.names=FALSE
|
836
|
+
### Returns subject sequence names instead of a vector of Booleans. If
|
837
|
+
### the recruitment plot was generated with named position bins (e.g., using
|
838
|
+
### `pos.breaks`=0 or a two-column `pos.breaks.tsv`), it returns a vector of
|
839
|
+
### characters (the sequence identifiers), otherwise it returns a data.frame
|
840
|
+
### with a name column and two columns of coordinates.
|
841
|
+
){
|
842
|
+
# Determine the threshold
|
843
|
+
thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
|
844
|
+
|
845
|
+
# Select windows past the threshold
|
846
|
+
seqdepth.in <- enve.recplot2.seqdepth(rp)
|
847
|
+
if(lower.tail){
|
848
|
+
sel <- seqdepth.in < thr
|
849
|
+
}else{
|
850
|
+
sel <- seqdepth.in > thr
|
851
|
+
}
|
852
|
+
|
853
|
+
# seq.names=FALSE
|
854
|
+
if(!seq.names) return(sel)
|
855
|
+
# seq.names=TRUE and pos.names defined
|
856
|
+
if(length(rp$pos.names) != 0) return(rp$pos.names[sel])
|
857
|
+
# seq.names=TRUE and pos.names undefined
|
858
|
+
return(enve.recplot2.coordinates(rp,sel))
|
859
|
+
### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
|
860
|
+
### it returns a vector of characters if the object has `pos.names` defined,
|
861
|
+
### or a data.frame with four columns otherwise:
|
862
|
+
### name.from, name.to, pos.from, and pos.to
|
863
|
+
### (see `enve.recplot2.coordinates`).
|
808
864
|
}
|
809
865
|
|
810
866
|
enve.recplot2.compareIdentities <- function
|
@@ -931,7 +987,11 @@ enve.recplot2.seqdepth <- function
|
|
931
987
|
){
|
932
988
|
if(!inherits(x, "enve.RecPlot2"))
|
933
989
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
934
|
-
|
990
|
+
if(low.identity){
|
991
|
+
pos.cnts.in <- x$pos.counts.out
|
992
|
+
}else{
|
993
|
+
pos.cnts.in <- x$pos.counts.in
|
994
|
+
}
|
935
995
|
pos.breaks <- x$pos.breaks
|
936
996
|
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
937
997
|
seqdepth.in <- pos.cnts.in/pos.binsize
|
@@ -987,6 +1047,7 @@ enve.recplot2.findPeaks.__emauto_one <- function
|
|
987
1047
|
### Internal ancillary function (see `enve.recplot2.findPeaks.emauto`).
|
988
1048
|
(x, comp, do_crit, best, verbose, ...){
|
989
1049
|
peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
|
1050
|
+
if(length(peaks)==0) return(best)
|
990
1051
|
k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
|
991
1052
|
crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
|
992
1053
|
if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
|
@@ -1049,7 +1110,7 @@ enve.recplot2.findPeaks.__mow_one <- function
|
|
1049
1110
|
|
1050
1111
|
# Find peak
|
1051
1112
|
o <- mlv.opts; o$x = lsd1;
|
1052
|
-
mode1 <- do.call(mlv, o)$M;
|
1113
|
+
mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
|
1053
1114
|
if(verbose) cat('Anchoring at mode =',mode1,'\n')
|
1054
1115
|
param.hat <- fitdist.opts$start; last.hat <- param.hat;
|
1055
1116
|
lim <- NA;
|
@@ -7,10 +7,28 @@ enve.col.alpha <- function
|
|
7
7
|
### such as 'darkred' or '#009988'.
|
8
8
|
alpha=1/2
|
9
9
|
### Alpha value to add to the color, from 0 to 1.
|
10
|
-
|
10
|
+
){
|
11
11
|
return(
|
12
12
|
apply(col2rgb(col), 2,
|
13
13
|
function(x) do.call(rgb, as.list(c(x[1:3]/256, alpha))) ) )
|
14
14
|
### Returns a color or a vector of colors in hex notation including alpha.
|
15
15
|
}
|
16
16
|
|
17
|
+
enve.truncate <- function
|
18
|
+
### Removes the `n` highest and lowest values from a vector, and applies a
|
19
|
+
### summary function. The value of `n` is determined such that the central
|
20
|
+
### range is used, corresponding to the `f` fraction of values.
|
21
|
+
(x,
|
22
|
+
### A vector of numbers.
|
23
|
+
f=0.95,
|
24
|
+
### The fraction of values to retain.
|
25
|
+
FUN=mean
|
26
|
+
### Summary function to apply to the vectors. To obtain the truncated
|
27
|
+
### vector itself, use `c`.
|
28
|
+
){
|
29
|
+
n <- round(length(x)*(1-f)/2)
|
30
|
+
y <- sort(x)[ -c(seq(1, n), seq(length(x)+1-n, length(x))) ]
|
31
|
+
return(FUN(y))
|
32
|
+
### Returns the summary (`FUN`) of the truncated vector.
|
33
|
+
}
|
34
|
+
|
@@ -32,6 +32,7 @@ And open help messages using any of the following commands:
|
|
32
32
|
?enve.recplot2.changeCutoff
|
33
33
|
?enve.recplot2.findPeaks
|
34
34
|
?enve.recplot2.corePeak
|
35
|
+
?enve.recplot2.windowDepthThreshold
|
35
36
|
?enve.recplot2.extractWindows
|
36
37
|
?enve.recplot2.coordinates
|
37
38
|
?enve.recplot2.seqdepth
|
@@ -41,6 +42,7 @@ And open help messages using any of the following commands:
|
|
41
42
|
?enve.tribs.test
|
42
43
|
?enve.growthcurve
|
43
44
|
?enve.col.alpha
|
45
|
+
?enve.truncate
|
44
46
|
```
|
45
47
|
|
46
48
|
You can run some examples using these libraries in the
|
@@ -50,6 +52,15 @@ For additional information on recruitment plots, see the
|
|
50
52
|
[Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
|
51
53
|
|
52
54
|
## Changelog
|
55
|
+
* 1.4.4: Removes modeest library as requirement, and replaces mower peak-finder
|
56
|
+
initialization to median (instead of mode).
|
57
|
+
* 1.4.2: Solved bug #36.
|
58
|
+
* 1.4.0: New option `pos.breaks.tsv` for `enve.recplot2`.
|
59
|
+
* 1.3.4: Gracefully handles and plots recruitment plots with insufficient data
|
60
|
+
to find peaks.
|
61
|
+
* 1.3.3: New function `enve.recplot2.windowDepthThreshold`.
|
62
|
+
* 1.3.2: New option `panel.fun` for `plot.enve.RecPlot2`.
|
63
|
+
* 1.3.1: New function enve.truncate.
|
53
64
|
* 1.3: Several bug fixes and new utilities for recruitment plots (recplot2).
|
54
65
|
* 1.1.0: New function enve.growthcurve and related class enve.GrowthCurve
|
55
66
|
with S3 methods plot and summary.
|
@@ -37,7 +37,7 @@ If non-zero, requires the stats package.}
|
|
37
37
|
\item{main}{Title of the plot.}
|
38
38
|
\item{contig.col}{Color of the Contig boundaries. Set to NA to ignore Contig boundaries.}
|
39
39
|
\item{ret.recplot}{Indicates if the matrix of the recruitment plot is to be returned.}
|
40
|
-
\item{ret.hist}{
|
40
|
+
\item{ret.hist}{Ignored, for backwards compatibility.}
|
41
41
|
\item{ret.mode}{Indicates if the mode of the identity is to be computed. It requires the modeest
|
42
42
|
package.}
|
43
43
|
\item{id.cutoff}{Minimum identity to consider an alignment as "top". By default, it is 0.95 for the
|
@@ -59,7 +59,7 @@ id.mean: Mean identity.
|
|
59
59
|
|
60
60
|
id.median: Median identity.
|
61
61
|
|
62
|
-
id.mode (if ret.mode=TRUE): Mode of the identity.
|
62
|
+
id.mode (if ret.mode=TRUE): Mode of the identity. Deprecated.
|
63
63
|
|
64
64
|
id.hist (if ret.hist=TRUE): Values of the identity histogram.
|
65
65
|
|
@@ -17,6 +17,7 @@ be produced by `enve.recplot2` and supports S4 method plot.}
|
|
17
17
|
\item{\code{id.counts}:}{(\code{numeric}) Counts per ID bin.}
|
18
18
|
\item{\code{id.breaks}:}{(\code{numeric}) Breaks of identity bins.}
|
19
19
|
\item{\code{pos.breaks}:}{(\code{numeric}) Breaks of position bins.}
|
20
|
+
\item{\code{pos.names}:}{(\code{character}) Names of the position bins.}
|
20
21
|
\item{\code{seq.breaks}:}{(\code{numeric}) Breaks of input sequences.}
|
21
22
|
\item{\code{peaks}:}{(\code{list}) Peaks identified in the recplot.
|
22
23
|
Limits of the subject sequences after concatenation.}
|
@@ -3,10 +3,10 @@
|
|
3
3
|
\title{enve recplot2}
|
4
4
|
\description{Produces recruitment plots provided that BlastTab.catsbj.pl has
|
5
5
|
been previously executed.}
|
6
|
-
\usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000,
|
7
|
-
id.free.range = FALSE, id.metric = c("identity",
|
8
|
-
"bit score"), id.summary = sum,
|
9
|
-
verbose = TRUE, ...)}
|
6
|
+
\usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000, pos.breaks.tsv = NA,
|
7
|
+
id.breaks = 300, id.free.range = FALSE, id.metric = c("identity",
|
8
|
+
"corrected identity", "bit score"), id.summary = sum,
|
9
|
+
id.cutoff = 95, threads = 2, verbose = TRUE, ...)}
|
10
10
|
\arguments{
|
11
11
|
\item{prefix}{Path to the prefix of the BlastTab.catsbj.pl output files. At
|
12
12
|
least the files .rec and .lim must exist with this prefix.}
|
@@ -14,7 +14,15 @@ least the files .rec and .lim must exist with this prefix.}
|
|
14
14
|
\item{pos.breaks}{Breaks in the positions histogram. It can also be a vector of break
|
15
15
|
points, and values outside the range are ignored. If zero (0), it
|
16
16
|
uses the sequence breaks as defined in the .lim file, which means
|
17
|
-
one bin per contig (or gene, if the mapping is agains genes).
|
17
|
+
one bin per contig (or gene, if the mapping is against genes). Ignored
|
18
|
+
if `pos.breaks.tsv` is passed.}
|
19
|
+
\item{pos.breaks.tsv}{Path to a list of (absolute) coordinates to use as position breaks.
|
20
|
+
This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
|
21
|
+
must contain at least one column: coordinates of the break positions of
|
22
|
+
each position bin. If it has a second column, this is used as the name
|
23
|
+
of the position bin that ends at the given coordinate (the first row is
|
24
|
+
ignored). Any additional columns are currently ignored. If NA,
|
25
|
+
position bins are determined by `pos.breaks`.}
|
18
26
|
\item{id.breaks}{Breaks in the identity histogram. It can also be a vector of break
|
19
27
|
points, and values outside the range are ignored.}
|
20
28
|
\item{id.free.range}{Indicates that the range should be freely set from the observed
|