miga-base 0.3.9.0 → 0.3.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/add.rb +33 -33
- data/actions/edit.rb +33 -0
- data/actions/new.rb +17 -18
- data/actions/next_step.rb +33 -0
- data/actions/run.rb +15 -12
- data/bin/miga +43 -37
- data/lib/miga/daemon.rb +2 -2
- data/lib/miga/project/result.rb +16 -1
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +1 -3
- data/scripts/ani_distances.bash +1 -3
- data/scripts/assembly.bash +1 -3
- data/scripts/cds.bash +1 -3
- data/scripts/clade_finding.bash +1 -3
- data/scripts/d.bash +13 -0
- data/scripts/distances.bash +1 -3
- data/scripts/essential_genes.bash +1 -3
- data/scripts/haai_distances.bash +1 -3
- data/scripts/miga.bash +12 -9
- data/scripts/mytaxa.bash +1 -3
- data/scripts/mytaxa_scan.bash +1 -3
- data/scripts/ogs.bash +36 -33
- data/scripts/p.bash +23 -0
- data/scripts/project_stats.bash +1 -3
- data/scripts/read_quality.bash +1 -3
- data/scripts/ssu.bash +1 -3
- data/scripts/stats.bash +1 -3
- data/scripts/subclades.bash +1 -3
- data/scripts/taxonomy.bash +1 -3
- data/scripts/trimmed_fasta.bash +1 -3
- data/scripts/trimmed_reads.bash +1 -3
- data/test/daemon_test.rb +3 -3
- data/utils/distance/runner.rb +1 -1
- data/utils/enveomics/Docs/recplot2.md +13 -2
- data/utils/enveomics/Examples/aai-matrix.bash +3 -3
- data/utils/enveomics/Examples/ani-matrix.bash +3 -3
- data/utils/enveomics/Makefile +2 -2
- data/utils/enveomics/Manifest/Tasks/blasttab.json +12 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +135 -0
- data/utils/enveomics/Manifest/Tasks/other.json +49 -0
- data/utils/enveomics/Manifest/categories.json +4 -0
- data/utils/enveomics/Manifest/examples.json +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +63 -65
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +4 -2
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/aai.rb +4 -3
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +3 -3
- data/utils/enveomics/enveomics.R/R/recplot.R +2 -3
- data/utils/enveomics/enveomics.R/R/recplot2.R +221 -160
- data/utils/enveomics/enveomics.R/R/utils.R +19 -1
- data/utils/enveomics/enveomics.R/README.md +11 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +2 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +10 -8
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +13 -7
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +3 -4
- data/utils/subclade/runner.rb +4 -0
- metadata +14 -3
@@ -9,6 +9,7 @@ setClass("enve.RecPlot2",
|
|
9
9
|
id.counts='numeric', ##<< Counts per ID bin.
|
10
10
|
id.breaks='numeric', ##<< Breaks of identity bins.
|
11
11
|
pos.breaks='numeric', ##<< Breaks of position bins.
|
12
|
+
pos.names='character', ##<< Names of the position bins.
|
12
13
|
seq.breaks='numeric', ##<< Breaks of input sequences.
|
13
14
|
peaks='list', ##<< Peaks identified in the recplot.
|
14
15
|
### Limits of the subject sequences after concatenation.
|
@@ -70,10 +71,17 @@ plot.enve.RecPlot2 <- function
|
|
70
71
|
### 3: identity histogram,
|
71
72
|
### 4: Populations histogram (histogram of sequencing depths),
|
72
73
|
### 5: Color scale for the counts matrix (vertical),
|
73
|
-
### 6: Color scale of the counts
|
74
|
-
###
|
75
|
-
###
|
76
|
-
|
74
|
+
### 6: Color scale of the counts matrix (horizontal)
|
75
|
+
### Only panels indicated here will be plotted. To plot only one panel
|
76
|
+
### simply set this to the number of the panel you want to plot.
|
77
|
+
panel.fun=list(),
|
78
|
+
### List of functions to be executed after drawing each panel. Use the
|
79
|
+
### indices in `layout` (as characters) as keys. Functions for indices
|
80
|
+
### missing in `layout` are ignored. For example, to add a vertical line
|
81
|
+
### at the 3Mbp mark in both the position histogram and the counts matrix:
|
82
|
+
### `list('1'=function() abline(v=3), '2'=function() abline(v=3))`.
|
83
|
+
### Note that the X-axis in both panels is in Mbp by default. To change
|
84
|
+
### this behavior, set `pos.units` accordingly.
|
77
85
|
widths=c(1,7,2),
|
78
86
|
### Relative widths of the columns of `layout`.
|
79
87
|
heights=c(1,2),
|
@@ -166,7 +174,7 @@ plot.enve.RecPlot2 <- function
|
|
166
174
|
list(maxColorValue=256, alpha=52)));
|
167
175
|
}
|
168
176
|
|
169
|
-
# Counts matrix
|
177
|
+
# [1] Counts matrix
|
170
178
|
if(any(layout==1)){
|
171
179
|
par(mar=mar[['1']]);
|
172
180
|
plot(1, t='n', bty='l',
|
@@ -182,9 +190,10 @@ plot.enve.RecPlot2 <- function
|
|
182
190
|
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
183
191
|
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
184
192
|
length.out=1+length(palette)), add=TRUE);
|
193
|
+
if(exists('1',panel.fun)) panel.fun[['1']]();
|
185
194
|
}
|
186
195
|
|
187
|
-
# Position histogram
|
196
|
+
# [2] Position histogram
|
188
197
|
if(any(layout==2)){
|
189
198
|
par(mar=mar[['2']]);
|
190
199
|
if(any(layout==1)){
|
@@ -213,9 +222,10 @@ plot.enve.RecPlot2 <- function
|
|
213
222
|
if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
|
214
223
|
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
|
215
224
|
seqdepth.lim[1]*3/2, col=in.col, border=NA);
|
225
|
+
if(exists('2',panel.fun)) panel.fun[['2']]();
|
216
226
|
}
|
217
227
|
|
218
|
-
# Identity histogram
|
228
|
+
# [3] Identity histogram
|
219
229
|
if(any(layout==3)){
|
220
230
|
par(mar=mar[['3']]);
|
221
231
|
if(any(layout==1)){
|
@@ -248,9 +258,10 @@ plot.enve.RecPlot2 <- function
|
|
248
258
|
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
249
259
|
text(1,1,labels='Insufficient data', srt=90)
|
250
260
|
}
|
261
|
+
if(exists('3',panel.fun)) panel.fun[['3']]();
|
251
262
|
}
|
252
263
|
|
253
|
-
# Populations histogram
|
264
|
+
# [4] Populations histogram
|
254
265
|
peaks <- NA;
|
255
266
|
if(any(layout==4)){
|
256
267
|
par(mar=mar[['4']]);
|
@@ -308,9 +319,10 @@ plot.enve.RecPlot2 <- function
|
|
308
319
|
dpt,'X (', frx, '%', err, ')', sep=''))
|
309
320
|
}
|
310
321
|
}
|
322
|
+
if(exists('4',panel.fun)) panel.fun[['4']]();
|
311
323
|
}
|
312
324
|
|
313
|
-
# Color scale
|
325
|
+
# [5] Color scale of the counts matrix (vertical)
|
314
326
|
count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
|
315
327
|
length.out=1+length(palette))
|
316
328
|
if(any(layout==5)){
|
@@ -319,13 +331,17 @@ plot.enve.RecPlot2 <- function
|
|
319
331
|
ylim=range(count.bins), yaxs='i', ylab='')
|
320
332
|
rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
|
321
333
|
border=NA)
|
334
|
+
if(exists('5',panel.fun)) panel.fun[['5']]();
|
322
335
|
}
|
336
|
+
|
337
|
+
# [6] Color scale of the counts matrix (horizontal)
|
323
338
|
if(any(layout==6)){
|
324
339
|
par(mar=mar[['6']]);
|
325
340
|
plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
|
326
341
|
xlim=range(count.bins), xaxs='i',xlab='');
|
327
342
|
rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
|
328
343
|
border=NA);
|
344
|
+
if(exists('6',panel.fun)) panel.fun[['6']]();
|
329
345
|
}
|
330
346
|
|
331
347
|
par(mar=ori.mar);
|
@@ -337,113 +353,129 @@ plot.enve.RecPlot2 <- function
|
|
337
353
|
|
338
354
|
#==============> Define core functions
|
339
355
|
enve.recplot2 <- function(
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
if(verbose) cat("Building counts matrix.\n")
|
389
|
-
if(id.metric=="corrected identity" & ncol(rec)<6){
|
390
|
-
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
391
|
-
}
|
392
|
-
rec.idcol <- ifelse(id.metric=="identity", 3,
|
393
|
-
ifelse(id.metric=="corrected identity", 6, 4));
|
394
|
-
if(length(pos.breaks)==1){
|
395
|
-
if(pos.breaks>0){
|
396
|
-
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
|
397
|
-
}else{
|
398
|
-
pos.breaks <- c(lim[1,2], lim[,3])
|
399
|
-
}
|
400
|
-
}
|
401
|
-
if(length(id.breaks)==1){
|
402
|
-
id.range.v <- rec[,rec.idcol]
|
403
|
-
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
404
|
-
id.range.v <- range(id.range.v)
|
405
|
-
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
|
406
|
-
}
|
407
|
-
|
408
|
-
# Run in parallel
|
409
|
-
if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
|
410
|
-
cl <- makeCluster(threads)
|
411
|
-
rec.l <- list()
|
412
|
-
thl <- ceiling(nrow(rec)/threads)
|
413
|
-
for(i in 0:(threads-1)){
|
414
|
-
rec.l[[i+1]] <- list(rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
|
415
|
-
verbose=ifelse(i==0, verbose, FALSE))
|
416
|
-
}
|
417
|
-
counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
|
418
|
-
pos.breaks=pos.breaks, id.breaks=id.breaks,
|
419
|
-
rec.idcol=rec.idcol)
|
420
|
-
counts <- counts.l[[1]]
|
421
|
-
if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
|
422
|
-
stopCluster(cl)
|
423
|
-
|
424
|
-
# Estimate 1D histograms
|
425
|
-
if(verbose) cat("Building histograms.\n")
|
426
|
-
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
|
427
|
-
id.ingroup <- (id.mids > id.cutoff);
|
428
|
-
id.counts <- apply(counts, 2, id.summary);
|
429
|
-
pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
|
430
|
-
pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
|
356
|
+
### Produces recruitment plots provided that BlastTab.catsbj.pl has
|
357
|
+
### been previously executed.
|
358
|
+
prefix,
|
359
|
+
### Path to the prefix of the BlastTab.catsbj.pl output files. At
|
360
|
+
### least the files .rec and .lim must exist with this prefix.
|
361
|
+
plot=TRUE,
|
362
|
+
### Should the object be plotted?
|
363
|
+
pos.breaks=1e3,
|
364
|
+
### Breaks in the positions histogram. It can also be a vector of break
|
365
|
+
### points, and values outside the range are ignored. If zero (0), it
|
366
|
+
### uses the sequence breaks as defined in the .lim file, which means
|
367
|
+
### one bin per contig (or gene, if the mapping is against genes). Ignored
|
368
|
+
### if `pos.breaks.tsv` is passed.
|
369
|
+
pos.breaks.tsv=NA,
|
370
|
+
### Path to a list of (absolute) coordinates to use as position breaks.
|
371
|
+
### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
|
372
|
+
### must contain at least one column: coordinates of the break positions of
|
373
|
+
### each position bin. If it has a second column, this is used as the name
|
374
|
+
### of the position bin that ends at the given coordinate (the first row is
|
375
|
+
### ignored). Any additional columns are currently ignored. If NA,
|
376
|
+
### position bins are determined by `pos.breaks`.
|
377
|
+
id.breaks=300,
|
378
|
+
### Breaks in the identity histogram. It can also be a vector of break
|
379
|
+
### points, and values outside the range are ignored.
|
380
|
+
id.free.range=FALSE,
|
381
|
+
### Indicates that the range should be freely set from the observed
|
382
|
+
### values. Otherwise, 70-100% is included in the identity histogram
|
383
|
+
### (default).
|
384
|
+
id.metric=c('identity', 'corrected identity', 'bit score'),
|
385
|
+
### Metric of identity to be used (Y-axis). Corrected identity is only
|
386
|
+
### supported if the original BLAST file included sequence lengths.
|
387
|
+
id.summary=sum,
|
388
|
+
### Function summarizing the identity bins. Other recommended options
|
389
|
+
### include: `median` to estimate the median instead of total bins, and
|
390
|
+
### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
|
391
|
+
id.cutoff=95,
|
392
|
+
### Cutoff of identity metric above which the hits are considered
|
393
|
+
### 'in-group'. The 95% identity corresponds to the expectation of
|
394
|
+
### ANI>95% within species.
|
395
|
+
threads=2,
|
396
|
+
### Number of threads to use.
|
397
|
+
verbose=TRUE,
|
398
|
+
### Indicates if the function should report the advance.
|
399
|
+
...
|
400
|
+
### Any additional parameters supported by `plot.enve.RecPlot2`.
|
401
|
+
){
|
402
|
+
# Settings
|
403
|
+
id.metric <- match.arg(id.metric);
|
431
404
|
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
405
|
+
#Read files
|
406
|
+
if(verbose) cat("Reading files.\n")
|
407
|
+
rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
|
408
|
+
quote="");
|
409
|
+
lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
|
410
|
+
quote="", as.is=TRUE);
|
411
|
+
|
412
|
+
# Build matrix
|
413
|
+
if(verbose) cat("Building counts matrix.\n")
|
414
|
+
if(id.metric=="corrected identity" & ncol(rec)<6){
|
415
|
+
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
416
|
+
}
|
417
|
+
rec.idcol <- ifelse(id.metric=="identity", 3,
|
418
|
+
ifelse(id.metric=="corrected identity", 6, 4))
|
419
|
+
pos.names <- as.character(NULL)
|
420
|
+
if(!is.na(pos.breaks.tsv)){
|
421
|
+
tmp <- read.table(pos.breaks.tsv, sep='\t', header=FALSE, as.is=TRUE)
|
422
|
+
pos.breaks <- as.numeric(tmp[,1])
|
423
|
+
if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
|
424
|
+
}else if(length(pos.breaks)==1){
|
425
|
+
if(pos.breaks>0){
|
426
|
+
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
|
427
|
+
}else{
|
428
|
+
pos.breaks <- c(lim[1,2], lim[,3])
|
429
|
+
pos.names <- lim[,1]
|
430
|
+
}
|
431
|
+
}
|
432
|
+
if(length(id.breaks)==1){
|
433
|
+
id.range.v <- rec[,rec.idcol]
|
434
|
+
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
435
|
+
id.range.v <- range(id.range.v)
|
436
|
+
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
|
437
|
+
}
|
438
|
+
|
439
|
+
# Run in parallel
|
440
|
+
if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
|
441
|
+
cl <- makeCluster(threads)
|
442
|
+
rec.l <- list()
|
443
|
+
thl <- ceiling(nrow(rec)/threads)
|
444
|
+
for(i in 0:(threads-1)){
|
445
|
+
rec.l[[i+1]] <- list(
|
446
|
+
rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
|
447
|
+
verbose=ifelse(i==0, verbose, FALSE))
|
448
|
+
}
|
449
|
+
counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
|
450
|
+
pos.breaks=pos.breaks, id.breaks=id.breaks,
|
451
|
+
rec.idcol=rec.idcol)
|
452
|
+
counts <- counts.l[[1]]
|
453
|
+
if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
|
454
|
+
stopCluster(cl)
|
455
|
+
|
456
|
+
# Estimate 1D histograms
|
457
|
+
if(verbose) cat("Building histograms.\n")
|
458
|
+
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
|
459
|
+
id.ingroup <- (id.mids > id.cutoff);
|
460
|
+
id.counts <- apply(counts, 2, id.summary);
|
461
|
+
pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
|
462
|
+
pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
|
463
|
+
|
464
|
+
# Plot and return
|
465
|
+
recplot <- new('enve.RecPlot2',
|
466
|
+
counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
|
467
|
+
pos.counts.out=pos.counts.out,
|
468
|
+
id.breaks=id.breaks, pos.breaks=pos.breaks, pos.names=pos.names,
|
469
|
+
seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
|
470
|
+
id.ingroup=id.ingroup,id.metric=id.metric,
|
471
|
+
call=match.call());
|
472
|
+
if(plot){
|
473
|
+
if(verbose) cat("Plotting.\n")
|
474
|
+
peaks <- plot(recplot, ...);
|
475
|
+
attr(recplot, "peaks") <- peaks
|
476
|
+
}
|
477
|
+
return(recplot);
|
478
|
+
### Returns an object of class `enve.RecPlot2`.
|
447
479
|
}
|
448
480
|
|
449
481
|
enve.recplot2.findPeaks <- function(
|
@@ -502,9 +534,11 @@ enve.recplot2.findPeaks.emauto <- function(
|
|
502
534
|
stop('Invalid criterion ', criterion)
|
503
535
|
}
|
504
536
|
for(comp in components){
|
537
|
+
if(verbose) cat('Testing:',comp,'\n')
|
505
538
|
best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
|
506
539
|
verbose, ...)
|
507
540
|
}
|
541
|
+
if(length(best[['peaks']])==0) return(list())
|
508
542
|
|
509
543
|
seqdepths.r <- signif(log(sapply(best[['peaks']],
|
510
544
|
function(x) x$seq.depth)), merge.tol)
|
@@ -609,7 +643,7 @@ enve.recplot2.findPeaks.mower <- function(
|
|
609
643
|
### Range of quantiles to be used in the estimation of a peak's
|
610
644
|
### parameters.
|
611
645
|
mlv.opts=list(method='parzen'),
|
612
|
-
###
|
646
|
+
### Ignored. For backwards compatibility.
|
613
647
|
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
614
648
|
start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
|
615
649
|
### Options passed to `fitdist` to estimate the standard deviation if
|
@@ -758,53 +792,75 @@ enve.recplot2.changeCutoff <- function
|
|
758
792
|
return(rp)
|
759
793
|
}
|
760
794
|
|
761
|
-
enve.recplot2.
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
### Returns subject sequence names instead of a vector of Booleans. If
|
777
|
-
### the recruitment plot was generated with pos.breaks=0 it returns a
|
778
|
-
### vector of characters (the sequence identifiers), otherwise it returns
|
779
|
-
### a data.frame with a name column and two columns of coordinates.
|
780
|
-
){
|
781
|
-
# Determine the threshold
|
795
|
+
enve.recplot2.windowDepthThreshold <- function
### Identifies the sequencing-depth threshold below which (or above which, if
### `lower.tail=FALSE`) windows should be identified as variable or absent.
### The threshold is the `significance` quantile (or its complement) of the
### distribution fitted to the peak.
(rp,
### Recruitment plot, an `enve.RecPlot2` object. It is not used in the
### calculation of the threshold.
peak,
### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
### used (see `enve.recplot2.corePeak`).
lower.tail=TRUE,
### If FALSE, it returns the threshold above which windows are significantly
### above the peak in sequencing depth.
significance=0.05
### Significance threshold (alpha) to select windows.
){
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
# Arguments for the quantile function: the fitted parameters plus `p`
q.args <- peak$param.hat
q.args[["p"]] <- ifelse(lower.tail, significance, 1-significance)
# Four arguments (including `p`) select `qsn`; anything else uses `qnorm`
q.fun <- if(length(q.args)==4) qsn else qnorm
thr <- do.call(q.fun, q.args)
# Peaks fitted in log scale yield a log-scale quantile: convert it back
if(peak$log) thr <- exp(thr)

return(thr)
### Returns a float: the selected quantile of the distribution fitted to the
### peak. If the peak was estimated in log scale (`peak$log`), the quantile
### is exponentiated before being returned.
}
|
820
|
+
|
821
|
+
enve.recplot2.extractWindows <- function
### Selects the position windows whose sequencing depth is significantly
### below (or above) the peak.
(rp,
### Recruitment plot, an `enve.RecPlot2` object.
peak,
### Peak, an `enve.RecPlot2.Peak` object, or a list of them. It is passed
### as-is to `enve.recplot2.windowDepthThreshold`.
lower.tail=TRUE,
### If TRUE, windows with depth below the threshold are selected. If FALSE,
### windows with depth above the threshold are selected.
significance=0.05,
### Significance threshold (alpha) to select windows.
seq.names=FALSE
### If TRUE, returns names or coordinates of the selected windows instead of
### a vector of Booleans. For recruitment plots with named position bins
### (e.g., generated with `pos.breaks`=0 or a two-column `pos.breaks.tsv`)
### the names of the selected bins are returned; otherwise the output of
### `enve.recplot2.coordinates` for the selected windows is returned.
){
# Depth cutoff derived from the peak
cutoff <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)

# Flag the windows beyond the cutoff, on the requested side
depth <- enve.recplot2.seqdepth(rp)
sel <- if(lower.tail) depth < cutoff else depth > cutoff

# Plain logical selection requested
if(!seq.names) return(sel)
# Names requested, and the position bins are named
bin.names <- rp$pos.names
if(length(bin.names) != 0) return(bin.names[sel])
# Names requested, but the position bins are unnamed
return(enve.recplot2.coordinates(rp,sel))
### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
### it returns a vector of characters if the object has `pos.names` defined,
### or the value of `enve.recplot2.coordinates` for the selected windows
### otherwise.
}
|
809
865
|
|
810
866
|
enve.recplot2.compareIdentities <- function
|
@@ -931,7 +987,11 @@ enve.recplot2.seqdepth <- function
|
|
931
987
|
){
|
932
988
|
if(!inherits(x, "enve.RecPlot2"))
|
933
989
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
934
|
-
|
990
|
+
if(low.identity){
|
991
|
+
pos.cnts.in <- x$pos.counts.out
|
992
|
+
}else{
|
993
|
+
pos.cnts.in <- x$pos.counts.in
|
994
|
+
}
|
935
995
|
pos.breaks <- x$pos.breaks
|
936
996
|
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
937
997
|
seqdepth.in <- pos.cnts.in/pos.binsize
|
@@ -987,6 +1047,7 @@ enve.recplot2.findPeaks.__emauto_one <- function
|
|
987
1047
|
### Internal ancillary function (see `enve.recplot2.findPeaks.emauto`).
|
988
1048
|
(x, comp, do_crit, best, verbose, ...){
|
989
1049
|
peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
|
1050
|
+
if(length(peaks)==0) return(best)
|
990
1051
|
k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
|
991
1052
|
crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
|
992
1053
|
if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
|
@@ -1049,7 +1110,7 @@ enve.recplot2.findPeaks.__mow_one <- function
|
|
1049
1110
|
|
1050
1111
|
# Find peak
|
1051
1112
|
o <- mlv.opts; o$x = lsd1;
|
1052
|
-
mode1 <- do.call(mlv, o)$M;
|
1113
|
+
mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
|
1053
1114
|
if(verbose) cat('Anchoring at mode =',mode1,'\n')
|
1054
1115
|
param.hat <- fitdist.opts$start; last.hat <- param.hat;
|
1055
1116
|
lim <- NA;
|
@@ -7,10 +7,28 @@ enve.col.alpha <- function
|
|
7
7
|
### such as 'darkred' or '#009988'.
|
8
8
|
alpha=1/2
|
9
9
|
### Alpha value to add to the color, from 0 to 1.
|
10
|
-
|
10
|
+
){
|
11
11
|
return(
|
12
12
|
apply(col2rgb(col), 2,
|
13
13
|
function(x) do.call(rgb, as.list(c(x[1:3]/256, alpha))) ) )
|
14
14
|
### Returns a color or a vector of colors in hex notation including alpha.
|
15
15
|
}
|
16
16
|
|
17
|
+
enve.truncate <- function
### Removes the `n` highest and lowest values from a vector, and applies a
### summary function. The value of `n` is determined such that the central
### range is used, corresponding to the `f` fraction of values.
(x,
### A vector of numbers.
f=0.95,
### The fraction of values to retain.
FUN=mean
### Summary function to apply to the vectors. To obtain the truncated
### vector itself, use `c`.
){
# Number of values to drop from each tail
n <- round(length(x)*(1-f)/2)
y <- sort(x)
# Only trim when there is something to trim: with n=0, `seq(1, n)` and
# `seq(length(x)+1-n, length(x))` count backwards and would wrongly drop
# the smallest and largest values.
if(n > 0) y <- y[ -c(seq_len(n), seq(length(x)+1-n, length(x))) ]
return(FUN(y))
### Returns the summary (`FUN`) of the truncated vector.
}
|
34
|
+
|
@@ -32,6 +32,7 @@ And open help messages using any of the following commands:
|
|
32
32
|
?enve.recplot2.changeCutoff
|
33
33
|
?enve.recplot2.findPeaks
|
34
34
|
?enve.recplot2.corePeak
|
35
|
+
?enve.recplot2.windowDepthThreshold
|
35
36
|
?enve.recplot2.extractWindows
|
36
37
|
?enve.recplot2.coordinates
|
37
38
|
?enve.recplot2.seqdepth
|
@@ -41,6 +42,7 @@ And open help messages using any of the following commands:
|
|
41
42
|
?enve.tribs.test
|
42
43
|
?enve.growthcurve
|
43
44
|
?enve.col.alpha
|
45
|
+
?enve.truncate
|
44
46
|
```
|
45
47
|
|
46
48
|
You can run some examples using these libraries in the
|
@@ -50,6 +52,15 @@ For additional information on recruitment plots, see the
|
|
50
52
|
[Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
|
51
53
|
|
52
54
|
## Changelog
|
55
|
+
* 1.4.4: Removes modeest library as requirement, and replaces mower peak-finder
|
56
|
+
initialization to median (instead of mode).
|
57
|
+
* 1.4.2: Solved bug #36.
|
58
|
+
* 1.4.0: New option `pos.breaks.tsv` for `enve.recplot2`.
|
59
|
+
* 1.3.4: Gracefully handles and plots recruitment plots with insufficient data
|
60
|
+
to find peaks.
|
61
|
+
* 1.3.3: New function `enve.recplot2.windowDepthThreshold`.
|
62
|
+
* 1.3.2: New option `panel.fun` for `plot.enve.RecPlot2`.
|
63
|
+
* 1.3.1: New function enve.truncate.
|
53
64
|
* 1.3: Several bug fixes and new utilities for recruitment plots (recplot2).
|
54
65
|
* 1.1.0: New function enve.growthcurve and related class enve.GrowthCurve
|
55
66
|
with S3 methods plot and summary.
|
@@ -37,7 +37,7 @@ If non-zero, requires the stats package.}
|
|
37
37
|
\item{main}{Title of the plot.}
|
38
38
|
\item{contig.col}{Color of the Contig boundaries. Set to NA to ignore Contig boundaries.}
|
39
39
|
\item{ret.recplot}{Indicates if the matrix of the recruitment plot is to be returned.}
|
40
|
-
\item{ret.hist}{
|
40
|
+
\item{ret.hist}{Ignored, for backwards compatibility.}
|
41
41
|
\item{ret.mode}{Indicates if the mode of the identity is to be computed. It requires the modeest
|
42
42
|
package.}
|
43
43
|
\item{id.cutoff}{Minimum identity to consider an alignment as "top". By default, it is 0.95 for the
|
@@ -59,7 +59,7 @@ id.mean: Mean identity.
|
|
59
59
|
|
60
60
|
id.median: Median identity.
|
61
61
|
|
62
|
-
id.mode (if ret.mode=TRUE): Mode of the identity.
|
62
|
+
id.mode (if ret.mode=TRUE): Mode of the identity. Deprecated.
|
63
63
|
|
64
64
|
id.hist (if ret.hist=TRUE): Values of the identity histogram.
|
65
65
|
|
@@ -17,6 +17,7 @@ be produced by `enve.recplot2` and supports S4 method plot.}
|
|
17
17
|
\item{\code{id.counts}:}{(\code{numeric}) Counts per ID bin.}
|
18
18
|
\item{\code{id.breaks}:}{(\code{numeric}) Breaks of identity bins.}
|
19
19
|
\item{\code{pos.breaks}:}{(\code{numeric}) Breaks of position bins.}
|
20
|
+
\item{\code{pos.names}:}{(\code{character}) Names of the position bins.}
|
20
21
|
\item{\code{seq.breaks}:}{(\code{numeric}) Breaks of input sequences.}
|
21
22
|
\item{\code{peaks}:}{(\code{list}) Peaks identified in the recplot.
|
22
23
|
Limits of the subject sequences after concatenation.}
|
@@ -3,10 +3,10 @@
|
|
3
3
|
\title{enve recplot2}
|
4
4
|
\description{Produces recruitment plots provided that BlastTab.catsbj.pl has
|
5
5
|
been previously executed.}
|
6
|
-
\usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000,
|
7
|
-
id.free.range = FALSE, id.metric = c("identity",
|
8
|
-
"bit score"), id.summary = sum,
|
9
|
-
verbose = TRUE, ...)}
|
6
|
+
\usage{enve.recplot2(prefix, plot = TRUE, pos.breaks = 1000, pos.breaks.tsv = NA,
|
7
|
+
id.breaks = 300, id.free.range = FALSE, id.metric = c("identity",
|
8
|
+
"corrected identity", "bit score"), id.summary = sum,
|
9
|
+
id.cutoff = 95, threads = 2, verbose = TRUE, ...)}
|
10
10
|
\arguments{
|
11
11
|
\item{prefix}{Path to the prefix of the BlastTab.catsbj.pl output files. At
|
12
12
|
least the files .rec and .lim must exist with this prefix.}
|
@@ -14,7 +14,15 @@ least the files .rec and .lim must exist with this prefix.}
|
|
14
14
|
\item{pos.breaks}{Breaks in the positions histogram. It can also be a vector of break
|
15
15
|
points, and values outside the range are ignored. If zero (0), it
|
16
16
|
uses the sequence breaks as defined in the .lim file, which means
|
17
|
-
one bin per contig (or gene, if the mapping is agains genes).
|
17
|
+
one bin per contig (or gene, if the mapping is against genes). Ignored
|
18
|
+
if `pos.breaks.tsv` is passed.}
|
19
|
+
\item{pos.breaks.tsv}{Path to a list of (absolute) coordinates to use as position breaks.
|
20
|
+
This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
|
21
|
+
must contain at least one column: coordinates of the break positions of
|
22
|
+
each position bin. If it has a second column, this is used as the name
|
23
|
+
of the position bin that ends at the given coordinate (the first row is
|
24
|
+
ignored). Any additional columns are currently ignored. If NA,
|
25
|
+
position bins are determined by `pos.breaks`.}
|
18
26
|
\item{id.breaks}{Breaks in the identity histogram. It can also be a vector of break
|
19
27
|
points, and values outside the range are ignored.}
|
20
28
|
\item{id.free.range}{Indicates that the range should be freely set from the observed
|