miga-base 0.3.1.7 → 0.3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +8 -0
- data/lib/miga/common.rb +9 -215
- data/lib/miga/common/base.rb +49 -0
- data/lib/miga/common/format.rb +135 -0
- data/lib/miga/common/path.rb +49 -0
- data/lib/miga/daemon.rb +3 -60
- data/lib/miga/daemon/base.rb +69 -0
- data/lib/miga/dataset.rb +3 -3
- data/lib/miga/dataset/result.rb +5 -5
- data/lib/miga/result.rb +5 -0
- data/lib/miga/version.rb +7 -5
- data/scripts/distances.bash +2 -19
- data/scripts/taxonomy.bash +2 -21
- data/test/common_test.rb +9 -0
- data/utils/distance/base.rb +6 -0
- data/utils/distance/commands.rb +82 -0
- data/utils/distance/database.rb +86 -0
- data/utils/distance/pipeline.rb +98 -0
- data/utils/distance/runner.rb +104 -0
- data/utils/distance/temporal.rb +37 -0
- data/utils/distances.rb +9 -0
- data/utils/enveomics/Docs/recplot2.md +233 -0
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
- data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
- data/utils/enveomics/Manifest/categories.json +11 -1
- data/utils/enveomics/Manifest/examples.json +2 -2
- data/utils/enveomics/README.md +2 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
- data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
- data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/SRA.download.bash +28 -21
- data/utils/enveomics/Scripts/Table.barplot.R +1 -0
- data/utils/enveomics/Scripts/aai.rb +4 -2
- data/utils/enveomics/build_enveomics_r.bash +5 -5
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
- data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
- data/utils/enveomics/enveomics.R/README.md +26 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
- data/utils/requirements.txt +1 -1
- metadata +28 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -1,43 +1,49 @@
|
|
1
1
|
#!/usr/bin/env perl
|
2
2
|
|
3
|
-
# Interpose sequences in FastA format from two files into one output file. If more than two files are
|
4
|
-
# provided, the script will interpose all the input files.
|
5
|
-
# Please note that this script will check for the consistency of the names (assuming a pair of related reads
|
6
|
-
# contains the same name varying only in a trailing slash (/) followed by a digit. If you want to turn this
|
7
|
-
# feature off just set the $eval_T variable to zero. If you want to decrease the sampling period (to speed
|
8
|
-
# the script up) or increase it (to make it more sensitive to errors) just change $eval_T accordingly.
|
9
|
-
#
|
10
3
|
# @author Luis M. Rodriguez-R
|
11
|
-
# @version 1.0
|
12
|
-
# @created Nov-27-2012
|
13
|
-
# @update Mar-23-2015
|
14
4
|
# @license artistic license 2.0
|
15
|
-
#
|
16
|
-
# Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
|
17
5
|
|
18
6
|
use strict;
|
19
7
|
use warnings;
|
20
8
|
use Symbol;
|
21
9
|
|
22
10
|
my $HELP = <<HELP
|
11
|
+
|
12
|
+
Description:
|
13
|
+
Interposes sequences in FastA format from two files into one output file.
|
14
|
+
If more than two files are provided, the script will interpose all the input
|
15
|
+
files.
|
16
|
+
Note that this script will check for the consistency of the names (assuming
|
17
|
+
a pair of related reads contains the same name varying only in a trailing
|
18
|
+
slash (/) followed by a digit. If you want to turn this feature off just
|
19
|
+
set the -T option to zero. If you want to decrease the sampling period (to
|
20
|
+
speed the script up) or increase it (to make it more sensitive to errors)
|
21
|
+
just change the -T option accordingly.
|
22
|
+
|
23
23
|
Usage:
|
24
|
-
|
24
|
+
$0 [-T <int> ]<output_fasta> <input_fasta_1> <input_fasta_2> [additional input files...]
|
25
25
|
|
26
26
|
Where,
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
-T <int> : Optional. Integer indicating the sampling period for
|
28
|
+
names evaluation (see Description above).
|
29
|
+
By default: 1000.
|
30
|
+
output_fasta : Output file
|
31
|
+
input_fasta_1 : First FastA file
|
32
|
+
input_fasta_2 : Second FastA file
|
33
|
+
... : Any additional FastA files (or none)
|
31
34
|
|
32
35
|
HELP
|
33
36
|
;
|
34
|
-
my $eval_T = 1000;
|
35
|
-
|
37
|
+
my $eval_T = 1000;
|
38
|
+
if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
|
39
|
+
$eval_T = $ARGV[1]+0;
|
40
|
+
shift @ARGV;
|
41
|
+
shift @ARGV;
|
42
|
+
}
|
36
43
|
my $out = shift @ARGV;
|
37
44
|
my @in = @ARGV;
|
38
45
|
$/ = "\n>";
|
39
46
|
|
40
|
-
|
41
47
|
die $HELP unless $out and $#in >= 1;
|
42
48
|
open OUT, ">", $out or die "Unable to write on $out: $!\n";
|
43
49
|
print "Output file: $out\n";
|
@@ -1,11 +1,7 @@
|
|
1
1
|
#!/usr/bin/env perl
|
2
|
-
|
2
|
+
|
3
3
|
# @author Luis M. Rodriguez-R
|
4
|
-
# @version 2.0
|
5
|
-
# @update: Mar-23-2015
|
6
4
|
# @license artistic license 2.0
|
7
|
-
#
|
8
|
-
# Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
|
9
5
|
|
10
6
|
use strict;
|
11
7
|
use warnings;
|
@@ -14,24 +10,27 @@ use Symbol;
|
|
14
10
|
my $HELP = <<HELP
|
15
11
|
|
16
12
|
Description:
|
17
|
-
Interposes sequences in FastQ format from two files into one output file.
|
18
|
-
provided, the script will interpose all the input
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
(
|
23
|
-
|
24
|
-
|
13
|
+
Interposes sequences in FastQ format from two files into one output file.
|
14
|
+
If more than two files are provided, the script will interpose all the input
|
15
|
+
files.
|
16
|
+
Note that this script will check for the consistency of the names (assuming
|
17
|
+
a pair of related reads contains the same name varying only in a trailing
|
18
|
+
slash (/) followed by a digit. If you want to turn this feature off just
|
19
|
+
set the -T option to zero. If you want to decrease the sampling period (to
|
20
|
+
speed the script up) or increase it (to make it more sensitive to errors)
|
21
|
+
just change the -T option accordingly.
|
22
|
+
|
25
23
|
Usage:
|
26
|
-
|
24
|
+
$0 [-T <int> ]<output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
|
27
25
|
|
28
26
|
Where,
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
27
|
+
-T <int> : Optional. Integer indicating the sampling period for
|
28
|
+
names evaluation (see Description above).
|
29
|
+
By default: 1000.
|
30
|
+
output_fastq : Output file
|
31
|
+
input_fastq_1 : First FastQ file
|
32
|
+
input_fastq_2 : Second FastQ file
|
33
|
+
... : Any additional FastQ files (or none)
|
35
34
|
|
36
35
|
HELP
|
37
36
|
;
|
@@ -44,6 +43,7 @@ if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
|
|
44
43
|
my $out = shift @ARGV;
|
45
44
|
my @in = @ARGV;
|
46
45
|
|
46
|
+
|
47
47
|
die $HELP unless $out and $#in >= 1;
|
48
48
|
open OUT, ">", $out or die "Unable to write on $out: $!\n";
|
49
49
|
print "Output file: $out\n";
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @update Jan-04-2016
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
#= Load stuff
|
9
|
+
args <- commandArgs(trailingOnly = F)
|
10
|
+
enveomics_R <- file.path(dirname(
|
11
|
+
sub("^--file=", "", args[grep("^--file=", args)])),
|
12
|
+
"lib", "enveomics.R")
|
13
|
+
library(methods)
|
14
|
+
source(file.path(enveomics_R, "R", "cliopts.R"))
|
15
|
+
source(file.path(enveomics_R, "R", "recplot2.R"))
|
16
|
+
|
17
|
+
#= Generate interface
|
18
|
+
opt <- enve.cliopts(enve.recplot2.compareIdentities,
|
19
|
+
file.path(enveomics_R, "man", "enve.recplot2.compareIdentities.Rd"),
|
20
|
+
positional_arguments=2,
|
21
|
+
usage="usage: %prog [options] recplot-A.Rdata recplot-B.Rdata",
|
22
|
+
number=c("pseudocounts", "max.deviation"), ignore=c("x", "y"),
|
23
|
+
p_desc="Calculates the difference between identity distributions of two recruitment plots.")
|
24
|
+
|
25
|
+
#= Run it!
|
26
|
+
load(opt$args[1])
|
27
|
+
opt$options[['x']] <- rp
|
28
|
+
load(opt$args[2])
|
29
|
+
opt$options[['y']] <- rp
|
30
|
+
dist <- do.call("enve.recplot2.compareIdentities", opt$options)
|
31
|
+
cat(dist, '\n')
|
32
|
+
|
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
#
|
4
4
|
# @author Luis M. Rodriguez-R
|
5
|
-
# @update Nov-27-2015
|
6
5
|
# @license artistic license 2.0
|
7
6
|
#
|
8
7
|
|
9
|
-
DATA_LINK="
|
8
|
+
DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
|
10
9
|
DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
|
11
10
|
SRX=$1
|
12
|
-
DIR=${
|
11
|
+
DIR=${2:-$SRX}
|
13
12
|
|
14
13
|
if [[ "$SRX" == "" ]] ; then
|
15
14
|
echo "
|
@@ -27,24 +26,32 @@ fi
|
|
27
26
|
|
28
27
|
[[ -d "$DIR" ]] || mkdir "$DIR"
|
29
28
|
|
29
|
+
function md5value {
|
30
|
+
local file=$1
|
31
|
+
o=$(md5 "$file" | perl -pe 's/.* //')
|
32
|
+
[[ -n $o ]] || o=$(md5sum-lite "$file" | awk '{print $1}')
|
33
|
+
[[ -n $o ]] || o=$(md5sum "$file" | awk '{print $1}')
|
34
|
+
echo "$o"
|
35
|
+
}
|
36
|
+
|
30
37
|
curl -s "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
|
31
38
|
tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
39
|
+
srr=$(echo "$ln"|cut -f 1)
|
40
|
+
ftp=$(echo "$ln"|cut -f 2)
|
41
|
+
md5=$(echo "$ln"|cut -f 3)
|
42
|
+
dir="$DIR/$srr"
|
43
|
+
[[ -d "$dir" ]] || mkdir "$dir"
|
44
|
+
echo "o $srr" >&2
|
45
|
+
for uri in $(echo "$ftp" | tr ";" " ") ; do
|
46
|
+
file="$dir/$(basename $uri)"
|
47
|
+
curl "$uri" -o "$file"
|
48
|
+
md5obs=$(md5value "$file")
|
49
|
+
if [[ "$md5" == "$md5obs"* ]] ; then
|
50
|
+
md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
|
51
|
+
else
|
52
|
+
echo "Corrupt file: $file" >&2
|
53
|
+
echo " MD5 mismatch: $md5obs not in $md5" >&2
|
54
|
+
exit 1;
|
55
|
+
fi
|
56
|
+
done
|
50
57
|
done
|
@@ -11,6 +11,7 @@ enveomics_R <- file.path(dirname(
|
|
11
11
|
sub("^--file=", "", args[grep("^--file=", args)])),
|
12
12
|
"lib", "enveomics.R")
|
13
13
|
source(file.path(enveomics_R, "R", "cliopts.R"))
|
14
|
+
source(file.path(enveomics_R, "R", "utils.R"))
|
14
15
|
source(file.path(enveomics_R, "R", "barplot.R"))
|
15
16
|
|
16
17
|
#= Generate interface
|
@@ -296,8 +296,10 @@ Dir.mktmpdir do |dir|
|
|
296
296
|
"#{dir}/#{i}.tab.uns"`
|
297
297
|
`sort -k 1 "#{dir}/#{i}.tab.uns" > "#{dir}/#{i}.tab"`
|
298
298
|
when "diamond"
|
299
|
-
`"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --
|
300
|
-
--
|
299
|
+
`"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
|
300
|
+
--query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" \
|
301
|
+
&& "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt 6 \
|
302
|
+
--out "#{dir}/#{i}.tab"`
|
301
303
|
else
|
302
304
|
abort "Unsupported program: #{o[:program]}."
|
303
305
|
end
|
@@ -34,11 +34,11 @@ library(inlinedocs)
|
|
34
34
|
package.skeleton.dx('./');
|
35
35
|
" | R --vanilla
|
36
36
|
cat man/enveomics.R-package.Rd | tr -d '\r' \
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
37
|
+
| grep -v '^}$' | grep -v '^\\author{' \
|
38
|
+
| grep -v '^Maintainer' \
|
39
|
+
| perl -pe 's/^\\keyword/}\n\\author{Luis M. Rodriguez-R <lmrodriguezr\@gmail.com> [aut, cre]}\n\n\\keyword/' \
|
40
|
+
| perl -lwe '$/=\0; $_=<>; s/^\\details{\n+([^}].*\n+)*}\n+//mg; print' \
|
41
|
+
> o && mv o man/enveomics.R-package.Rd
|
42
42
|
#[[ ! -d inst/doc ]] && mkdir -p inst/doc
|
43
43
|
#pandoc -o inst/doc/enveomics.R.pdf -f markdown_github README.md
|
44
44
|
|
@@ -10,7 +10,8 @@ importFrom("graphics", "abline", "axis", "barplot", "hist", "image",
|
|
10
10
|
"layout", "legend", "lines", "par", "points", "polygon",
|
11
11
|
"rect", "text")
|
12
12
|
importFrom("stats", "as.dist", "cophenetic", "cor", "median",
|
13
|
-
"quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm"
|
13
|
+
"quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm",
|
14
|
+
"dnorm", "kmeans")
|
14
15
|
importFrom("utils", "head", "read.table", "setTxtProgressBar", "tail",
|
15
16
|
"txtProgressBar")
|
16
17
|
exportClasses(enve.RecPlot2, enve.RecPlot2.Peak, enve.TRIBS, enve.TRIBStest,
|
@@ -29,7 +30,10 @@ export(
|
|
29
30
|
plot.enve.TRIBStest, summary.enve.TRIBStest,
|
30
31
|
enve.df2dist, enve.df2dist.group, enve.df2dist.list,
|
31
32
|
enve.recplot2, plot.enve.RecPlot2, enve.recplot2.findPeaks,
|
33
|
+
enve.recplot2.findPeaks.emauto, enve.recplot2.findPeaks.em,
|
34
|
+
enve.recplot2.findPeaks.mower,
|
32
35
|
enve.recplot2.corePeak, enve.recplot2.changeCutoff,
|
33
|
-
enve.recplot2.extractWindows,
|
36
|
+
enve.recplot2.extractWindows, enve.recplot2.compareIdentities,
|
37
|
+
enve.recplot2.coordinates, enve.recplot2.seqdepth, enve.recplot2.ANIr,
|
34
38
|
enve.growthcurve, plot.enve.GrowthCurve, summary.enve.GrowthCurve,
|
35
39
|
enve.col2alpha)
|
@@ -9,7 +9,7 @@ setClass("enve.RecPlot2",
|
|
9
9
|
id.counts='numeric', ##<< Counts per ID bin.
|
10
10
|
id.breaks='numeric', ##<< Breaks of identity bins.
|
11
11
|
pos.breaks='numeric', ##<< Breaks of position bins.
|
12
|
-
seq.breaks='numeric',
|
12
|
+
seq.breaks='numeric', ##<< Breaks of input sequences.
|
13
13
|
peaks='list', ##<< Peaks identified in the recplot.
|
14
14
|
### Limits of the subject sequences after concatenation.
|
15
15
|
seq.names='character', ##<< Names of the subject sequences.
|
@@ -45,9 +45,13 @@ setClass("enve.RecPlot2.Peak",
|
|
45
45
|
### number of position bins with non-zero sequencing depth in the recruitment
|
46
46
|
### plot (regardless of peak count).
|
47
47
|
err.res='numeric',
|
48
|
-
### Error left after adding the peak.
|
49
|
-
merge.logdist='numeric'
|
48
|
+
### Error left after adding the peak (mower) or log-likelihood (em or emauto).
|
49
|
+
merge.logdist='numeric',
|
50
50
|
### Attempted `merge.logdist` parameter.
|
51
|
+
seq.depth='numeric',
|
52
|
+
### Best estimate available for the sequencing depth of the peak (centrality).
|
53
|
+
log='logical'
|
54
|
+
### Indicates if the estimation was performed in natural logarithm space
|
51
55
|
));
|
52
56
|
setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
|
53
57
|
setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
|
@@ -83,6 +87,9 @@ plot.enve.RecPlot2 <- function
|
|
83
87
|
peaks.col='darkred',
|
84
88
|
### If not NA, it attempts to represent peaks in the population histogram
|
85
89
|
### in the specified color. Set to NA to avoid peak-finding.
|
90
|
+
use.peaks,
|
91
|
+
### A list of `enve.RecPlot2.Peak` objects, as returned by
|
92
|
+
### `enve.recplot2.findPeaks`. If passed, `peaks.opts` is ignored.
|
86
93
|
id.lim=range(x$id.breaks),
|
87
94
|
### Limits of identities to represent.
|
88
95
|
pos.lim=range(x$pos.breaks),
|
@@ -98,15 +105,17 @@ plot.enve.RecPlot2 <- function
|
|
98
105
|
### the number of the panel as index (see `layout`).
|
99
106
|
pos.splines=0,
|
100
107
|
### Smoothing parameter for the splines in the position histogram. Zero
|
101
|
-
### (0) for no splines.
|
108
|
+
### (0) for no splines. Use NULL to automatically detect by leave-one-out
|
109
|
+
### cross-validation.
|
102
110
|
id.splines=1/2,
|
103
111
|
### Smoothing parameter for the splines in the identity histogram. Zero
|
104
|
-
### (0) for no splines.
|
105
|
-
|
112
|
+
### (0) for no splines. Use NULL to automatically detect by leave-one-out
|
113
|
+
### cross-validation.
|
114
|
+
in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
106
115
|
### Line width for the sequencing depth of in-group matches.
|
107
|
-
out.lwd=ifelse(pos.splines>0, 1/2, 2),
|
116
|
+
out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
108
117
|
### Line width for the sequencing depth of out-group matches.
|
109
|
-
id.lwd=ifelse(id.splines>0, 1/2, 2),
|
118
|
+
id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
|
110
119
|
### Line width for the identity histogram.
|
111
120
|
in.col='darkblue',
|
112
121
|
### Color associated to in-group matches.
|
@@ -193,7 +202,7 @@ plot.enve.RecPlot2 <- function
|
|
193
202
|
pos.f <- rep(seqdepth.in,each=2)
|
194
203
|
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
195
204
|
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
196
|
-
if(pos.splines > 0){
|
205
|
+
if(is.null(pos.splines) || pos.splines > 0){
|
197
206
|
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
198
207
|
spar=pos.splines)
|
199
208
|
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
@@ -230,7 +239,7 @@ plot.enve.RecPlot2 <- function
|
|
230
239
|
id.f <- rep(id.counts,each=2)
|
231
240
|
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
232
241
|
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
233
|
-
if(id.splines > 0){
|
242
|
+
if(is.null(id.splines) || id.splines > 0){
|
234
243
|
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
235
244
|
spar=id.splines)
|
236
245
|
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
@@ -267,8 +276,12 @@ plot.enve.RecPlot2 <- function
|
|
267
276
|
polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
|
268
277
|
y.tmp.in, border=NA, col=in.col)
|
269
278
|
if(!is.na(peaks.col)){
|
270
|
-
o
|
271
|
-
|
279
|
+
o <- peaks.opts; o$x = x;
|
280
|
+
if(missing(use.peaks)){
|
281
|
+
peaks <- do.call(enve.recplot2.findPeaks, o)
|
282
|
+
}else{
|
283
|
+
peaks <- use.peaks
|
284
|
+
}
|
272
285
|
h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
|
273
286
|
if(!is.null(peaks) & length(peaks)>0){
|
274
287
|
pf <- h.mids*0;
|
@@ -276,18 +289,23 @@ plot.enve.RecPlot2 <- function
|
|
276
289
|
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
277
290
|
lines(cnt, h.mids, col='red');
|
278
291
|
pf <- pf+cnt;
|
279
|
-
axis(4, at=peaks[[i]]$
|
280
|
-
letters[i], las=1, hadj=1/2)
|
292
|
+
axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
|
281
293
|
}
|
282
294
|
lines(pf, h.mids, col='red',lwd=1.5);
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
295
|
+
dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
|
296
|
+
frx <- signif(100*as.numeric(
|
297
|
+
lapply(peaks,
|
298
|
+
function(x) ifelse(length(x$values)==0, x$n.hat,
|
299
|
+
length(x$values))/x$n.total)), 2)
|
300
|
+
if(peaks[[1]]$err.res < 0){
|
301
|
+
err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
|
302
|
+
}else{
|
303
|
+
err <- paste(', err:',
|
304
|
+
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
|
305
|
+
}
|
306
|
+
legend('topright', bty='n', cex=1/2,
|
307
|
+
legend=paste(letters[1:length(peaks)],'. ',
|
308
|
+
dpt,'X (', frx, '%', err, ')', sep=''))
|
291
309
|
}
|
292
310
|
}
|
293
311
|
}
|
@@ -334,6 +352,10 @@ enve.recplot2 <- function(
|
|
334
352
|
id.breaks=300,
|
335
353
|
### Breaks in the identity histogram. It can also be a vector of break
|
336
354
|
### points, and values outside the range are ignored.
|
355
|
+
id.free.range=FALSE,
|
356
|
+
### Indicates that the range should be freely set from the observed
|
357
|
+
### values. Otherwise, 70-100% is included in the identity histogram
|
358
|
+
### (default).
|
337
359
|
id.metric=c('identity', 'corrected identity', 'bit score'),
|
338
360
|
### Metric of identity to be used (Y-axis). Corrected identity is only
|
339
361
|
### supported if the original BLAST file included sequence lengths.
|
@@ -373,12 +395,14 @@ enve.recplot2 <- function(
|
|
373
395
|
if(pos.breaks>0){
|
374
396
|
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
|
375
397
|
}else{
|
376
|
-
pos.breaks <- c(lim[,2],
|
398
|
+
pos.breaks <- c(lim[1,2], lim[,3])
|
377
399
|
}
|
378
400
|
}
|
379
401
|
if(length(id.breaks)==1){
|
380
|
-
id.
|
381
|
-
|
402
|
+
id.range.v <- rec[,rec.idcol]
|
403
|
+
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
404
|
+
id.range.v <- range(id.range.v)
|
405
|
+
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
|
382
406
|
}
|
383
407
|
|
384
408
|
# Run in parallel
|
@@ -423,8 +447,159 @@ enve.recplot2 <- function(
|
|
423
447
|
}
|
424
448
|
|
425
449
|
enve.recplot2.findPeaks <- function(
|
450
|
+
### Identifies peaks in the population histogram potentially indicating
|
451
|
+
### sub-population mixtures
|
452
|
+
x,
|
453
|
+
### An `enve.RecPlot2` object.
|
454
|
+
method="emauto",
|
455
|
+
### Peak-finder method. This should be one of:
|
456
|
+
### "emauto" (Expectation-Maximization with auto-selection of components),
|
457
|
+
### "em" (Expectation-Maximization),
|
458
|
+
### "mower" (Custom distribution-mowing method).
|
459
|
+
...
|
460
|
+
### Any additional parameters supported by
|
461
|
+
### `enve.recplot2.findPeaks.<method>`.
|
462
|
+
){
|
463
|
+
if(method == "emauto"){
|
464
|
+
peaks <- enve.recplot2.findPeaks.emauto(x, ...)
|
465
|
+
}else if(method == "em"){
|
466
|
+
peaks <- enve.recplot2.findPeaks.em(x, ...)
|
467
|
+
}else if(method == "mower"){
|
468
|
+
peaks <- enve.recplot2.findPeaks.mower(x, ...)
|
469
|
+
}else{
|
470
|
+
stop("Invalid peak-finder method ", method)
|
471
|
+
}
|
472
|
+
return(peaks)
|
473
|
+
### Returns a list of `enve.RecPlot2.Peak` objects.
|
474
|
+
}
|
475
|
+
|
476
|
+
enve.recplot2.findPeaks.emauto <- function(
|
477
|
+
### Identifies peaks in the population histogram using a Gaussian Mixture
|
478
|
+
### Model Expectation Maximization (GMM-EM) method with number of components
|
479
|
+
### automatically detected.
|
480
|
+
x,
|
481
|
+
### An `enve.RecPlot2` object.
|
482
|
+
components=seq(1,10),
|
483
|
+
### A vector of number of components to evaluate.
|
484
|
+
criterion='aic',
|
485
|
+
### Criterion to use for components selection. Must be one of:
|
486
|
+
### 'aic' (Akaike Information Criterion),
|
487
|
+
### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
|
488
|
+
merge.tol=2L,
|
489
|
+
### When attempting to merge peaks with very similar sequencing depth, use
|
490
|
+
### this number of significant digits (in log-scale).
|
491
|
+
verbose=FALSE,
|
492
|
+
### Display (mostly debugging) information.
|
493
|
+
...
|
494
|
+
### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
|
495
|
+
){
|
496
|
+
best <- list(crit=0, pstore=list())
|
497
|
+
if(criterion == 'aic'){
|
498
|
+
do_crit <- function(ll, k, n) 2*k - 2*ll
|
499
|
+
}else if(criterion %in% c('bic', 'sbc')){
|
500
|
+
do_crit <- function(ll, k, n) log(n)*k - 2*ll
|
501
|
+
}else{
|
502
|
+
stop('Invalid criterion ', criterion)
|
503
|
+
}
|
504
|
+
for(comp in components){
|
505
|
+
best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
|
506
|
+
verbose, ...)
|
507
|
+
}
|
508
|
+
|
509
|
+
seqdepths.r <- signif(log(sapply(best[['peaks']],
|
510
|
+
function(x) x$seq.depth)), merge.tol)
|
511
|
+
distinct <- length(unique(seqdepths.r))
|
512
|
+
if(distinct < length(best[['peaks']])){
|
513
|
+
if(verbose) cat('Attempting merge to', distinct, 'components\n')
|
514
|
+
init <- apply(sapply(best[['peaks']],
|
515
|
+
function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
|
516
|
+
init <- init[!duplicated(seqdepths.r),]
|
517
|
+
init <- list(mu=init[,'mean'], sd=init[,'sd'],
|
518
|
+
alpha=init[,'alpha']/sum(init[,'alpha']))
|
519
|
+
best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
|
520
|
+
verbose, ...)
|
521
|
+
}
|
522
|
+
return(best[['peaks']])
|
523
|
+
### Returns a list of `enve.RecPlot2.Peak` objects.
|
524
|
+
}
|
525
|
+
|
526
|
+
enve.recplot2.findPeaks.em <- function(
|
527
|
+
### Identifies peaks in the population histogram using a Gaussian Mixture
|
528
|
+
### Model Expectation Maximization (GMM-EM) method.
|
529
|
+
x,
|
530
|
+
### An `enve.RecPlot2` object.
|
531
|
+
max.iter=1000,
|
532
|
+
### Maximum number of EM iterations.
|
533
|
+
ll.diff.res=1e-8,
|
534
|
+
### Maximum Log-Likelihood difference to be considered as convergent.
|
535
|
+
components=2,
|
536
|
+
### Number of distributions assumed in the mixture.
|
537
|
+
rm.top=0.05,
|
538
|
+
### Top-values to remove before finding peaks, as a quantile probability.
|
539
|
+
### This step is useful to remove highly conserved regions, but can be
|
540
|
+
### turned off by setting rm.top=0. The quantile is determined *after*
|
541
|
+
### removing zero-coverage windows.
|
542
|
+
verbose=FALSE,
|
543
|
+
### Display (mostly debugging) information.
|
544
|
+
init,
|
545
|
+
### Initialization parameters. By default, these are derived from k-means
|
546
|
+
### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
|
547
|
+
### of length `components`.
|
548
|
+
log=TRUE
|
549
|
+
### Logical value indicating if the estimations should be performed in
|
550
|
+
### natural logarithm units. Do not change unless you know what you're
|
551
|
+
### doing.
|
552
|
+
){
|
553
|
+
|
554
|
+
# Essential vars
|
555
|
+
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
|
556
|
+
lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
|
557
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
558
|
+
if(log) lsd1 <- log(lsd1)
|
559
|
+
|
560
|
+
# 1. Initialize
|
561
|
+
if(missing(init)){
|
562
|
+
km.clust <- kmeans(lsd1, components)$cluster
|
563
|
+
init <- list(
|
564
|
+
mu = tapply(lsd1, km.clust, mean),
|
565
|
+
sd = tapply(lsd1, km.clust, sd),
|
566
|
+
alpha = table(km.clust)/length(km.clust)
|
567
|
+
)
|
568
|
+
}
|
569
|
+
m.step <- init
|
570
|
+
ll <- c()
|
571
|
+
cur.ll <- -Inf
|
572
|
+
|
573
|
+
for(i in 1:max.iter){
|
574
|
+
# 2/3. EM
|
575
|
+
e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
|
576
|
+
m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
|
577
|
+
# 4. Convergence
|
578
|
+
ll <- c(ll, e.step[["ll"]])
|
579
|
+
ll.diff <- abs(cur.ll - e.step[["ll"]])
|
580
|
+
cur.ll <- e.step[["ll"]]
|
581
|
+
if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
|
582
|
+
if(ll.diff <= ll.diff.res) break
|
583
|
+
}
|
584
|
+
|
585
|
+
# Return
|
586
|
+
peaks <- list()
|
587
|
+
for(i in 1:components){
|
588
|
+
n.hat <- m.step[['alpha']][i]*length(lsd1)
|
589
|
+
peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
|
590
|
+
values.res=0, mode=m.step[['mu']][i],
|
591
|
+
param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
|
592
|
+
n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
|
593
|
+
merge.logdist=as.numeric(), log=log,
|
594
|
+
seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
|
595
|
+
}
|
596
|
+
return(peaks)
|
597
|
+
### Returns a list of `enve.RecPlot2.Peak` objects.
|
598
|
+
}
|
599
|
+
|
600
|
+
enve.recplot2.findPeaks.mower <- function(
|
426
601
|
### Identifies peaks in the population histogram potentially indicating
|
427
|
-
### sub-population mixtures.
|
602
|
+
### sub-population mixtures, using a custom distribution-mowing method.
|
428
603
|
x,
|
429
604
|
### An `enve.RecPlot2` object.
|
430
605
|
min.points=10,
|
@@ -436,13 +611,12 @@ enve.recplot2.findPeaks <- function(
|
|
436
611
|
mlv.opts=list(method='parzen'),
|
437
612
|
### Options passed to `mlv` to estimate the mode.
|
438
613
|
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
439
|
-
start=list(omega=1, alpha=-1), lower=c(
|
440
|
-
upper=c(Inf, 0, Inf)),
|
614
|
+
start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
|
441
615
|
### Options passed to `fitdist` to estimate the standard deviation if
|
442
616
|
### with.skewness=TRUE. Note that the `start` parameter will be ammended
|
443
617
|
### with xi=estimated mode for each peak.
|
444
|
-
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(.4
|
445
|
-
start=list(sd=1), lower=c(
|
618
|
+
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
|
619
|
+
start=list(sd=1), lower=c(0, -Inf)),
|
446
620
|
### Options passed to `fitdist` to estimate the standard deviation if
|
447
621
|
### with.skewness=FALSE. Note that the `start` parameter will be ammended
|
448
622
|
### with mean=estimated mode for each peak.
|
@@ -460,7 +634,7 @@ enve.recplot2.findPeaks <- function(
|
|
460
634
|
### "tail distribution".
|
461
635
|
optim.rounds=200,
|
462
636
|
### Maximum rounds of peak optimization.
|
463
|
-
optim.epsilon=1e-
|
637
|
+
optim.epsilon=1e-4,
|
464
638
|
### Trace change at which optimization stops (unless `optim.rounds` is
|
465
639
|
### reached first). The trace change is estimated as the sum of square
|
466
640
|
### differences between parameters in one round and those from two rounds
|
@@ -469,8 +643,12 @@ enve.recplot2.findPeaks <- function(
|
|
469
643
|
### Maximum value of |log-ratio| between centrality parameters in peaks to
|
470
644
|
### attempt merging. The default of ~0.22 corresponds to a maximum
|
471
645
|
### difference of 25%.
|
472
|
-
verbose=FALSE
|
646
|
+
verbose=FALSE,
|
473
647
|
### Display (mostly debugging) information.
|
648
|
+
log=TRUE
|
649
|
+
### Logical value indicating if the estimations should be performed in
|
650
|
+
### natural logarithm units. Do not change unless you know what you're
|
651
|
+
### doing.
|
474
652
|
){
|
475
653
|
|
476
654
|
# Essential vars
|
@@ -478,6 +656,7 @@ enve.recplot2.findPeaks <- function(
|
|
478
656
|
seqdepth.in <- x$pos.counts.in/pos.binsize;
|
479
657
|
lsd1 <- seqdepth.in[seqdepth.in>0];
|
480
658
|
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
659
|
+
if(log) lsd1 <- log(lsd1)
|
481
660
|
if(with.skewness){
|
482
661
|
fitdist.opts <- fitdist.opts.sn
|
483
662
|
}else{
|
@@ -486,11 +665,11 @@ enve.recplot2.findPeaks <- function(
|
|
486
665
|
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
487
666
|
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
488
667
|
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
489
|
-
n.total=length(lsd1), merge.logdist=merge.logdist)
|
668
|
+
n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
|
490
669
|
|
491
670
|
# Find seed peaks
|
492
671
|
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
493
|
-
peaks <- enve.recplot2.
|
672
|
+
peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
|
494
673
|
|
495
674
|
# Merge overlapping peaks
|
496
675
|
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
@@ -511,7 +690,7 @@ enve.recplot2.findPeaks <- function(
|
|
511
690
|
p$param.hat[[ length(p$param.hat) ]],'&',
|
512
691
|
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
513
692
|
peaks.opts$lsd1 <- c(p$values, p2$values)
|
514
|
-
p.new <- enve.recplot2.
|
693
|
+
p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
|
515
694
|
if(length(p.new)==1){
|
516
695
|
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
517
696
|
ignore <- c(ignore, j)
|
@@ -542,16 +721,19 @@ enve.recplot2.corePeak <- function
|
|
542
721
|
function(y) y$param.hat[[ length(y$param.hat) ]])))
|
543
722
|
]]
|
544
723
|
# If a "larger" peak (a peak explaining more bins of the genome) is within
|
545
|
-
# the "merge.logdist" distance, take that one instead.
|
724
|
+
# the default "merge.logdist" distance, take that one instead.
|
546
725
|
corePeak <- maxPeak
|
547
726
|
for(p in x){
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
727
|
+
p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
|
728
|
+
corePeak.len <- ifelse(
|
729
|
+
length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
|
730
|
+
sz.d <- log(p.len/corePeak.len)
|
731
|
+
if(is.nan(sz.d) || sz.d < 0) next
|
732
|
+
sq.d.a <- as.numeric(tail(p$param.hat, n=1))
|
733
|
+
sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
|
734
|
+
if(p$log) sq.d.a <- exp(sq.d.a)
|
735
|
+
if(corePeak$log) sq.d.b <- exp(sq.d.b)
|
736
|
+
if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
|
555
737
|
}
|
556
738
|
return(corePeak)
|
557
739
|
}
|
@@ -580,43 +762,204 @@ enve.recplot2.extractWindows <- function
|
|
580
762
|
### Extract windows significantly below (or above) the peak in sequencing
|
581
763
|
### depth.
|
582
764
|
(rp,
|
583
|
-
### Recruitment plot, a enve.
|
765
|
+
### Recruitment plot, a enve.RecPlot2 object.
|
584
766
|
peak,
|
585
|
-
### Peak,
|
586
|
-
### of enve.RecPlot2.Peak objects, in which case the core peak is
|
587
|
-
### (see enve.recplot2.corePeak).
|
767
|
+
### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
|
768
|
+
### list of enve.RecPlot2.Peak objects, in which case the core peak is
|
769
|
+
### used (see `enve.recplot2.corePeak`).
|
588
770
|
lower.tail=TRUE,
|
589
771
|
### If FALSE, it returns windows significantly above the peak in
|
590
772
|
### sequencing depth.
|
591
773
|
significance=0.05,
|
592
774
|
### Significance threshold (alpha) to select windows.
|
593
775
|
seq.names=FALSE
|
594
|
-
### Returns subject sequence names instead of a vector of Booleans.
|
595
|
-
###
|
776
|
+
### Returns subject sequence names instead of a vector of Booleans. If
|
777
|
+
### the recruitment plot was generated with pos.breaks=0 it returns a
|
778
|
+
### vector of characters (the sequence identifiers), otherwise it returns
|
779
|
+
### a data.frame with a name column and two columns of coordinates.
|
596
780
|
){
|
597
781
|
# Determine the threshold
|
598
782
|
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
599
783
|
par <- peak$param.hat
|
600
784
|
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
601
785
|
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
786
|
+
if(peak$log) thr <- exp(thr)
|
602
787
|
|
603
|
-
# Estimate sequencing depths per window
|
604
|
-
pos.cnts.in <- rp$pos.counts.in
|
605
|
-
pos.breaks <- rp$pos.breaks
|
606
|
-
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
607
|
-
seqdepth.in <- pos.cnts.in/pos.binsize
|
608
|
-
|
609
788
|
# Select windows past the threshold
|
789
|
+
seqdepth.in <- enve.recplot2.seqdepth(rp)
|
610
790
|
if(lower.tail){
|
611
791
|
sel <- seqdepth.in < thr
|
612
792
|
}else{
|
613
793
|
sel <- seqdepth.in > thr
|
614
794
|
}
|
795
|
+
|
796
|
+
# seq.names=FALSE
|
615
797
|
if(!seq.names) return(sel)
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
798
|
+
# seq.names=TRUE and pos.breaks=0
|
799
|
+
if(length(rp$pos.breaks)==length(rp$seq.breaks) &&
|
800
|
+
rp$pos.breaks==rp$seq.breaks)
|
801
|
+
return(rp$seq.names[sel])
|
802
|
+
# seq.names=TRUE and pos.breaks!=0
|
803
|
+
return(enve.recplot2.coordinates(rp,sel))
|
804
|
+
### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
|
805
|
+
### it returns a vector of characters if the object was built with
|
806
|
+
### `pos.breaks=0` or a data.frame with four columns otherwise: name.from,
|
807
|
+
### name.to, pos.from, and pos.to (see `enve.recplot2.coordinates`).
|
808
|
+
}
|
809
|
+
|
810
|
+
enve.recplot2.compareIdentities <- function
|
811
|
+
### Compare the distribution of identities between two enve.RecPlot2 objects.
|
812
|
+
(x,
|
813
|
+
### First enve.RecPlot2 object.
|
814
|
+
y,
|
815
|
+
### Second enve.RecPlot2 object.
|
816
|
+
method="hellinger",
|
817
|
+
### Distance method to use. This should be (an unambiguous abbreviation of)
|
818
|
+
### one of:
|
819
|
+
### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
|
820
|
+
### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
|
821
|
+
### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
|
822
|
+
### doi:10.1214/aoms/1177729694), or "euclidean".
|
823
|
+
smooth.par=NULL,
|
824
|
+
### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
|
825
|
+
### Use NULL to automatically determine this value using leave-one-out
|
826
|
+
### cross-validation (see `smooth.spline` parameter `spar`).
|
827
|
+
pseudocounts=0,
|
828
|
+
### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
|
829
|
+
### 1 for add-one smoothing.
|
830
|
+
max.deviation=0.75
|
831
|
+
### Maximum mean deviation between identity breaks tolerated (as percent
|
832
|
+
### identity). Difference in number of id.breaks is never tolerated.
|
833
|
+
){
|
834
|
+
METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
|
835
|
+
i.meth <- pmatch(method, METHODS)
|
836
|
+
if (is.na(i.meth)) stop("Invalid distance ", method)
|
837
|
+
if(!inherits(x, "enve.RecPlot2"))
|
838
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
839
|
+
if(!inherits(y, "enve.RecPlot2"))
|
840
|
+
stop("'y' must inherit from class `enve.RecPlot2`")
|
841
|
+
if(length(x$id.breaks) != length(y$id.breaks))
|
842
|
+
stop("'x' and 'y' must have the same number of `id.breaks`")
|
843
|
+
dev <- mean(abs(x$id.breaks - y$id.breaks))
|
844
|
+
if(dev > max.deviation)
|
845
|
+
stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
|
846
|
+
dev)
|
847
|
+
x.cnt <- x$id.counts
|
848
|
+
y.cnt <- y$id.counts
|
849
|
+
if(is.null(smooth.par) || smooth.par > 0){
|
850
|
+
x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
|
851
|
+
y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
|
852
|
+
p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
|
853
|
+
q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
|
854
|
+
x.cnt <- pmax(p.spline$y, 0)
|
855
|
+
y.cnt <- pmax(q.spline$y, 0)
|
856
|
+
}
|
857
|
+
a <- as.numeric(pseudocounts)
|
858
|
+
p <- (x.cnt + a) / sum(x.cnt + a)
|
859
|
+
q <- (y.cnt + a) / sum(y.cnt + a)
|
860
|
+
d <- NA
|
861
|
+
if(i.meth %in% c(1L, 2L)){
|
862
|
+
d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
|
863
|
+
if(i.meth==2L) d <- 1 - d**2
|
864
|
+
}else if(i.meth %in% c(3L, 4L)){
|
865
|
+
sel <- p>0
|
866
|
+
if(any(q[sel]==0))
|
867
|
+
stop("Undefined distance without absolute continuity, use pseudocounts")
|
868
|
+
d <- -sum(p[sel]*log(q[sel]/p[sel]))
|
869
|
+
}else if(i.meth == 5L){
|
870
|
+
d <- sqrt(sum((q-p)**2))
|
871
|
+
}
|
872
|
+
return(d)
|
873
|
+
}
|
874
|
+
|
875
|
+
enve.recplot2.coordinates <- function
|
876
|
+
### Returns the sequence name and coordinates of the requested position bins.
|
877
|
+
(x,
|
878
|
+
### `enve.RecPlot2` object.
|
879
|
+
bins
|
880
|
+
### Vector of selected bins to return. It can be a vector of logical values
|
881
|
+
### with the same length as `x$pos.breaks`-1 or a vector of integers. If
|
882
|
+
### missing, returns the coordinates of all windows.
|
883
|
+
){
|
884
|
+
if(!inherits(x, "enve.RecPlot2"))
|
885
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
886
|
+
if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
|
887
|
+
if(!is.vector(bins)) stop("'bins' must be a vector")
|
888
|
+
if(inherits(bins, "logical")) bins <- which(bins)
|
889
|
+
|
890
|
+
y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
|
891
|
+
|
892
|
+
for(i in 1:length(bins)){
|
893
|
+
j <- bins[i]
|
894
|
+
# Concatenated coordinates
|
895
|
+
cc <- x$pos.breaks[c(j, j+1)]
|
896
|
+
# Find the corresponding `seq.breaks`
|
897
|
+
sb.from <- which(
|
898
|
+
cc[1] >=x$seq.breaks[-length(x$seq.breaks)] &
|
899
|
+
cc[1] < x$seq.breaks[-1])
|
900
|
+
sb.to <- which(
|
901
|
+
cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
|
902
|
+
cc[2] <=x$seq.breaks[-1])
|
903
|
+
# Translate coordinates
|
904
|
+
if(length(sb.from)==1 & length(sb.to)==1){
|
905
|
+
y[i, 'name.from'] <- x$seq.names[sb.from]
|
906
|
+
y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
|
907
|
+
y[i, 'name.to'] <- x$seq.names[sb.to]
|
908
|
+
y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
|
909
|
+
}
|
910
|
+
}
|
911
|
+
|
912
|
+
return(y)
|
913
|
+
### Returns a data.frame with four columns: name.from (character), pos.from
|
914
|
+
### (numeric) name.to (character), and pos.to (numeric). The first two
|
915
|
+
### correspond to sequence and position of the start point of the bin, the
|
916
|
+
### last two correspond to the sequence and position of the end point of the
|
917
|
+
### bin.
|
918
|
+
}
|
919
|
+
|
920
|
+
enve.recplot2.seqdepth <- function
|
921
|
+
### Calculate the sequencing depth of the given window(s)
|
922
|
+
(x,
|
923
|
+
### `enve.RecPlot2` object.
|
924
|
+
sel,
|
925
|
+
### Window(s) for which the sequencing depth is to be calculated. If not
|
926
|
+
### passed, it returns the sequencing depth of all windows
|
927
|
+
low.identity=FALSE
|
928
|
+
### A logical indicating if the sequencing depth is to be estimated only
|
929
|
+
### with low-identity matches. By default, only high-identity matches are
|
930
|
+
### used.
|
931
|
+
){
|
932
|
+
if(!inherits(x, "enve.RecPlot2"))
|
933
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
934
|
+
pos.cnts.in <- x$pos.counts.in
|
935
|
+
pos.breaks <- x$pos.breaks
|
936
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
937
|
+
seqdepth.in <- pos.cnts.in/pos.binsize
|
938
|
+
if(missing(sel)) return(seqdepth.in)
|
939
|
+
return(seqdepth.in[sel])
|
940
|
+
### Returns a numeric vector of sequencing depths (in bp/bp).
|
941
|
+
}
|
942
|
+
|
943
|
+
enve.recplot2.ANIr <- function
|
944
|
+
### Estimate the Average Nucleotide Identity from reads (ANIr) from a
|
945
|
+
### recruitment plot
|
946
|
+
(x,
|
947
|
+
### `enve.RecPlot2` object.
|
948
|
+
range=c(0,Inf)
|
949
|
+
### Range of identities to be considered. By default, the full range
|
950
|
+
### is used (note that the upper boundary is `Inf` and not 100 because
|
951
|
+
### recruitment plots can also be built with bit-scores). To use only
|
952
|
+
### intra-population matches (with identities), use c(95,100). To use only
|
953
|
+
### inter-population values, use c(0,95).
|
954
|
+
){
|
955
|
+
if(!inherits(x, "enve.RecPlot2"))
|
956
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
957
|
+
id.b <- x$id.breaks
|
958
|
+
id <- (id.b[-1]+id.b[-length(id.b)])/2
|
959
|
+
cnt <- x$id.counts
|
960
|
+
cnt[id < range[1]] <- 0
|
961
|
+
cnt[id > range[2]] <- 0
|
962
|
+
return(sum(id*cnt/sum(cnt)))
|
620
963
|
}
|
621
964
|
|
622
965
|
#==============> Define internal functions
|
@@ -640,21 +983,67 @@ enve.recplot2.__counts <- function
|
|
640
983
|
return(counts);
|
641
984
|
}
|
642
985
|
|
986
|
+
enve.recplot2.findPeaks.__emauto_one <- function
|
987
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.emauto).
|
988
|
+
(x, comp, do_crit, best, verbose, ...){
|
989
|
+
peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
|
990
|
+
k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
|
991
|
+
crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
|
992
|
+
if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
|
993
|
+
ifelse(crit > best[['crit']], '*', ''), '\n')
|
994
|
+
if(crit > best[['crit']]){
|
995
|
+
best[['crit']] <- crit
|
996
|
+
best[['peaks']] <- peaks
|
997
|
+
}
|
998
|
+
best[['pstore']][[comp]] <- peaks
|
999
|
+
return(best)
|
1000
|
+
}
|
1001
|
+
enve.recplot2.findPeaks.__em_e <- function
|
1002
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.em`).
|
1003
|
+
(x, theta){
|
1004
|
+
components <- length(theta[['mu']])
|
1005
|
+
product <- do.call(cbind,
|
1006
|
+
lapply(1:components,
|
1007
|
+
function(i) dnorm(x, theta[['mu']][i],
|
1008
|
+
theta[['sd']][i])*theta[['alpha']][i]))
|
1009
|
+
sum.of.components <- rowSums(product)
|
1010
|
+
posterior <- product / sum.of.components
|
1011
|
+
|
1012
|
+
return(list(ll=sum(log(sum.of.components)), posterior=posterior))
|
1013
|
+
}
|
1014
|
+
|
1015
|
+
enve.recplot2.findPeaks.__em_m <- function
|
1016
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.em`
|
1017
|
+
(x, posterior){
|
1018
|
+
components <- ncol(posterior)
|
1019
|
+
n <- colSums(posterior)
|
1020
|
+
mu <- colSums(posterior * x) / n
|
1021
|
+
sd <- sqrt( colSums(
|
1022
|
+
posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
|
1023
|
+
alpha <- n/length(x)
|
1024
|
+
return(list(mu=mu, sd=sd, alpha=alpha))
|
1025
|
+
}
|
1026
|
+
|
643
1027
|
enve.recplot2.__peakHist <- function
|
644
1028
|
### Internal ancilliary function (see `enve.RecPlot2.Peak`).
|
645
1029
|
(x, mids, counts=TRUE){
|
646
1030
|
d.o <- x$param.hat
|
647
|
-
|
1031
|
+
if(length(x$log)==0) x$log <- FALSE
|
1032
|
+
if(x$log){
|
1033
|
+
d.o$x <- log(mids)
|
1034
|
+
}else{
|
1035
|
+
d.o$x <- mids
|
1036
|
+
}
|
648
1037
|
prob <- do.call(paste('d', x$dist, sep=''), d.o)
|
649
1038
|
if(!counts) return(prob)
|
650
1039
|
if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
|
651
1040
|
return(prob*x$n.hat/sum(prob))
|
652
1041
|
}
|
653
1042
|
|
654
|
-
enve.recplot2.
|
655
|
-
### Internall ancilliary function (see `enve.recplot2.findPeaks`).
|
1043
|
+
enve.recplot2.findPeaks.__mow_one <- function
|
1044
|
+
### Internall ancilliary function (see `enve.recplot2.findPeaks.mower`).
|
656
1045
|
(lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
|
657
|
-
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose
|
1046
|
+
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
|
658
1047
|
){
|
659
1048
|
dist <- ifelse(with.skewness, 'sn', 'norm');
|
660
1049
|
|
@@ -683,8 +1072,14 @@ enve.recplot2.__findPeak <- function
|
|
683
1072
|
if(round>1) param.hat <- last.hat;
|
684
1073
|
break;
|
685
1074
|
}
|
686
|
-
|
687
|
-
|
1075
|
+
if(round > 1){
|
1076
|
+
epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
|
1077
|
+
if(epsilon1 < optim.epsilon) break;
|
1078
|
+
if(round > 2){
|
1079
|
+
epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
|
1080
|
+
if(epsilon2 < optim.epsilon) break;
|
1081
|
+
}
|
1082
|
+
}
|
688
1083
|
}
|
689
1084
|
if(verbose) cat('\n')
|
690
1085
|
if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
|
@@ -695,14 +1090,14 @@ enve.recplot2.__findPeak <- function
|
|
695
1090
|
n.hat <- length(lsd1.pop)/diff(quant.est)
|
696
1091
|
peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
|
697
1092
|
param.hat=param.hat, n.hat=n.hat, n.total=n.total,
|
698
|
-
merge.logdist=merge.logdist)
|
1093
|
+
merge.logdist=merge.logdist, log=log)
|
699
1094
|
peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
|
700
1095
|
peak.cnt <- enve.recplot2.__peakHist(peak,
|
701
1096
|
(peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
|
702
1097
|
for(i in 2:length(peak.breaks)){
|
703
1098
|
values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
|
704
1099
|
n.exp <- peak.cnt[i-1]
|
705
|
-
if(n.exp==0) n.exp
|
1100
|
+
if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
|
706
1101
|
if(length(values)==0) next
|
707
1102
|
in.peak <- runif(length(values)) <= n.exp/length(values)
|
708
1103
|
lsd2 <- c(lsd2, values[!in.peak])
|
@@ -716,17 +1111,19 @@ enve.recplot2.__findPeak <- function
|
|
716
1111
|
attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
|
717
1112
|
plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
|
718
1113
|
plot=FALSE)$counts)+1)/2
|
1114
|
+
mu <- tail(param.hat, n=1)
|
1115
|
+
attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
|
719
1116
|
if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
|
720
1117
|
'with expected n =',n.hat,'\n')
|
721
1118
|
return(peak)
|
722
1119
|
}
|
723
1120
|
|
724
|
-
enve.recplot2.
|
725
|
-
### Internal ancilliary function (see `enve.recplot2.findPeaks`).
|
1121
|
+
enve.recplot2.findPeaks.__mower <- function
|
1122
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.mower`).
|
726
1123
|
(peaks.opts){
|
727
1124
|
peaks <- list()
|
728
1125
|
while(length(peaks.opts$lsd1) > peaks.opts$min.points){
|
729
|
-
peak <- do.call(enve.recplot2.
|
1126
|
+
peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
|
730
1127
|
if(is.null(peak)) break
|
731
1128
|
peaks[[ length(peaks)+1 ]] <- peak
|
732
1129
|
peaks.opts$lsd1 <- peak$values.res
|
@@ -738,7 +1135,10 @@ enve.recplot2.__findPeaks <- function
|
|
738
1135
|
enve.recplot2.__whichClosestPeak <- function
|
739
1136
|
### Internal ancilliary function (see `enve.recplot2.findPeaks`).
|
740
1137
|
(peak, peaks){
|
741
|
-
dist <- as.numeric(lapply(peaks,
|
1138
|
+
dist <- as.numeric(lapply(peaks,
|
1139
|
+
function(x)
|
1140
|
+
abs(log(x$param.hat[[ length(x$param.hat) ]] /
|
1141
|
+
peak$param.hat[[ length(peak$param.hat) ]] ))))
|
742
1142
|
dist[ dist==0 ] <- Inf
|
743
1143
|
return(which.min(dist))
|
744
1144
|
}
|