miga-base 0.3.1.7 → 0.3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +8 -0
- data/lib/miga/common.rb +9 -215
- data/lib/miga/common/base.rb +49 -0
- data/lib/miga/common/format.rb +135 -0
- data/lib/miga/common/path.rb +49 -0
- data/lib/miga/daemon.rb +3 -60
- data/lib/miga/daemon/base.rb +69 -0
- data/lib/miga/dataset.rb +3 -3
- data/lib/miga/dataset/result.rb +5 -5
- data/lib/miga/result.rb +5 -0
- data/lib/miga/version.rb +7 -5
- data/scripts/distances.bash +2 -19
- data/scripts/taxonomy.bash +2 -21
- data/test/common_test.rb +9 -0
- data/utils/distance/base.rb +6 -0
- data/utils/distance/commands.rb +82 -0
- data/utils/distance/database.rb +86 -0
- data/utils/distance/pipeline.rb +98 -0
- data/utils/distance/runner.rb +104 -0
- data/utils/distance/temporal.rb +37 -0
- data/utils/distances.rb +9 -0
- data/utils/enveomics/Docs/recplot2.md +233 -0
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
- data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
- data/utils/enveomics/Manifest/categories.json +11 -1
- data/utils/enveomics/Manifest/examples.json +2 -2
- data/utils/enveomics/README.md +2 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
- data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
- data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/SRA.download.bash +28 -21
- data/utils/enveomics/Scripts/Table.barplot.R +1 -0
- data/utils/enveomics/Scripts/aai.rb +4 -2
- data/utils/enveomics/build_enveomics_r.bash +5 -5
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
- data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
- data/utils/enveomics/enveomics.R/README.md +26 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
- data/utils/requirements.txt +1 -1
- metadata +28 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -1,43 +1,49 @@
|
|
1
1
|
#!/usr/bin/env perl
|
2
2
|
|
3
|
-
# Interpose sequences in FastA format from two files into one output file. If more than two files are
|
4
|
-
# provided, the script will interpose all the input files.
|
5
|
-
# Please note that this script will check for the consistency of the names (assuming a pair of related reads
|
6
|
-
# contains the same name varying only in a trailing slash (/) followed by a digit. If you want to turn this
|
7
|
-
# feature off just set the $eval_T variable to zero. If you want to decrease the sampling period (to speed
|
8
|
-
# the script up) or increase it (to make it more sensitive to errors) just change $eval_T accordingly.
|
9
|
-
#
|
10
3
|
# @author Luis M. Rodriguez-R
|
11
|
-
# @version 1.0
|
12
|
-
# @created Nov-27-2012
|
13
|
-
# @update Mar-23-2015
|
14
4
|
# @license artistic license 2.0
|
15
|
-
#
|
16
|
-
# Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
|
17
5
|
|
18
6
|
use strict;
|
19
7
|
use warnings;
|
20
8
|
use Symbol;
|
21
9
|
|
22
10
|
my $HELP = <<HELP
|
11
|
+
|
12
|
+
Description:
|
13
|
+
Interposes sequences in FastA format from two files into one output file.
|
14
|
+
If more than two files are provided, the script will interpose all the input
|
15
|
+
files.
|
16
|
+
Note that this script will check for the consistency of the names (assuming
|
17
|
+
a pair of related reads contains the same name varying only in a trailing
|
18
|
+
slash (/) followed by a digit. If you want to turn this feature off just
|
19
|
+
set the -T option to zero. If you want to decrease the sampling period (to
|
20
|
+
speed the script up) or increase it (to make it more sensitive to errors)
|
21
|
+
just change the -T option accordingly.
|
22
|
+
|
23
23
|
Usage:
|
24
|
-
|
24
|
+
$0 [-T <int> ]<output_fasta> <input_fasta_1> <input_fasta_2> [additional input files...]
|
25
25
|
|
26
26
|
Where,
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
-T <int> : Optional. Integer indicating the sampling period for
|
28
|
+
names evaluation (see Description above).
|
29
|
+
By default: 1000.
|
30
|
+
output_fasta : Output file
|
31
|
+
input_fasta_1 : First FastA file
|
32
|
+
input_fasta_2 : Second FastA file
|
33
|
+
... : Any additional FastA files (or none)
|
31
34
|
|
32
35
|
HELP
|
33
36
|
;
|
34
|
-
my $eval_T = 1000;
|
35
|
-
|
37
|
+
my $eval_T = 1000;
|
38
|
+
if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
|
39
|
+
$eval_T = $ARGV[1]+0;
|
40
|
+
shift @ARGV;
|
41
|
+
shift @ARGV;
|
42
|
+
}
|
36
43
|
my $out = shift @ARGV;
|
37
44
|
my @in = @ARGV;
|
38
45
|
$/ = "\n>";
|
39
46
|
|
40
|
-
|
41
47
|
die $HELP unless $out and $#in >= 1;
|
42
48
|
open OUT, ">", $out or die "Unable to write on $out: $!\n";
|
43
49
|
print "Output file: $out\n";
|
@@ -1,11 +1,7 @@
|
|
1
1
|
#!/usr/bin/env perl
|
2
|
-
|
2
|
+
|
3
3
|
# @author Luis M. Rodriguez-R
|
4
|
-
# @version 2.0
|
5
|
-
# @update: Mar-23-2015
|
6
4
|
# @license artistic license 2.0
|
7
|
-
#
|
8
|
-
# Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
|
9
5
|
|
10
6
|
use strict;
|
11
7
|
use warnings;
|
@@ -14,24 +10,27 @@ use Symbol;
|
|
14
10
|
my $HELP = <<HELP
|
15
11
|
|
16
12
|
Description:
|
17
|
-
Interposes sequences in FastQ format from two files into one output file.
|
18
|
-
provided, the script will interpose all the input
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
(
|
23
|
-
|
24
|
-
|
13
|
+
Interposes sequences in FastQ format from two files into one output file.
|
14
|
+
If more than two files are provided, the script will interpose all the input
|
15
|
+
files.
|
16
|
+
Note that this script will check for the consistency of the names (assuming
|
17
|
+
a pair of related reads contains the same name varying only in a trailing
|
18
|
+
slash (/) followed by a digit. If you want to turn this feature off just
|
19
|
+
set the -T option to zero. If you want to decrease the sampling period (to
|
20
|
+
speed the script up) or increase it (to make it more sensitive to errors)
|
21
|
+
just change the -T option accordingly.
|
22
|
+
|
25
23
|
Usage:
|
26
|
-
|
24
|
+
$0 [-T <int> ]<output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
|
27
25
|
|
28
26
|
Where,
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
27
|
+
-T <int> : Optional. Integer indicating the sampling period for
|
28
|
+
names evaluation (see Description above).
|
29
|
+
By default: 1000.
|
30
|
+
output_fastq : Output file
|
31
|
+
input_fastq_1 : First FastQ file
|
32
|
+
input_fastq_2 : Second FastQ file
|
33
|
+
... : Any additional FastQ files (or none)
|
35
34
|
|
36
35
|
HELP
|
37
36
|
;
|
@@ -44,6 +43,7 @@ if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
|
|
44
43
|
my $out = shift @ARGV;
|
45
44
|
my @in = @ARGV;
|
46
45
|
|
46
|
+
|
47
47
|
die $HELP unless $out and $#in >= 1;
|
48
48
|
open OUT, ">", $out or die "Unable to write on $out: $!\n";
|
49
49
|
print "Output file: $out\n";
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @update Jan-04-2016
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
#= Load stuff
|
9
|
+
args <- commandArgs(trailingOnly = F)
|
10
|
+
enveomics_R <- file.path(dirname(
|
11
|
+
sub("^--file=", "", args[grep("^--file=", args)])),
|
12
|
+
"lib", "enveomics.R")
|
13
|
+
library(methods)
|
14
|
+
source(file.path(enveomics_R, "R", "cliopts.R"))
|
15
|
+
source(file.path(enveomics_R, "R", "recplot2.R"))
|
16
|
+
|
17
|
+
#= Generate interface
|
18
|
+
opt <- enve.cliopts(enve.recplot2.compareIdentities,
|
19
|
+
file.path(enveomics_R, "man", "enve.recplot2.compareIdentities.Rd"),
|
20
|
+
positional_arguments=2,
|
21
|
+
usage="usage: %prog [options] recplot-A.Rdata recplot-B.Rdata",
|
22
|
+
number=c("pseudocounts", "max.deviation"), ignore=c("x", "y"),
|
23
|
+
p_desc="Calculates the difference between identity distributions of two recruitment plots.")
|
24
|
+
|
25
|
+
#= Run it!
|
26
|
+
load(opt$args[1])
|
27
|
+
opt$options[['x']] <- rp
|
28
|
+
load(opt$args[2])
|
29
|
+
opt$options[['y']] <- rp
|
30
|
+
dist <- do.call("enve.recplot2.compareIdentities", opt$options)
|
31
|
+
cat(dist, '\n')
|
32
|
+
|
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
#
|
4
4
|
# @author Luis M. Rodriguez-R
|
5
|
-
# @update Nov-27-2015
|
6
5
|
# @license artistic license 2.0
|
7
6
|
#
|
8
7
|
|
9
|
-
DATA_LINK="
|
8
|
+
DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
|
10
9
|
DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
|
11
10
|
SRX=$1
|
12
|
-
DIR=${
|
11
|
+
DIR=${2:-$SRX}
|
13
12
|
|
14
13
|
if [[ "$SRX" == "" ]] ; then
|
15
14
|
echo "
|
@@ -27,24 +26,32 @@ fi
|
|
27
26
|
|
28
27
|
[[ -d "$DIR" ]] || mkdir "$DIR"
|
29
28
|
|
29
|
+
function md5value {
|
30
|
+
local file=$1
|
31
|
+
o=$(md5 "$file" | perl -pe 's/.* //')
|
32
|
+
[[ -n $o ]] || o=$(md5sum-lite "$file" | awk '{print $1}')
|
33
|
+
[[ -n $o ]] || o=$(md5sum "$file" | awk '{print $1}')
|
34
|
+
echo "$o"
|
35
|
+
}
|
36
|
+
|
30
37
|
curl -s "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
|
31
38
|
tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
39
|
+
srr=$(echo "$ln"|cut -f 1)
|
40
|
+
ftp=$(echo "$ln"|cut -f 2)
|
41
|
+
md5=$(echo "$ln"|cut -f 3)
|
42
|
+
dir="$DIR/$srr"
|
43
|
+
[[ -d "$dir" ]] || mkdir "$dir"
|
44
|
+
echo "o $srr" >&2
|
45
|
+
for uri in $(echo "$ftp" | tr ";" " ") ; do
|
46
|
+
file="$dir/$(basename $uri)"
|
47
|
+
curl "$uri" -o "$file"
|
48
|
+
md5obs=$(md5value "$file")
|
49
|
+
if [[ "$md5" == "$md5obs"* ]] ; then
|
50
|
+
md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
|
51
|
+
else
|
52
|
+
echo "Corrupt file: $file" >&2
|
53
|
+
echo " MD5 mismatch: $md5obs not in $md5" >&2
|
54
|
+
exit 1;
|
55
|
+
fi
|
56
|
+
done
|
50
57
|
done
|
@@ -11,6 +11,7 @@ enveomics_R <- file.path(dirname(
|
|
11
11
|
sub("^--file=", "", args[grep("^--file=", args)])),
|
12
12
|
"lib", "enveomics.R")
|
13
13
|
source(file.path(enveomics_R, "R", "cliopts.R"))
|
14
|
+
source(file.path(enveomics_R, "R", "utils.R"))
|
14
15
|
source(file.path(enveomics_R, "R", "barplot.R"))
|
15
16
|
|
16
17
|
#= Generate interface
|
@@ -296,8 +296,10 @@ Dir.mktmpdir do |dir|
|
|
296
296
|
"#{dir}/#{i}.tab.uns"`
|
297
297
|
`sort -k 1 "#{dir}/#{i}.tab.uns" > "#{dir}/#{i}.tab"`
|
298
298
|
when "diamond"
|
299
|
-
`"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --
|
300
|
-
--
|
299
|
+
`"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
|
300
|
+
--query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" \
|
301
|
+
&& "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt 6 \
|
302
|
+
--out "#{dir}/#{i}.tab"`
|
301
303
|
else
|
302
304
|
abort "Unsupported program: #{o[:program]}."
|
303
305
|
end
|
@@ -34,11 +34,11 @@ library(inlinedocs)
|
|
34
34
|
package.skeleton.dx('./');
|
35
35
|
" | R --vanilla
|
36
36
|
cat man/enveomics.R-package.Rd | tr -d '\r' \
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
37
|
+
| grep -v '^}$' | grep -v '^\\author{' \
|
38
|
+
| grep -v '^Maintainer' \
|
39
|
+
| perl -pe 's/^\\keyword/}\n\\author{Luis M. Rodriguez-R <lmrodriguezr\@gmail.com> [aut, cre]}\n\n\\keyword/' \
|
40
|
+
| perl -lwe '$/=\0; $_=<>; s/^\\details{\n+([^}].*\n+)*}\n+//mg; print' \
|
41
|
+
> o && mv o man/enveomics.R-package.Rd
|
42
42
|
#[[ ! -d inst/doc ]] && mkdir -p inst/doc
|
43
43
|
#pandoc -o inst/doc/enveomics.R.pdf -f markdown_github README.md
|
44
44
|
|
@@ -10,7 +10,8 @@ importFrom("graphics", "abline", "axis", "barplot", "hist", "image",
|
|
10
10
|
"layout", "legend", "lines", "par", "points", "polygon",
|
11
11
|
"rect", "text")
|
12
12
|
importFrom("stats", "as.dist", "cophenetic", "cor", "median",
|
13
|
-
"quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm"
|
13
|
+
"quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm",
|
14
|
+
"dnorm", "kmeans")
|
14
15
|
importFrom("utils", "head", "read.table", "setTxtProgressBar", "tail",
|
15
16
|
"txtProgressBar")
|
16
17
|
exportClasses(enve.RecPlot2, enve.RecPlot2.Peak, enve.TRIBS, enve.TRIBStest,
|
@@ -29,7 +30,10 @@ export(
|
|
29
30
|
plot.enve.TRIBStest, summary.enve.TRIBStest,
|
30
31
|
enve.df2dist, enve.df2dist.group, enve.df2dist.list,
|
31
32
|
enve.recplot2, plot.enve.RecPlot2, enve.recplot2.findPeaks,
|
33
|
+
enve.recplot2.findPeaks.emauto, enve.recplot2.findPeaks.em,
|
34
|
+
enve.recplot2.findPeaks.mower,
|
32
35
|
enve.recplot2.corePeak, enve.recplot2.changeCutoff,
|
33
|
-
enve.recplot2.extractWindows,
|
36
|
+
enve.recplot2.extractWindows, enve.recplot2.compareIdentities,
|
37
|
+
enve.recplot2.coordinates, enve.recplot2.seqdepth, enve.recplot2.ANIr,
|
34
38
|
enve.growthcurve, plot.enve.GrowthCurve, summary.enve.GrowthCurve,
|
35
39
|
enve.col2alpha)
|
@@ -9,7 +9,7 @@ setClass("enve.RecPlot2",
|
|
9
9
|
id.counts='numeric', ##<< Counts per ID bin.
|
10
10
|
id.breaks='numeric', ##<< Breaks of identity bins.
|
11
11
|
pos.breaks='numeric', ##<< Breaks of position bins.
|
12
|
-
seq.breaks='numeric',
|
12
|
+
seq.breaks='numeric', ##<< Breaks of input sequences.
|
13
13
|
peaks='list', ##<< Peaks identified in the recplot.
|
14
14
|
### Limits of the subject sequences after concatenation.
|
15
15
|
seq.names='character', ##<< Names of the subject sequences.
|
@@ -45,9 +45,13 @@ setClass("enve.RecPlot2.Peak",
|
|
45
45
|
### number of position bins with non-zero sequencing depth in the recruitment
|
46
46
|
### plot (regardless of peak count).
|
47
47
|
err.res='numeric',
|
48
|
-
### Error left after adding the peak.
|
49
|
-
merge.logdist='numeric'
|
48
|
+
### Error left after adding the peak (mower) or log-likelihood (em or emauto).
|
49
|
+
merge.logdist='numeric',
|
50
50
|
### Attempted `merge.logdist` parameter.
|
51
|
+
seq.depth='numeric',
|
52
|
+
### Best estimate available for the sequencing depth of the peak (centrality).
|
53
|
+
log='logical'
|
54
|
+
### Indicates if the estimation was performed in natural logarithm space
|
51
55
|
));
|
52
56
|
setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
|
53
57
|
setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
|
@@ -83,6 +87,9 @@ plot.enve.RecPlot2 <- function
|
|
83
87
|
peaks.col='darkred',
|
84
88
|
### If not NA, it attempts to represent peaks in the population histogram
|
85
89
|
### in the specified color. Set to NA to avoid peak-finding.
|
90
|
+
use.peaks,
|
91
|
+
### A list of `enve.RecPlot2.Peak` objects, as returned by
|
92
|
+
### `enve.recplot2.findPeaks`. If passed, `peaks.opts` is ignored.
|
86
93
|
id.lim=range(x$id.breaks),
|
87
94
|
### Limits of identities to represent.
|
88
95
|
pos.lim=range(x$pos.breaks),
|
@@ -98,15 +105,17 @@ plot.enve.RecPlot2 <- function
|
|
98
105
|
### the number of the panel as index (see `layout`).
|
99
106
|
pos.splines=0,
|
100
107
|
### Smoothing parameter for the splines in the position histogram. Zero
|
101
|
-
### (0) for no splines.
|
108
|
+
### (0) for no splines. Use NULL to automatically detect by leave-one-out
|
109
|
+
### cross-validation.
|
102
110
|
id.splines=1/2,
|
103
111
|
### Smoothing parameter for the splines in the identity histogram. Zero
|
104
|
-
### (0) for no splines.
|
105
|
-
|
112
|
+
### (0) for no splines. Use NULL to automatically detect by leave-one-out
|
113
|
+
### cross-validation.
|
114
|
+
in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
106
115
|
### Line width for the sequencing depth of in-group matches.
|
107
|
-
out.lwd=ifelse(pos.splines>0, 1/2, 2),
|
116
|
+
out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
108
117
|
### Line width for the sequencing depth of out-group matches.
|
109
|
-
id.lwd=ifelse(id.splines>0, 1/2, 2),
|
118
|
+
id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
|
110
119
|
### Line width for the identity histogram.
|
111
120
|
in.col='darkblue',
|
112
121
|
### Color associated to in-group matches.
|
@@ -193,7 +202,7 @@ plot.enve.RecPlot2 <- function
|
|
193
202
|
pos.f <- rep(seqdepth.in,each=2)
|
194
203
|
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
195
204
|
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
196
|
-
if(pos.splines > 0){
|
205
|
+
if(is.null(pos.splines) || pos.splines > 0){
|
197
206
|
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
198
207
|
spar=pos.splines)
|
199
208
|
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
@@ -230,7 +239,7 @@ plot.enve.RecPlot2 <- function
|
|
230
239
|
id.f <- rep(id.counts,each=2)
|
231
240
|
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
232
241
|
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
233
|
-
if(id.splines > 0){
|
242
|
+
if(is.null(id.splines) || id.splines > 0){
|
234
243
|
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
235
244
|
spar=id.splines)
|
236
245
|
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
@@ -267,8 +276,12 @@ plot.enve.RecPlot2 <- function
|
|
267
276
|
polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
|
268
277
|
y.tmp.in, border=NA, col=in.col)
|
269
278
|
if(!is.na(peaks.col)){
|
270
|
-
o
|
271
|
-
|
279
|
+
o <- peaks.opts; o$x = x;
|
280
|
+
if(missing(use.peaks)){
|
281
|
+
peaks <- do.call(enve.recplot2.findPeaks, o)
|
282
|
+
}else{
|
283
|
+
peaks <- use.peaks
|
284
|
+
}
|
272
285
|
h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
|
273
286
|
if(!is.null(peaks) & length(peaks)>0){
|
274
287
|
pf <- h.mids*0;
|
@@ -276,18 +289,23 @@ plot.enve.RecPlot2 <- function
|
|
276
289
|
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
277
290
|
lines(cnt, h.mids, col='red');
|
278
291
|
pf <- pf+cnt;
|
279
|
-
axis(4, at=peaks[[i]]$
|
280
|
-
letters[i], las=1, hadj=1/2)
|
292
|
+
axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
|
281
293
|
}
|
282
294
|
lines(pf, h.mids, col='red',lwd=1.5);
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
295
|
+
dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
|
296
|
+
frx <- signif(100*as.numeric(
|
297
|
+
lapply(peaks,
|
298
|
+
function(x) ifelse(length(x$values)==0, x$n.hat,
|
299
|
+
length(x$values))/x$n.total)), 2)
|
300
|
+
if(peaks[[1]]$err.res < 0){
|
301
|
+
err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
|
302
|
+
}else{
|
303
|
+
err <- paste(', err:',
|
304
|
+
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
|
305
|
+
}
|
306
|
+
legend('topright', bty='n', cex=1/2,
|
307
|
+
legend=paste(letters[1:length(peaks)],'. ',
|
308
|
+
dpt,'X (', frx, '%', err, ')', sep=''))
|
291
309
|
}
|
292
310
|
}
|
293
311
|
}
|
@@ -334,6 +352,10 @@ enve.recplot2 <- function(
|
|
334
352
|
id.breaks=300,
|
335
353
|
### Breaks in the identity histogram. It can also be a vector of break
|
336
354
|
### points, and values outside the range are ignored.
|
355
|
+
id.free.range=FALSE,
|
356
|
+
### Indicates that the range should be freely set from the observed
|
357
|
+
### values. Otherwise, 70-100% is included in the identity histogram
|
358
|
+
### (default).
|
337
359
|
id.metric=c('identity', 'corrected identity', 'bit score'),
|
338
360
|
### Metric of identity to be used (Y-axis). Corrected identity is only
|
339
361
|
### supported if the original BLAST file included sequence lengths.
|
@@ -373,12 +395,14 @@ enve.recplot2 <- function(
|
|
373
395
|
if(pos.breaks>0){
|
374
396
|
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
|
375
397
|
}else{
|
376
|
-
pos.breaks <- c(lim[,2],
|
398
|
+
pos.breaks <- c(lim[1,2], lim[,3])
|
377
399
|
}
|
378
400
|
}
|
379
401
|
if(length(id.breaks)==1){
|
380
|
-
id.
|
381
|
-
|
402
|
+
id.range.v <- rec[,rec.idcol]
|
403
|
+
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
404
|
+
id.range.v <- range(id.range.v)
|
405
|
+
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
|
382
406
|
}
|
383
407
|
|
384
408
|
# Run in parallel
|
@@ -423,8 +447,159 @@ enve.recplot2 <- function(
|
|
423
447
|
}
|
424
448
|
|
425
449
|
enve.recplot2.findPeaks <- function(
|
450
|
+
### Identifies peaks in the population histogram potentially indicating
|
451
|
+
### sub-population mixtures
|
452
|
+
x,
|
453
|
+
### An `enve.RecPlot2` object.
|
454
|
+
method="emauto",
|
455
|
+
### Peak-finder method. This should be one of:
|
456
|
+
### "emauto" (Expectation-Maximization with auto-selection of components),
|
457
|
+
### "em" (Expectation-Maximization),
|
458
|
+
### "mower" (Custom distribution-mowing method).
|
459
|
+
...
|
460
|
+
### Any additional parameters supported by
|
461
|
+
### `enve.recplot2.findPeaks.<method>`.
|
462
|
+
){
|
463
|
+
if(method == "emauto"){
|
464
|
+
peaks <- enve.recplot2.findPeaks.emauto(x, ...)
|
465
|
+
}else if(method == "em"){
|
466
|
+
peaks <- enve.recplot2.findPeaks.em(x, ...)
|
467
|
+
}else if(method == "mower"){
|
468
|
+
peaks <- enve.recplot2.findPeaks.mower(x, ...)
|
469
|
+
}else{
|
470
|
+
stop("Invalid peak-finder method ", method)
|
471
|
+
}
|
472
|
+
return(peaks)
|
473
|
+
### Returns a list of `enve.RecPlot2.Peak` objects.
|
474
|
+
}
|
475
|
+
|
476
|
+
enve.recplot2.findPeaks.emauto <- function(
|
477
|
+
### Identifies peaks in the population histogram using a Gaussian Mixture
|
478
|
+
### Model Expectation Maximization (GMM-EM) method with number of components
|
479
|
+
### automatically detected.
|
480
|
+
x,
|
481
|
+
### An `enve.RecPlot2` object.
|
482
|
+
components=seq(1,10),
|
483
|
+
### A vector of number of components to evaluate.
|
484
|
+
criterion='aic',
|
485
|
+
### Criterion to use for components selection. Must be one of:
|
486
|
+
### 'aic' (Akaike Information Criterion),
|
487
|
+
### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
|
488
|
+
merge.tol=2L,
|
489
|
+
### When attempting to merge peaks with very similar sequencing depth, use
|
490
|
+
### this number of significant digits (in log-scale).
|
491
|
+
verbose=FALSE,
|
492
|
+
### Display (mostly debugging) information.
|
493
|
+
...
|
494
|
+
### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
|
495
|
+
){
|
496
|
+
best <- list(crit=0, pstore=list())
|
497
|
+
if(criterion == 'aic'){
|
498
|
+
do_crit <- function(ll, k, n) 2*k - 2*ll
|
499
|
+
}else if(criterion %in% c('bic', 'sbc')){
|
500
|
+
do_crit <- function(ll, k, n) log(n)*k - 2*ll
|
501
|
+
}else{
|
502
|
+
stop('Invalid criterion ', criterion)
|
503
|
+
}
|
504
|
+
for(comp in components){
|
505
|
+
best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
|
506
|
+
verbose, ...)
|
507
|
+
}
|
508
|
+
|
509
|
+
seqdepths.r <- signif(log(sapply(best[['peaks']],
|
510
|
+
function(x) x$seq.depth)), merge.tol)
|
511
|
+
distinct <- length(unique(seqdepths.r))
|
512
|
+
if(distinct < length(best[['peaks']])){
|
513
|
+
if(verbose) cat('Attempting merge to', distinct, 'components\n')
|
514
|
+
init <- apply(sapply(best[['peaks']],
|
515
|
+
function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
|
516
|
+
init <- init[!duplicated(seqdepths.r),]
|
517
|
+
init <- list(mu=init[,'mean'], sd=init[,'sd'],
|
518
|
+
alpha=init[,'alpha']/sum(init[,'alpha']))
|
519
|
+
best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
|
520
|
+
verbose, ...)
|
521
|
+
}
|
522
|
+
return(best[['peaks']])
|
523
|
+
### Returns a list of `enve.RecPlot2.Peak` objects.
|
524
|
+
}
|
525
|
+
|
526
|
+
enve.recplot2.findPeaks.em <- function(
|
527
|
+
### Identifies peaks in the population histogram using a Gaussian Mixture
|
528
|
+
### Model Expectation Maximization (GMM-EM) method.
|
529
|
+
x,
|
530
|
+
### An `enve.RecPlot2` object.
|
531
|
+
max.iter=1000,
|
532
|
+
### Maximum number of EM iterations.
|
533
|
+
ll.diff.res=1e-8,
|
534
|
+
### Maximum Log-Likelihood difference to be considered as convergent.
|
535
|
+
components=2,
|
536
|
+
### Number of distributions assumed in the mixture.
|
537
|
+
rm.top=0.05,
|
538
|
+
### Top-values to remove before finding peaks, as a quantile probability.
|
539
|
+
### This step is useful to remove highly conserved regions, but can be
|
540
|
+
### turned off by setting rm.top=0. The quantile is determined *after*
|
541
|
+
### removing zero-coverage windows.
|
542
|
+
verbose=FALSE,
|
543
|
+
### Display (mostly debugging) information.
|
544
|
+
init,
|
545
|
+
### Initialization parameters. By default, these are derived from k-means
|
546
|
+
### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
|
547
|
+
### of length `components`.
|
548
|
+
log=TRUE
|
549
|
+
### Logical value indicating if the estimations should be performed in
|
550
|
+
### natural logarithm units. Do not change unless you know what you're
|
551
|
+
### doing.
|
552
|
+
){
|
553
|
+
|
554
|
+
# Essential vars
|
555
|
+
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
|
556
|
+
lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
|
557
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
558
|
+
if(log) lsd1 <- log(lsd1)
|
559
|
+
|
560
|
+
# 1. Initialize
|
561
|
+
if(missing(init)){
|
562
|
+
km.clust <- kmeans(lsd1, components)$cluster
|
563
|
+
init <- list(
|
564
|
+
mu = tapply(lsd1, km.clust, mean),
|
565
|
+
sd = tapply(lsd1, km.clust, sd),
|
566
|
+
alpha = table(km.clust)/length(km.clust)
|
567
|
+
)
|
568
|
+
}
|
569
|
+
m.step <- init
|
570
|
+
ll <- c()
|
571
|
+
cur.ll <- -Inf
|
572
|
+
|
573
|
+
for(i in 1:max.iter){
|
574
|
+
# 2/3. EM
|
575
|
+
e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
|
576
|
+
m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
|
577
|
+
# 4. Convergence
|
578
|
+
ll <- c(ll, e.step[["ll"]])
|
579
|
+
ll.diff <- abs(cur.ll - e.step[["ll"]])
|
580
|
+
cur.ll <- e.step[["ll"]]
|
581
|
+
if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
|
582
|
+
if(ll.diff <= ll.diff.res) break
|
583
|
+
}
|
584
|
+
|
585
|
+
# Return
|
586
|
+
peaks <- list()
|
587
|
+
for(i in 1:components){
|
588
|
+
n.hat <- m.step[['alpha']][i]*length(lsd1)
|
589
|
+
peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
|
590
|
+
values.res=0, mode=m.step[['mu']][i],
|
591
|
+
param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
|
592
|
+
n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
|
593
|
+
merge.logdist=as.numeric(), log=log,
|
594
|
+
seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
|
595
|
+
}
|
596
|
+
return(peaks)
|
597
|
+
### Returns a list of `enve.RecPlot2.Peak` objects.
|
598
|
+
}
|
599
|
+
|
600
|
+
enve.recplot2.findPeaks.mower <- function(
|
426
601
|
### Identifies peaks in the population histogram potentially indicating
|
427
|
-
### sub-population mixtures.
|
602
|
+
### sub-population mixtures, using a custom distribution-mowing method.
|
428
603
|
x,
|
429
604
|
### An `enve.RecPlot2` object.
|
430
605
|
min.points=10,
|
@@ -436,13 +611,12 @@ enve.recplot2.findPeaks <- function(
|
|
436
611
|
mlv.opts=list(method='parzen'),
|
437
612
|
### Options passed to `mlv` to estimate the mode.
|
438
613
|
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
439
|
-
start=list(omega=1, alpha=-1), lower=c(
|
440
|
-
upper=c(Inf, 0, Inf)),
|
614
|
+
start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
|
441
615
|
### Options passed to `fitdist` to estimate the standard deviation if
|
442
616
|
### with.skewness=TRUE. Note that the `start` parameter will be ammended
|
443
617
|
### with xi=estimated mode for each peak.
|
444
|
-
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(.4
|
445
|
-
start=list(sd=1), lower=c(
|
618
|
+
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
|
619
|
+
start=list(sd=1), lower=c(0, -Inf)),
|
446
620
|
### Options passed to `fitdist` to estimate the standard deviation if
|
447
621
|
### with.skewness=FALSE. Note that the `start` parameter will be ammended
|
448
622
|
### with mean=estimated mode for each peak.
|
@@ -460,7 +634,7 @@ enve.recplot2.findPeaks <- function(
|
|
460
634
|
### "tail distribution".
|
461
635
|
optim.rounds=200,
|
462
636
|
### Maximum rounds of peak optimization.
|
463
|
-
optim.epsilon=1e-
|
637
|
+
optim.epsilon=1e-4,
|
464
638
|
### Trace change at which optimization stops (unless `optim.rounds` is
|
465
639
|
### reached first). The trace change is estimated as the sum of square
|
466
640
|
### differences between parameters in one round and those from two rounds
|
@@ -469,8 +643,12 @@ enve.recplot2.findPeaks <- function(
|
|
469
643
|
### Maximum value of |log-ratio| between centrality parameters in peaks to
|
470
644
|
### attempt merging. The default of ~0.22 corresponds to a maximum
|
471
645
|
### difference of 25%.
|
472
|
-
verbose=FALSE
|
646
|
+
verbose=FALSE,
|
473
647
|
### Display (mostly debugging) information.
|
648
|
+
log=TRUE
|
649
|
+
### Logical value indicating if the estimations should be performed in
|
650
|
+
### natural logarithm units. Do not change unless you know what you're
|
651
|
+
### doing.
|
474
652
|
){
|
475
653
|
|
476
654
|
# Essential vars
|
@@ -478,6 +656,7 @@ enve.recplot2.findPeaks <- function(
|
|
478
656
|
seqdepth.in <- x$pos.counts.in/pos.binsize;
|
479
657
|
lsd1 <- seqdepth.in[seqdepth.in>0];
|
480
658
|
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
659
|
+
if(log) lsd1 <- log(lsd1)
|
481
660
|
if(with.skewness){
|
482
661
|
fitdist.opts <- fitdist.opts.sn
|
483
662
|
}else{
|
@@ -486,11 +665,11 @@ enve.recplot2.findPeaks <- function(
|
|
486
665
|
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
487
666
|
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
488
667
|
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
489
|
-
n.total=length(lsd1), merge.logdist=merge.logdist)
|
668
|
+
n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
|
490
669
|
|
491
670
|
# Find seed peaks
|
492
671
|
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
493
|
-
peaks <- enve.recplot2.
|
672
|
+
peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
|
494
673
|
|
495
674
|
# Merge overlapping peaks
|
496
675
|
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
@@ -511,7 +690,7 @@ enve.recplot2.findPeaks <- function(
|
|
511
690
|
p$param.hat[[ length(p$param.hat) ]],'&',
|
512
691
|
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
513
692
|
peaks.opts$lsd1 <- c(p$values, p2$values)
|
514
|
-
p.new <- enve.recplot2.
|
693
|
+
p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
|
515
694
|
if(length(p.new)==1){
|
516
695
|
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
517
696
|
ignore <- c(ignore, j)
|
@@ -542,16 +721,19 @@ enve.recplot2.corePeak <- function
|
|
542
721
|
function(y) y$param.hat[[ length(y$param.hat) ]])))
|
543
722
|
]]
|
544
723
|
# If a "larger" peak (a peak explaining more bins of the genome) is within
|
545
|
-
# the "merge.logdist" distance, take that one instead.
|
724
|
+
# the default "merge.logdist" distance, take that one instead.
|
546
725
|
corePeak <- maxPeak
|
547
726
|
for(p in x){
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
727
|
+
p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
|
728
|
+
corePeak.len <- ifelse(
|
729
|
+
length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
|
730
|
+
sz.d <- log(p.len/corePeak.len)
|
731
|
+
if(is.nan(sz.d) || sz.d < 0) next
|
732
|
+
sq.d.a <- as.numeric(tail(p$param.hat, n=1))
|
733
|
+
sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
|
734
|
+
if(p$log) sq.d.a <- exp(sq.d.a)
|
735
|
+
if(corePeak$log) sq.d.b <- exp(sq.d.b)
|
736
|
+
if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
|
555
737
|
}
|
556
738
|
return(corePeak)
|
557
739
|
}
|
@@ -580,43 +762,204 @@ enve.recplot2.extractWindows <- function
|
|
580
762
|
### Extract windows significantly below (or above) the peak in sequencing
|
581
763
|
### depth.
|
582
764
|
(rp,
|
583
|
-
### Recruitment plot, a enve.
|
765
|
+
### Recruitment plot, a enve.RecPlot2 object.
|
584
766
|
peak,
|
585
|
-
### Peak,
|
586
|
-
### of enve.RecPlot2.Peak objects, in which case the core peak is
|
587
|
-
### (see enve.recplot2.corePeak).
|
767
|
+
### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
|
768
|
+
### list of enve.RecPlot2.Peak objects, in which case the core peak is
|
769
|
+
### used (see `enve.recplot2.corePeak`).
|
588
770
|
lower.tail=TRUE,
|
589
771
|
### If FALSE, it returns windows significantly above the peak in
|
590
772
|
### sequencing depth.
|
591
773
|
significance=0.05,
|
592
774
|
### Significance threshold (alpha) to select windows.
|
593
775
|
seq.names=FALSE
|
594
|
-
### Returns subject sequence names instead of a vector of Booleans.
|
595
|
-
###
|
776
|
+
### Returns subject sequence names instead of a vector of Booleans. If
|
777
|
+
### the recruitment plot was generated with pos.breaks=0 it returns a
|
778
|
+
### vector of characters (the sequence identifiers), otherwise it returns
|
779
|
+
### a data.frame with a name column and two columns of coordinates.
|
596
780
|
){
|
597
781
|
# Determine the threshold
|
598
782
|
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
599
783
|
par <- peak$param.hat
|
600
784
|
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
601
785
|
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
786
|
+
if(peak$log) thr <- exp(thr)
|
602
787
|
|
603
|
-
# Estimate sequencing depths per window
|
604
|
-
pos.cnts.in <- rp$pos.counts.in
|
605
|
-
pos.breaks <- rp$pos.breaks
|
606
|
-
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
607
|
-
seqdepth.in <- pos.cnts.in/pos.binsize
|
608
|
-
|
609
788
|
# Select windows past the threshold
|
789
|
+
seqdepth.in <- enve.recplot2.seqdepth(rp)
|
610
790
|
if(lower.tail){
|
611
791
|
sel <- seqdepth.in < thr
|
612
792
|
}else{
|
613
793
|
sel <- seqdepth.in > thr
|
614
794
|
}
|
795
|
+
|
796
|
+
# seq.names=FALSE
|
615
797
|
if(!seq.names) return(sel)
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
798
|
+
# seq.names=TRUE and pos.breaks=0
|
799
|
+
if(length(rp$pos.breaks)==length(rp$seq.breaks) &&
|
800
|
+
rp$pos.breaks==rp$seq.breaks)
|
801
|
+
return(rp$seq.names[sel])
|
802
|
+
# seq.names=TRUE and pos.breaks!=0
|
803
|
+
return(enve.recplot2.coordinates(rp,sel))
|
804
|
+
### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
|
805
|
+
### it returns a vector of characters if the object was built with
|
806
|
+
### `pos.breaks=0` or a data.frame with four columns otherwise: name.from,
|
807
|
+
### name.to, pos.from, and pos.to (see `enve.recplot2.coordinates`).
|
808
|
+
}
|
809
|
+
|
810
|
+
enve.recplot2.compareIdentities <- function
|
811
|
+
### Compare the distribution of identities between two enve.RecPlot2 objects.
|
812
|
+
(x,
|
813
|
+
### First enve.RecPlot2 object.
|
814
|
+
y,
|
815
|
+
### Second enve.RecPlot2 object.
|
816
|
+
method="hellinger",
|
817
|
+
### Distance method to use. This should be (an unambiguous abbreviation of)
|
818
|
+
### one of:
|
819
|
+
### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
|
820
|
+
### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
|
821
|
+
### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
|
822
|
+
### doi:10.1214/aoms/1177729694), or "euclidean".
|
823
|
+
smooth.par=NULL,
|
824
|
+
### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
|
825
|
+
### Use NULL to automatically determine this value using leave-one-out
|
826
|
+
### cross-validation (see `smooth.spline` parameter `spar`).
|
827
|
+
pseudocounts=0,
|
828
|
+
### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
|
829
|
+
### 1 for add-one smoothing.
|
830
|
+
max.deviation=0.75
|
831
|
+
### Maximum mean deviation between identity breaks tolerated (as percent
|
832
|
+
### identity). Difference in number of id.breaks is never tolerated.
|
833
|
+
){
|
834
|
+
METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
|
835
|
+
i.meth <- pmatch(method, METHODS)
|
836
|
+
if (is.na(i.meth)) stop("Invalid distance ", method)
|
837
|
+
if(!inherits(x, "enve.RecPlot2"))
|
838
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
839
|
+
if(!inherits(y, "enve.RecPlot2"))
|
840
|
+
stop("'y' must inherit from class `enve.RecPlot2`")
|
841
|
+
if(length(x$id.breaks) != length(y$id.breaks))
|
842
|
+
stop("'x' and 'y' must have the same number of `id.breaks`")
|
843
|
+
dev <- mean(abs(x$id.breaks - y$id.breaks))
|
844
|
+
if(dev > max.deviation)
|
845
|
+
stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
|
846
|
+
dev)
|
847
|
+
x.cnt <- x$id.counts
|
848
|
+
y.cnt <- y$id.counts
|
849
|
+
if(is.null(smooth.par) || smooth.par > 0){
|
850
|
+
x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
|
851
|
+
y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
|
852
|
+
p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
|
853
|
+
q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
|
854
|
+
x.cnt <- pmax(p.spline$y, 0)
|
855
|
+
y.cnt <- pmax(q.spline$y, 0)
|
856
|
+
}
|
857
|
+
a <- as.numeric(pseudocounts)
|
858
|
+
p <- (x.cnt + a) / sum(x.cnt + a)
|
859
|
+
q <- (y.cnt + a) / sum(y.cnt + a)
|
860
|
+
d <- NA
|
861
|
+
if(i.meth %in% c(1L, 2L)){
|
862
|
+
d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
|
863
|
+
if(i.meth==2L) d <- 1 - d**2
|
864
|
+
}else if(i.meth %in% c(3L, 4L)){
|
865
|
+
sel <- p>0
|
866
|
+
if(any(q[sel]==0))
|
867
|
+
stop("Undefined distance without absolute continuity, use pseudocounts")
|
868
|
+
d <- -sum(p[sel]*log(q[sel]/p[sel]))
|
869
|
+
}else if(i.meth == 5L){
|
870
|
+
d <- sqrt(sum((q-p)**2))
|
871
|
+
}
|
872
|
+
return(d)
|
873
|
+
}
|
874
|
+
|
875
|
+
enve.recplot2.coordinates <- function
|
876
|
+
### Returns the sequence name and coordinates of the requested position bins.
|
877
|
+
(x,
|
878
|
+
### `enve.RecPlot2` object.
|
879
|
+
bins
|
880
|
+
### Vector of selected bins to return. It can be a vector of logical values
|
881
|
+
### with the same length as `x$pos.breaks`-1 or a vector of integers. If
|
882
|
+
### missing, returns the coordinates of all windows.
|
883
|
+
){
|
884
|
+
if(!inherits(x, "enve.RecPlot2"))
|
885
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
886
|
+
if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
|
887
|
+
if(!is.vector(bins)) stop("'bins' must be a vector")
|
888
|
+
if(inherits(bins, "logical")) bins <- which(bins)
|
889
|
+
|
890
|
+
y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
|
891
|
+
|
892
|
+
for(i in 1:length(bins)){
|
893
|
+
j <- bins[i]
|
894
|
+
# Concatenated coordinates
|
895
|
+
cc <- x$pos.breaks[c(j, j+1)]
|
896
|
+
# Find the corresponding `seq.breaks`
|
897
|
+
sb.from <- which(
|
898
|
+
cc[1] >=x$seq.breaks[-length(x$seq.breaks)] &
|
899
|
+
cc[1] < x$seq.breaks[-1])
|
900
|
+
sb.to <- which(
|
901
|
+
cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
|
902
|
+
cc[2] <=x$seq.breaks[-1])
|
903
|
+
# Translate coordinates
|
904
|
+
if(length(sb.from)==1 & length(sb.to)==1){
|
905
|
+
y[i, 'name.from'] <- x$seq.names[sb.from]
|
906
|
+
y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
|
907
|
+
y[i, 'name.to'] <- x$seq.names[sb.to]
|
908
|
+
y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
|
909
|
+
}
|
910
|
+
}
|
911
|
+
|
912
|
+
return(y)
|
913
|
+
### Returns a data.frame with four columns: name.from (character), pos.from
|
914
|
+
### (numeric) name.to (character), and pos.to (numeric). The first two
|
915
|
+
### correspond to sequence and position of the start point of the bin, the
|
916
|
+
### last two correspond to the sequence and position of the end point of the
|
917
|
+
### bin.
|
918
|
+
}
|
919
|
+
|
920
|
+
enve.recplot2.seqdepth <- function
|
921
|
+
### Calculate the sequencing depth of the given window(s)
|
922
|
+
(x,
|
923
|
+
### `enve.RecPlot2` object.
|
924
|
+
sel,
|
925
|
+
### Window(s) for which the sequencing depth is to be calculated. If not
|
926
|
+
### passed, it returns the sequencing depth of all windows
|
927
|
+
low.identity=FALSE
|
928
|
+
### A logical indicating if the sequencing depth is to be estimated only
|
929
|
+
### with low-identity matches. By default, only high-identity matches are
|
930
|
+
### used.
|
931
|
+
){
|
932
|
+
if(!inherits(x, "enve.RecPlot2"))
|
933
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
934
|
+
pos.cnts.in <- x$pos.counts.in
|
935
|
+
pos.breaks <- x$pos.breaks
|
936
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
937
|
+
seqdepth.in <- pos.cnts.in/pos.binsize
|
938
|
+
if(missing(sel)) return(seqdepth.in)
|
939
|
+
return(seqdepth.in[sel])
|
940
|
+
### Returns a numeric vector of sequencing depths (in bp/bp).
|
941
|
+
}
|
942
|
+
|
943
|
+
enve.recplot2.ANIr <- function
|
944
|
+
### Estimate the Average Nucleotide Identity from reads (ANIr) from a
|
945
|
+
### recruitment plot
|
946
|
+
(x,
|
947
|
+
### `enve.RecPlot2` object.
|
948
|
+
range=c(0,Inf)
|
949
|
+
### Range of identities to be considered. By default, the full range
|
950
|
+
### is used (note that the upper boundary is `Inf` and not 100 because
|
951
|
+
### recruitment plots can also be built with bit-scores). To use only
|
952
|
+
### intra-population matches (with identities), use c(95,100). To use only
|
953
|
+
### inter-population values, use c(0,95).
|
954
|
+
){
|
955
|
+
if(!inherits(x, "enve.RecPlot2"))
|
956
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
957
|
+
id.b <- x$id.breaks
|
958
|
+
id <- (id.b[-1]+id.b[-length(id.b)])/2
|
959
|
+
cnt <- x$id.counts
|
960
|
+
cnt[id < range[1]] <- 0
|
961
|
+
cnt[id > range[2]] <- 0
|
962
|
+
return(sum(id*cnt/sum(cnt)))
|
620
963
|
}
|
621
964
|
|
622
965
|
#==============> Define internal functions
|
@@ -640,21 +983,67 @@ enve.recplot2.__counts <- function
|
|
640
983
|
return(counts);
|
641
984
|
}
|
642
985
|
|
986
|
+
enve.recplot2.findPeaks.__emauto_one <- function
|
987
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.emauto).
|
988
|
+
(x, comp, do_crit, best, verbose, ...){
|
989
|
+
peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
|
990
|
+
k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
|
991
|
+
crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
|
992
|
+
if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
|
993
|
+
ifelse(crit > best[['crit']], '*', ''), '\n')
|
994
|
+
if(crit > best[['crit']]){
|
995
|
+
best[['crit']] <- crit
|
996
|
+
best[['peaks']] <- peaks
|
997
|
+
}
|
998
|
+
best[['pstore']][[comp]] <- peaks
|
999
|
+
return(best)
|
1000
|
+
}
|
1001
|
+
enve.recplot2.findPeaks.__em_e <- function
|
1002
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.em`).
|
1003
|
+
(x, theta){
|
1004
|
+
components <- length(theta[['mu']])
|
1005
|
+
product <- do.call(cbind,
|
1006
|
+
lapply(1:components,
|
1007
|
+
function(i) dnorm(x, theta[['mu']][i],
|
1008
|
+
theta[['sd']][i])*theta[['alpha']][i]))
|
1009
|
+
sum.of.components <- rowSums(product)
|
1010
|
+
posterior <- product / sum.of.components
|
1011
|
+
|
1012
|
+
return(list(ll=sum(log(sum.of.components)), posterior=posterior))
|
1013
|
+
}
|
1014
|
+
|
1015
|
+
enve.recplot2.findPeaks.__em_m <- function
|
1016
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.em`
|
1017
|
+
(x, posterior){
|
1018
|
+
components <- ncol(posterior)
|
1019
|
+
n <- colSums(posterior)
|
1020
|
+
mu <- colSums(posterior * x) / n
|
1021
|
+
sd <- sqrt( colSums(
|
1022
|
+
posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
|
1023
|
+
alpha <- n/length(x)
|
1024
|
+
return(list(mu=mu, sd=sd, alpha=alpha))
|
1025
|
+
}
|
1026
|
+
|
643
1027
|
enve.recplot2.__peakHist <- function
|
644
1028
|
### Internal ancilliary function (see `enve.RecPlot2.Peak`).
|
645
1029
|
(x, mids, counts=TRUE){
|
646
1030
|
d.o <- x$param.hat
|
647
|
-
|
1031
|
+
if(length(x$log)==0) x$log <- FALSE
|
1032
|
+
if(x$log){
|
1033
|
+
d.o$x <- log(mids)
|
1034
|
+
}else{
|
1035
|
+
d.o$x <- mids
|
1036
|
+
}
|
648
1037
|
prob <- do.call(paste('d', x$dist, sep=''), d.o)
|
649
1038
|
if(!counts) return(prob)
|
650
1039
|
if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
|
651
1040
|
return(prob*x$n.hat/sum(prob))
|
652
1041
|
}
|
653
1042
|
|
654
|
-
enve.recplot2.
|
655
|
-
### Internall ancilliary function (see `enve.recplot2.findPeaks`).
|
1043
|
+
enve.recplot2.findPeaks.__mow_one <- function
|
1044
|
+
### Internall ancilliary function (see `enve.recplot2.findPeaks.mower`).
|
656
1045
|
(lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
|
657
|
-
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose
|
1046
|
+
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
|
658
1047
|
){
|
659
1048
|
dist <- ifelse(with.skewness, 'sn', 'norm');
|
660
1049
|
|
@@ -683,8 +1072,14 @@ enve.recplot2.__findPeak <- function
|
|
683
1072
|
if(round>1) param.hat <- last.hat;
|
684
1073
|
break;
|
685
1074
|
}
|
686
|
-
|
687
|
-
|
1075
|
+
if(round > 1){
|
1076
|
+
epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
|
1077
|
+
if(epsilon1 < optim.epsilon) break;
|
1078
|
+
if(round > 2){
|
1079
|
+
epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
|
1080
|
+
if(epsilon2 < optim.epsilon) break;
|
1081
|
+
}
|
1082
|
+
}
|
688
1083
|
}
|
689
1084
|
if(verbose) cat('\n')
|
690
1085
|
if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
|
@@ -695,14 +1090,14 @@ enve.recplot2.__findPeak <- function
|
|
695
1090
|
n.hat <- length(lsd1.pop)/diff(quant.est)
|
696
1091
|
peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
|
697
1092
|
param.hat=param.hat, n.hat=n.hat, n.total=n.total,
|
698
|
-
merge.logdist=merge.logdist)
|
1093
|
+
merge.logdist=merge.logdist, log=log)
|
699
1094
|
peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
|
700
1095
|
peak.cnt <- enve.recplot2.__peakHist(peak,
|
701
1096
|
(peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
|
702
1097
|
for(i in 2:length(peak.breaks)){
|
703
1098
|
values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
|
704
1099
|
n.exp <- peak.cnt[i-1]
|
705
|
-
if(n.exp==0) n.exp
|
1100
|
+
if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
|
706
1101
|
if(length(values)==0) next
|
707
1102
|
in.peak <- runif(length(values)) <= n.exp/length(values)
|
708
1103
|
lsd2 <- c(lsd2, values[!in.peak])
|
@@ -716,17 +1111,19 @@ enve.recplot2.__findPeak <- function
|
|
716
1111
|
attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
|
717
1112
|
plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
|
718
1113
|
plot=FALSE)$counts)+1)/2
|
1114
|
+
mu <- tail(param.hat, n=1)
|
1115
|
+
attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
|
719
1116
|
if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
|
720
1117
|
'with expected n =',n.hat,'\n')
|
721
1118
|
return(peak)
|
722
1119
|
}
|
723
1120
|
|
724
|
-
enve.recplot2.
|
725
|
-
### Internal ancilliary function (see `enve.recplot2.findPeaks`).
|
1121
|
+
enve.recplot2.findPeaks.__mower <- function
|
1122
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks.mower`).
|
726
1123
|
(peaks.opts){
|
727
1124
|
peaks <- list()
|
728
1125
|
while(length(peaks.opts$lsd1) > peaks.opts$min.points){
|
729
|
-
peak <- do.call(enve.recplot2.
|
1126
|
+
peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
|
730
1127
|
if(is.null(peak)) break
|
731
1128
|
peaks[[ length(peaks)+1 ]] <- peak
|
732
1129
|
peaks.opts$lsd1 <- peak$values.res
|
@@ -738,7 +1135,10 @@ enve.recplot2.__findPeaks <- function
|
|
738
1135
|
enve.recplot2.__whichClosestPeak <- function
|
739
1136
|
### Internal ancilliary function (see `enve.recplot2.findPeaks`).
|
740
1137
|
(peak, peaks){
|
741
|
-
dist <- as.numeric(lapply(peaks,
|
1138
|
+
dist <- as.numeric(lapply(peaks,
|
1139
|
+
function(x)
|
1140
|
+
abs(log(x$param.hat[[ length(x$param.hat) ]] /
|
1141
|
+
peak$param.hat[[ length(peak$param.hat) ]] ))))
|
742
1142
|
dist[ dist==0 ] <- Inf
|
743
1143
|
return(which.min(dist))
|
744
1144
|
}
|