miga-base 0.4.3.0 → 0.5.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -1,497 +1,647 @@
|
|
1
1
|
#==============> Define S4 classes
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
2
|
+
|
3
|
+
#' Enveomics: Recruitment Plot (2) - S4 Class
|
4
|
+
#'
|
5
|
+
#' Enve-omics representation of Recruitment plots. This object can
|
6
|
+
#' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
|
7
|
+
#'
|
8
|
+
#' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
|
9
|
+
#' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
|
10
|
+
#' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
|
11
|
+
#' @slot id.counts \code{(numeric)} Counts per ID bin.
|
12
|
+
#' @slot id.breaks \code{(numeric)} Breaks of identity bins.
|
13
|
+
#' @slot pos.breaks \code{(numeric)} Breaks of position bins.
|
14
|
+
#' @slot pos.names \code{(character)} Names of the position bins.
|
15
|
+
#' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
|
16
|
+
#' @slot peaks \code{(list)} Peaks identified in the recplot.
|
17
|
+
#' Limits of the subject sequences after concatenation.
|
18
|
+
#' @slot seq.names \code{(character}) Names of the subject sequences.
|
19
|
+
#' @slot id.metric \code{(character}) Metric used as 'identity'.
|
20
|
+
#' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
|
21
|
+
#' @slot call \code{(call)} Call producing this object.
|
22
|
+
#'
|
23
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
24
|
+
#'
|
25
|
+
#' @exportClass
|
26
|
+
|
27
|
+
enve.RecPlot2 <- setClass("enve.RecPlot2",
|
28
|
+
representation(
|
29
|
+
# slots = list(
|
30
|
+
counts='matrix',
|
31
|
+
pos.counts.in='numeric',
|
32
|
+
pos.counts.out='numeric',
|
33
|
+
id.counts='numeric',
|
34
|
+
id.breaks='numeric',
|
35
|
+
pos.breaks='numeric',
|
36
|
+
pos.names='character',
|
37
|
+
seq.breaks='numeric',
|
38
|
+
peaks='list',
|
39
|
+
seq.names='character',
|
40
|
+
id.metric='character',
|
41
|
+
id.ingroup='logical',
|
42
|
+
call='call')
|
43
|
+
,package='enveomics.R'
|
44
|
+
);
|
45
|
+
|
46
|
+
#' Enveomics: Recruitment Plot (2) Peak - S4 Class
|
47
|
+
#'
|
48
|
+
#' Enve-omics representation of a peak in the sequencing depth histogram
|
49
|
+
#' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
|
50
|
+
#'
|
51
|
+
#' @slot dist \code{(character)}
|
52
|
+
#' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
|
53
|
+
#' (skew-normal).
|
54
|
+
#' @slot values \code{(numeric)}
|
55
|
+
#' Sequencing depth values predicted to conform the peak.
|
56
|
+
#' @slot values.res \code{(numeric)}
|
57
|
+
#' Sequencing depth values not explained by this or previously identified
|
58
|
+
#' peaks.
|
59
|
+
#' @slot mode \code{(numeric)}
|
60
|
+
#' Seed-value of mode anchoring the peak.
|
61
|
+
#' @slot param.hat \code{(list)}
|
62
|
+
#' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
|
63
|
+
#' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
|
64
|
+
#' xi=location). Note that the "dispersion" parameter is always first and
|
65
|
+
#' the "location" parameter is always last.
|
66
|
+
#' @slot n.hat \code{(numeric)}
|
67
|
+
#' Number of bins estimated to be explained by this peak. This should
|
68
|
+
#' ideally be equal to the length of \code{values}, but it's not an integer.
|
69
|
+
#' @slot n.total \code{(numeric)}
|
70
|
+
#' Total number of bins from which the peak was extracted. I.e., total
|
71
|
+
#' number of position bins with non-zero sequencing depth in the recruitment
|
72
|
+
#' plot (regardless of peak count).
|
73
|
+
#' @slot err.res \code{(numeric)}
|
74
|
+
#' Error left after adding the peak (mower) or log-likelihood (em or emauto).
|
75
|
+
#' @slot merge.logdist \code{(numeric)}
|
76
|
+
#' Attempted \code{merge.logdist} parameter.
|
77
|
+
#' @slot seq.depth \code{(numeric)}
|
78
|
+
#' Best estimate available for the sequencing depth of the peak (centrality).
|
79
|
+
#' @slot log \code{(logical)}
|
80
|
+
#' Indicates if the estimation was performed in natural logarithm space.
|
81
|
+
#'
|
82
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
83
|
+
#'
|
84
|
+
#' @exportClass
|
85
|
+
|
86
|
+
enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
|
87
|
+
representation(
|
88
|
+
# slots = list(
|
89
|
+
dist='character',
|
90
|
+
values='numeric',
|
91
|
+
values.res='numeric',
|
92
|
+
mode='numeric',
|
93
|
+
param.hat='list',
|
94
|
+
n.hat='numeric',
|
95
|
+
n.total='numeric',
|
96
|
+
err.res='numeric',
|
97
|
+
merge.logdist='numeric',
|
98
|
+
seq.depth='numeric',
|
99
|
+
log='logical'
|
100
|
+
));
|
101
|
+
|
102
|
+
#' Attribute accessor
|
103
|
+
#'
|
104
|
+
#'
|
105
|
+
#' @param x Object
|
106
|
+
#' @param name Attribute name
|
57
107
|
setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
|
108
|
+
|
109
|
+
#' Attribute accessor
|
110
|
+
#'
|
111
|
+
#'
|
112
|
+
#' @param x Object
|
113
|
+
#' @param name Attribute name
|
58
114
|
setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
|
59
115
|
|
60
116
|
#==============> Define S4 methods
|
117
|
+
|
118
|
+
#' Enveomics: Recruitment Plot (2)
|
119
|
+
#'
|
120
|
+
#' Plots an \code{\link{enve.RecPlot2}} object.
|
121
|
+
#'
|
122
|
+
#' @param x
|
123
|
+
#' \code{\link{enve.RecPlot2}} object to plot.
|
124
|
+
#' @param layout
|
125
|
+
#' Matrix indicating the position of the different panels in the layout,
|
126
|
+
#' where:
|
127
|
+
#' \itemize{
|
128
|
+
#' \item 0: Empty space
|
129
|
+
#' \item 1: Counts matrix
|
130
|
+
#' \item 2: position histogram (sequencing depth)
|
131
|
+
#' \item 3: identity histogram
|
132
|
+
#' \item 4: Populations histogram (histogram of sequencing depths)
|
133
|
+
#' \item 5: Color scale for the counts matrix (vertical)
|
134
|
+
#' \item 6: Color scale of the counts matrix (horizontal)
|
135
|
+
#' }
|
136
|
+
#' Only panels indicated here will be plotted. To plot only one panel
|
137
|
+
#' simply set this to the number of the panel you want to plot.
|
138
|
+
#' @param panel.fun
|
139
|
+
#' List of functions to be executed after drawing each panel. Use the
|
140
|
+
#' indices in \code{layout} (as characters) as keys. Functions for indices
|
141
|
+
#' missing in \code{layout} are ignored. For example, to add a vertical line
|
142
|
+
#' at the 3Mbp mark in both the position histogram and the counts matrix:
|
143
|
+
#' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
|
144
|
+
#' Note that the X-axis in both panels is in Mbp by default. To change
|
145
|
+
#' this behavior, set \code{pos.units} accordingly.
|
146
|
+
#' @param widths
|
147
|
+
#' Relative widths of the columns of \code{layout}.
|
148
|
+
#' @param heights
|
149
|
+
#' Relative heights of the rows of \code{layout}.
|
150
|
+
#' @param palette
|
151
|
+
#' Colors to be used to represent the counts matrix, sorted from no hits
|
152
|
+
#' to the maximum sequencing depth.
|
153
|
+
#' @param underlay.group
|
154
|
+
#' If TRUE, it indicates the in-group and out-group areas couloured based
|
155
|
+
#' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
|
156
|
+
#' @param peaks.col
|
157
|
+
#' If not \code{NA}, it attempts to represent peaks in the population histogram
|
158
|
+
#' in the specified color. Set to \code{NA} to avoid peak-finding.
|
159
|
+
#' @param use.peaks
|
160
|
+
#' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
|
161
|
+
#' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
|
162
|
+
#' @param id.lim
|
163
|
+
#' Limits of identities to represent.
|
164
|
+
#' @param pos.lim
|
165
|
+
#' Limits of positions to represent (in bp, regardless of \code{pos.units}).
|
166
|
+
#' @param pos.units
|
167
|
+
#' Units in which the positions should be represented (powers of 1,000
|
168
|
+
#' base pairs).
|
169
|
+
#' @param mar
|
170
|
+
#' Margins of the panels as a list, with the character representation of
|
171
|
+
#' the number of the panel as index (see \code{layout}).
|
172
|
+
#' @param pos.splines
|
173
|
+
#' Smoothing parameter for the splines in the position histogram. Zero
|
174
|
+
#' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
|
175
|
+
#' cross-validation.
|
176
|
+
#' @param id.splines
|
177
|
+
#' Smoothing parameter for the splines in the identity histogram. Zero
|
178
|
+
#' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
|
179
|
+
#' cross-validation.
|
180
|
+
#' @param in.lwd
|
181
|
+
#' Line width for the sequencing depth of in-group matches.
|
182
|
+
#' @param out.lwd
|
183
|
+
#' Line width for the sequencing depth of out-group matches.
|
184
|
+
#' @param id.lwd
|
185
|
+
#' Line width for the identity histogram.
|
186
|
+
#' @param in.col
|
187
|
+
#' Color associated to in-group matches.
|
188
|
+
#' @param out.col
|
189
|
+
#' Color associated to out-group matches.
|
190
|
+
#' @param id.col
|
191
|
+
#' Color for the identity histogram.
|
192
|
+
#' @param breaks.col
|
193
|
+
#' Color of the vertical lines indicating sequence breaks.
|
194
|
+
#' @param peaks.opts
|
195
|
+
#' Options passed to \code{\link{enve.recplot2.findPeaks}},
|
196
|
+
#' if \code{peaks.col} is not \code{NA}.
|
197
|
+
#' @param ...
|
198
|
+
#' Any other graphic parameters (currently ignored).
|
199
|
+
#'
|
200
|
+
#' @return
|
201
|
+
#' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
|
202
|
+
#' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
|
203
|
+
#' \code{layout} doesn't include 4, returns \code{NA}.
|
204
|
+
#'
|
205
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
206
|
+
#'
|
207
|
+
#' @method plot enve.RecPlot2
|
208
|
+
#' @export
|
209
|
+
|
61
210
|
plot.enve.RecPlot2 <- function
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
211
|
+
(x,
|
212
|
+
layout=matrix(c(5,5,2,1,4,3), nrow=2),
|
213
|
+
panel.fun=list(),
|
214
|
+
widths=c(1,7,2),
|
215
|
+
heights=c(1,2),
|
216
|
+
palette=grey((100:0)/100),
|
217
|
+
underlay.group=TRUE,
|
218
|
+
peaks.col='darkred',
|
219
|
+
use.peaks,
|
220
|
+
id.lim=range(x$id.breaks),
|
221
|
+
pos.lim=range(x$pos.breaks),
|
222
|
+
pos.units=c('Mbp','Kbp','bp'),
|
223
|
+
mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
|
224
|
+
'3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
|
225
|
+
'4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
|
226
|
+
'5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
|
227
|
+
pos.splines=0,
|
228
|
+
id.splines=1/2,
|
229
|
+
in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
230
|
+
out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
231
|
+
id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
|
232
|
+
in.col='darkblue',
|
233
|
+
out.col='lightblue',
|
234
|
+
id.col='black',
|
235
|
+
breaks.col='#AAAAAA40',
|
236
|
+
peaks.opts=list(),
|
237
|
+
...
|
238
|
+
){
|
239
|
+
pos.units <- match.arg(pos.units);
|
240
|
+
pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
|
241
|
+
pos.lim <- pos.lim/pos.factor;
|
242
|
+
lmat <- layout;
|
243
|
+
for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
|
244
|
+
|
245
|
+
layout(lmat, widths=widths, heights=heights);
|
246
|
+
ori.mar <- par('mar');
|
247
|
+
|
248
|
+
# Essential vars
|
249
|
+
counts <- x$counts
|
250
|
+
|
251
|
+
id.ingroup <- x$id.ingroup
|
252
|
+
id.counts <- x$id.counts
|
253
|
+
id.breaks <- x$id.breaks
|
254
|
+
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
|
255
|
+
id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
|
256
|
+
|
257
|
+
pos.counts.in <- x$pos.counts.in
|
258
|
+
pos.counts.out <- x$pos.counts.out
|
259
|
+
pos.breaks <- x$pos.breaks/pos.factor
|
260
|
+
pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
|
261
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
|
262
|
+
|
263
|
+
seqdepth.in <- pos.counts.in/pos.binsize
|
264
|
+
seqdepth.out <- pos.counts.out/pos.binsize
|
265
|
+
seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
|
266
|
+
seqdepth.out[seqdepth.out>0]))*c(1/2,2)
|
267
|
+
|
268
|
+
if(underlay.group){
|
269
|
+
in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
|
270
|
+
list(maxColorValue=256, alpha=62)));
|
271
|
+
out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
|
272
|
+
list(maxColorValue=256, alpha=52)));
|
273
|
+
}
|
274
|
+
|
275
|
+
# [1] Counts matrix
|
276
|
+
if(any(layout==1)){
|
277
|
+
par(mar=mar[['1']]);
|
278
|
+
plot(1, t='n', bty='l',
|
279
|
+
xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
|
280
|
+
xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
|
281
|
+
if(underlay.group){
|
282
|
+
rect(pos.lim[1], id.lim[1], pos.lim[2],
|
283
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
284
|
+
rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
|
285
|
+
id.lim[2], col=in.bg, border=NA);
|
286
|
+
}
|
287
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col);
|
288
|
+
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
289
|
+
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
290
|
+
length.out=1+length(palette)), add=TRUE);
|
291
|
+
if(exists('1',panel.fun)) panel.fun[['1']]();
|
292
|
+
}
|
293
|
+
|
294
|
+
# [2] Position histogram
|
295
|
+
if(any(layout==2)){
|
296
|
+
par(mar=mar[['2']]);
|
297
|
+
if(any(layout==1)){
|
298
|
+
xlab=''
|
299
|
+
xaxt='n'
|
300
|
+
}else{
|
301
|
+
xlab=paste('Position in genome (',pos.units,')',sep='')
|
302
|
+
xaxt='s'
|
303
|
+
}
|
304
|
+
plot(1,t='n', bty='l', log='y',
|
305
|
+
xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
|
306
|
+
ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
|
307
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col)
|
308
|
+
pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
|
309
|
+
pos.f <- rep(seqdepth.in,each=2)
|
310
|
+
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
311
|
+
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
312
|
+
if(is.null(pos.splines) || pos.splines > 0){
|
313
|
+
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
314
|
+
spar=pos.splines)
|
315
|
+
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
316
|
+
}
|
317
|
+
if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
|
318
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
|
319
|
+
seqdepth.lim[1]*3/2, col=out.col, border=NA);
|
320
|
+
if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
|
321
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
|
322
|
+
seqdepth.lim[1]*3/2, col=in.col, border=NA);
|
323
|
+
if(exists('2',panel.fun)) panel.fun[['2']]();
|
324
|
+
}
|
325
|
+
|
326
|
+
# [3] Identity histogram
|
327
|
+
if(any(layout==3)){
|
328
|
+
par(mar=mar[['3']]);
|
329
|
+
if(any(layout==1)){
|
330
|
+
ylab=''
|
331
|
+
yaxt='n'
|
332
|
+
}else{
|
333
|
+
ylab=x$id.metric
|
334
|
+
yaxt='s'
|
335
|
+
}
|
336
|
+
if(sum(id.counts>0) >= 4){
|
337
|
+
id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
|
338
|
+
plot(1,t='n', bty='l', log='x',
|
339
|
+
xlim=id.counts.range, xlab='bps per bin', xaxs='i',
|
340
|
+
ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
|
183
341
|
if(underlay.group){
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
}
|
189
|
-
abline(v=x$seq.breaks/pos.factor, col=breaks.col);
|
190
|
-
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
191
|
-
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
192
|
-
length.out=1+length(palette)), add=TRUE);
|
193
|
-
if(exists('1',panel.fun)) panel.fun[['1']]();
|
194
|
-
}
|
195
|
-
|
196
|
-
# [2] Position histogram
|
197
|
-
if(any(layout==2)){
|
198
|
-
par(mar=mar[['2']]);
|
199
|
-
if(any(layout==1)){
|
200
|
-
xlab=''
|
201
|
-
xaxt='n'
|
202
|
-
}else{
|
203
|
-
xlab=paste('Position in genome (',pos.units,')',sep='')
|
204
|
-
xaxt='s'
|
205
|
-
}
|
206
|
-
plot(1,t='n', bty='l', log='y',
|
207
|
-
xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
|
208
|
-
ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
|
209
|
-
abline(v=x$seq.breaks/pos.factor, col=breaks.col)
|
210
|
-
pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
|
211
|
-
pos.f <- rep(seqdepth.in,each=2)
|
212
|
-
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
213
|
-
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
214
|
-
if(is.null(pos.splines) || pos.splines > 0){
|
215
|
-
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
216
|
-
spar=pos.splines)
|
217
|
-
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
342
|
+
rect(id.counts.range[1], id.lim[1], id.counts.range[2],
|
343
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
344
|
+
rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
|
345
|
+
id.counts.range[2], id.lim[2], col=in.bg, border=NA);
|
218
346
|
}
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
if(
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
}
|
227
|
-
|
228
|
-
# [3] Identity histogram
|
229
|
-
if(any(layout==3)){
|
230
|
-
par(mar=mar[['3']]);
|
231
|
-
if(any(layout==1)){
|
232
|
-
ylab=''
|
233
|
-
yaxt='n'
|
234
|
-
}else{
|
235
|
-
ylab=x$id.metric
|
236
|
-
yaxt='s'
|
237
|
-
}
|
238
|
-
if(sum(id.counts>0) >= 4){
|
239
|
-
id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
|
240
|
-
plot(1,t='n', bty='l', log='x',
|
241
|
-
xlim=id.counts.range, xlab='bps per bin', xaxs='i',
|
242
|
-
ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
|
243
|
-
if(underlay.group){
|
244
|
-
rect(id.counts.range[1], id.lim[1], id.counts.range[2],
|
245
|
-
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
246
|
-
rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
|
247
|
-
id.counts.range[2], id.lim[2], col=in.bg, border=NA);
|
248
|
-
}
|
249
|
-
id.f <- rep(id.counts,each=2)
|
250
|
-
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
251
|
-
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
252
|
-
if(is.null(id.splines) || id.splines > 0){
|
253
|
-
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
254
|
-
spar=id.splines)
|
255
|
-
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
256
|
-
}
|
257
|
-
}else{
|
258
|
-
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
259
|
-
text(1,1,labels='Insufficient data', srt=90)
|
347
|
+
id.f <- rep(id.counts,each=2)
|
348
|
+
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
349
|
+
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
350
|
+
if(is.null(id.splines) || id.splines > 0){
|
351
|
+
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
352
|
+
spar=id.splines)
|
353
|
+
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
260
354
|
}
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
355
|
+
}else{
|
356
|
+
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
357
|
+
text(1,1,labels='Insufficient data', srt=90)
|
358
|
+
}
|
359
|
+
if(exists('3',panel.fun)) panel.fun[['3']]();
|
360
|
+
}
|
361
|
+
|
362
|
+
# [4] Populations histogram
|
363
|
+
peaks <- NA;
|
364
|
+
if(any(layout==4)){
|
365
|
+
par(mar=mar[['4']]);
|
366
|
+
if(any(layout==2)){
|
367
|
+
ylab=''
|
368
|
+
yaxt='n'
|
369
|
+
}else{
|
370
|
+
ylab='Sequencing depth (X)'
|
371
|
+
yaxt='s'
|
372
|
+
}
|
373
|
+
h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
|
374
|
+
length.out=200);
|
375
|
+
h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
|
376
|
+
h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
|
377
|
+
plot(1, t='n', log='y',
|
378
|
+
xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
|
379
|
+
xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
|
380
|
+
yaxt=yaxt)
|
381
|
+
y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
382
|
+
y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
383
|
+
lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
|
384
|
+
y.tmp.out, col=out.col)
|
385
|
+
polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
|
386
|
+
y.tmp.in, border=NA, col=in.col)
|
387
|
+
if(!is.na(peaks.col)){
|
388
|
+
o <- peaks.opts; o$x = x;
|
389
|
+
if(missing(use.peaks)){
|
390
|
+
peaks <- do.call(enve.recplot2.findPeaks, o)
|
271
391
|
}else{
|
272
|
-
|
273
|
-
yaxt='s'
|
392
|
+
peaks <- use.peaks
|
274
393
|
}
|
275
|
-
h.
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
for(i in 1:length(peaks)){
|
300
|
-
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
301
|
-
lines(cnt, h.mids, col='red');
|
302
|
-
pf <- pf+cnt;
|
303
|
-
axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
|
304
|
-
}
|
305
|
-
lines(pf, h.mids, col='red',lwd=1.5);
|
306
|
-
dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
|
307
|
-
frx <- signif(100*as.numeric(
|
308
|
-
lapply(peaks,
|
309
|
-
function(x) ifelse(length(x$values)==0, x$n.hat,
|
310
|
-
length(x$values))/x$n.total)), 2)
|
311
|
-
if(peaks[[1]]$err.res < 0){
|
312
|
-
err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
|
313
|
-
}else{
|
314
|
-
err <- paste(', err:',
|
315
|
-
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
|
316
|
-
}
|
317
|
-
legend('topright', bty='n', cex=1/2,
|
318
|
-
legend=paste(letters[1:length(peaks)],'. ',
|
319
|
-
dpt,'X (', frx, '%', err, ')', sep=''))
|
320
|
-
}
|
394
|
+
h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
|
395
|
+
if(!is.null(peaks) & length(peaks)>0){
|
396
|
+
pf <- h.mids*0;
|
397
|
+
for(i in 1:length(peaks)){
|
398
|
+
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
399
|
+
lines(cnt, h.mids, col='red');
|
400
|
+
pf <- pf+cnt;
|
401
|
+
axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
|
402
|
+
}
|
403
|
+
lines(pf, h.mids, col='red',lwd=1.5);
|
404
|
+
dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
|
405
|
+
frx <- signif(100*as.numeric(
|
406
|
+
lapply(peaks,
|
407
|
+
function(x) ifelse(length(x$values)==0, x$n.hat,
|
408
|
+
length(x$values))/x$n.total)), 2)
|
409
|
+
if(peaks[[1]]$err.res < 0){
|
410
|
+
err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
|
411
|
+
}else{
|
412
|
+
err <- paste(', err:',
|
413
|
+
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
|
414
|
+
}
|
415
|
+
legend('topright', bty='n', cex=1/2,
|
416
|
+
legend=paste(letters[1:length(peaks)],'. ',
|
417
|
+
dpt,'X (', frx, '%', err, ')', sep=''))
|
321
418
|
}
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
### `enve.recplot2.findPeaks`). If `peaks.col`=NA or `layout` doesn't include
|
351
|
-
### 4, returns NA.
|
419
|
+
}
|
420
|
+
if(exists('4',panel.fun)) panel.fun[['4']]();
|
421
|
+
}
|
422
|
+
|
423
|
+
# [5] Color scale of the counts matrix (vertical)
|
424
|
+
count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
|
425
|
+
length.out=1+length(palette))
|
426
|
+
if(any(layout==5)){
|
427
|
+
par(mar=mar[['5']]);
|
428
|
+
plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
|
429
|
+
ylim=range(count.bins), yaxs='i', ylab='')
|
430
|
+
rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
|
431
|
+
border=NA)
|
432
|
+
if(exists('5',panel.fun)) panel.fun[['5']]();
|
433
|
+
}
|
434
|
+
|
435
|
+
# [6] Color scale of the coutnts matrix (horizontal)
|
436
|
+
if(any(layout==6)){
|
437
|
+
par(mar=mar[['6']]);
|
438
|
+
plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
|
439
|
+
xlim=range(count.bins), xaxs='i',xlab='');
|
440
|
+
rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
|
441
|
+
border=NA);
|
442
|
+
if(exists('6',panel.fun)) panel.fun[['6']]();
|
443
|
+
}
|
444
|
+
|
445
|
+
par(mar=ori.mar);
|
446
|
+
return(peaks);
|
352
447
|
}
|
353
448
|
|
354
449
|
#==============> Define core functions
|
450
|
+
|
451
|
+
#' Enveomics: Recruitment Plot (2)
|
452
|
+
#'
|
453
|
+
#' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
|
454
|
+
#' been previously executed.
|
455
|
+
#'
|
456
|
+
#' @param prefix
|
457
|
+
#' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
|
458
|
+
#' least the files .rec and .lim must exist with this prefix.
|
459
|
+
#' @param plot
|
460
|
+
#' Should the object be plotted?
|
461
|
+
#' @param pos.breaks
|
462
|
+
#' Breaks in the positions histogram. It can also be a vector of break
|
463
|
+
#' points, and values outside the range are ignored. If zero (0), it
|
464
|
+
#' uses the sequence breaks as defined in the .lim file, which means
|
465
|
+
#' one bin per contig (or gene, if the mapping is agains genes). Ignored
|
466
|
+
#' if `pos.breaks.tsv` is passed.
|
467
|
+
#' @param pos.breaks.tsv
|
468
|
+
#' Path to a list of (absolute) coordinates to use as position breaks.
|
469
|
+
#' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
|
470
|
+
#' must contain at least one column: coordinates of the break positions of
|
471
|
+
#' each position bin. If it has a second column, this is used as the name
|
472
|
+
#' of the position bin that ends at the given coordinate (the first row is
|
473
|
+
#' ignored). Any additional columns are currently ignored. If \code{NA},
|
474
|
+
#' position bins are determined by \code{pos.breaks}.
|
475
|
+
#' @param id.breaks
|
476
|
+
#' Breaks in the identity histogram. It can also be a vector of break
|
477
|
+
#' points, and values outside the range are ignored.
|
478
|
+
#' @param id.free.range
|
479
|
+
#' Indicates that the range should be freely set from the observed
|
480
|
+
#' values. Otherwise, 70-100\% is included in the identity histogram
|
481
|
+
#' (default).
|
482
|
+
#' @param id.metric
|
483
|
+
#' Metric of identity to be used (Y-axis). Corrected identity is only
|
484
|
+
#' supported if the original BLAST file included sequence lengths.
|
485
|
+
#' @param id.summary
|
486
|
+
#' Function summarizing the identity bins. Other recommended options
|
487
|
+
#' include: \code{median} to estimate the median instead of total bins, and
|
488
|
+
#' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
|
489
|
+
#' @param id.cutoff
|
490
|
+
#' Cutoff of identity metric above which the hits are considered
|
491
|
+
#' \code{in-group}. The 95\% identity corresponds to the expectation of
|
492
|
+
#' ANI<95\% within species.
|
493
|
+
#' @param threads
|
494
|
+
#' Number of threads to use.
|
495
|
+
#' @param verbose
|
496
|
+
#' Indicates if the function should report the advance.
|
497
|
+
#' @param ...
|
498
|
+
#' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
|
499
|
+
#'
|
500
|
+
#' @return Returns an object of class \code{\link{enve.RecPlot2}}.
|
501
|
+
#'
|
502
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
503
|
+
#' @author Kenji Gerhardt [aut]
|
504
|
+
#'
|
505
|
+
#' @export
|
506
|
+
|
355
507
|
enve.recplot2 <- function(
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
pos.breaks.tsv=NA,
|
370
|
-
### Path to a list of (absolute) coordinates to use as position breaks.
|
371
|
-
### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
|
372
|
-
### must contain at least one column: coordinates of the break positions of
|
373
|
-
### each position bin. If it has a second column, this is used as the name
|
374
|
-
### of the position bin that ends at the given coordinate (the first row is
|
375
|
-
### ignored). Any additional columns are currently ignored. If NA,
|
376
|
-
### position bins are determined by `pos.breaks`.
|
377
|
-
id.breaks=300,
|
378
|
-
### Breaks in the identity histogram. It can also be a vector of break
|
379
|
-
### points, and values outside the range are ignored.
|
380
|
-
id.free.range=FALSE,
|
381
|
-
### Indicates that the range should be freely set from the observed
|
382
|
-
### values. Otherwise, 70-100% is included in the identity histogram
|
383
|
-
### (default).
|
384
|
-
id.metric=c('identity', 'corrected identity', 'bit score'),
|
385
|
-
### Metric of identity to be used (Y-axis). Corrected identity is only
|
386
|
-
### supported if the original BLAST file included sequence lengths.
|
387
|
-
id.summary=sum,
|
388
|
-
### Function summarizing the identity bins. Other recommended options
|
389
|
-
### include: `median` to estimate the median instead of total bins, and
|
390
|
-
### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
|
391
|
-
id.cutoff=95,
|
392
|
-
### Cutoff of identity metric above which the hits are considered
|
393
|
-
### 'in-group'. The 95% identity corresponds to the expectation of
|
394
|
-
### ANI<95% within species.
|
395
|
-
threads=2,
|
396
|
-
### Number of threads to use.
|
397
|
-
verbose=TRUE,
|
398
|
-
### Indicates if the function should report the advance.
|
399
|
-
...
|
400
|
-
### Any additional parameters supported by `plot.enve.RecPlot2`.
|
401
|
-
){
|
508
|
+
prefix,
|
509
|
+
plot = TRUE,
|
510
|
+
pos.breaks = 1e3,
|
511
|
+
pos.breaks.tsv = NA,
|
512
|
+
id.breaks = 60,
|
513
|
+
id.free.range = FALSE,
|
514
|
+
id.metric = c('identity', 'corrected identity', 'bit score'),
|
515
|
+
id.summary = sum,
|
516
|
+
id.cutoff = 95,
|
517
|
+
threads = 2,
|
518
|
+
verbose = TRUE,
|
519
|
+
...
|
520
|
+
){
|
402
521
|
# Settings
|
403
522
|
id.metric <- match.arg(id.metric);
|
404
523
|
|
405
524
|
#Read files
|
406
|
-
if(verbose) cat("Reading files.\n")
|
407
|
-
rec <- read.table(paste(prefix, ".rec", sep
|
408
|
-
|
409
|
-
lim <- read.table(paste(prefix, ".lim", sep
|
410
|
-
|
525
|
+
if (verbose) cat("Reading files.\n")
|
526
|
+
rec <- read.table(paste(prefix, ".rec", sep = ""),
|
527
|
+
sep = "\t", comment.char = "", quote = "");
|
528
|
+
lim <- read.table(paste(prefix, ".lim", sep = ""),
|
529
|
+
sep = "\t", comment.char = "", quote = "", as.is = TRUE);
|
411
530
|
|
412
531
|
# Build matrix
|
413
|
-
if(verbose) cat("Building counts matrix.\n")
|
414
|
-
if(id.metric=="corrected identity" & ncol(rec)<6){
|
532
|
+
if (verbose) cat("Building counts matrix.\n")
|
533
|
+
if (id.metric == "corrected identity" & ncol(rec) < 6) {
|
415
534
|
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
416
535
|
}
|
417
|
-
rec.idcol <- ifelse(id.metric=="identity", 3,
|
418
|
-
|
536
|
+
rec.idcol <- ifelse(id.metric == "identity", 3,
|
537
|
+
ifelse(id.metric == "corrected identity", 6, 4))
|
419
538
|
pos.names <- as.character(NULL)
|
420
|
-
if(!is.na(pos.breaks.tsv)){
|
421
|
-
tmp <- read.table(pos.breaks.tsv, sep=
|
422
|
-
pos.breaks <- as.numeric(tmp[,1])
|
423
|
-
if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
|
424
|
-
}else if(length(pos.breaks)==1){
|
425
|
-
if(pos.breaks>0){
|
426
|
-
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1)
|
427
|
-
}else{
|
428
|
-
pos.breaks <- c(lim[1,2], lim[,3])
|
429
|
-
pos.names <- lim[,1]
|
539
|
+
if (!is.na(pos.breaks.tsv)){
|
540
|
+
tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
|
541
|
+
pos.breaks <- as.numeric(tmp[, 1])
|
542
|
+
if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
|
543
|
+
} else if (length(pos.breaks) == 1) {
|
544
|
+
if (pos.breaks > 0){
|
545
|
+
pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
|
546
|
+
} else {
|
547
|
+
pos.breaks <- c(lim[1, 2], lim[, 3])
|
548
|
+
pos.names <- lim[, 1]
|
430
549
|
}
|
431
550
|
}
|
432
|
-
if(length(id.breaks)==1){
|
433
|
-
id.range.v <- rec[,rec.idcol]
|
434
|
-
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
551
|
+
if (length(id.breaks) == 1) {
|
552
|
+
id.range.v <- rec[, rec.idcol]
|
553
|
+
if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
|
435
554
|
id.range.v <- range(id.range.v)
|
436
|
-
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1)
|
555
|
+
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
|
437
556
|
}
|
438
557
|
|
439
558
|
# Run in parallel
|
440
|
-
|
441
|
-
|
442
|
-
rec
|
443
|
-
|
444
|
-
|
445
|
-
rec.l[[
|
446
|
-
|
447
|
-
|
559
|
+
# If they already set threads to 1 manually, there's no point in launching
|
560
|
+
# clusters, it's just slower. Ditto for small files.
|
561
|
+
if (nrow(rec) < 75000 | threads == 1) {
|
562
|
+
# Coerces rec into a form that __counts is happy about
|
563
|
+
rec.l <- list()
|
564
|
+
rec.l[[1]] <- list(rec = rec, verbose = FALSE)
|
565
|
+
|
566
|
+
# No need to make a temporary variable, there's only one return for sure
|
567
|
+
# and it's not a list because it isn't coming back from an apply
|
568
|
+
counts <- enve.recplot2.__counts(
|
569
|
+
rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
|
570
|
+
rec.idcol = rec.idcol)
|
571
|
+
} else {
|
572
|
+
cl <- makeCluster(threads)
|
573
|
+
rec.l <- list()
|
574
|
+
thl <- ceiling(nrow(rec)/threads)
|
575
|
+
for (i in 0:(threads - 1)) {
|
576
|
+
rec.l[[i + 1]] <- list(
|
577
|
+
rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
|
578
|
+
verbose = ifelse(i == 0, verbose, FALSE))
|
579
|
+
}
|
580
|
+
counts.l <- clusterApply(
|
581
|
+
cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
|
582
|
+
id.breaks = id.breaks, rec.idcol = rec.idcol)
|
583
|
+
stopCluster(cl) # No spooky ghost clusters
|
584
|
+
|
585
|
+
counts <- counts.l[[1]]
|
586
|
+
for (i in 2:threads) counts <- counts + counts.l[[i]]
|
448
587
|
}
|
449
|
-
counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
|
450
|
-
pos.breaks=pos.breaks, id.breaks=id.breaks,
|
451
|
-
rec.idcol=rec.idcol)
|
452
|
-
counts <- counts.l[[1]]
|
453
|
-
if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
|
454
|
-
stopCluster(cl)
|
455
588
|
|
456
589
|
# Estimate 1D histograms
|
457
|
-
if(verbose) cat("Building histograms.\n")
|
458
|
-
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
|
590
|
+
if (verbose) cat("Building histograms.\n")
|
591
|
+
id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
|
459
592
|
id.ingroup <- (id.mids > id.cutoff);
|
460
593
|
id.counts <- apply(counts, 2, id.summary);
|
461
|
-
pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
|
462
|
-
pos.counts.out <- apply(counts[
|
594
|
+
pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
|
595
|
+
pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
|
463
596
|
|
464
597
|
# Plot and return
|
465
598
|
recplot <- new('enve.RecPlot2',
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
if(plot){
|
473
|
-
if(verbose) cat("Plotting.\n")
|
599
|
+
counts = counts, id.counts = id.counts,
|
600
|
+
pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
|
601
|
+
id.breaks = id.breaks, pos.breaks = pos.breaks,
|
602
|
+
pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
|
603
|
+
seq.names = lim[, 1], id.ingroup = id.ingroup,
|
604
|
+
id.metric = id.metric, call = match.call());
|
605
|
+
if (plot) {
|
606
|
+
if (verbose) cat("Plotting.\n")
|
474
607
|
peaks <- plot(recplot, ...);
|
475
608
|
attr(recplot, "peaks") <- peaks
|
476
609
|
}
|
477
610
|
return(recplot);
|
478
|
-
### Returns an object of class `enve.RecPlot2`.
|
479
611
|
}
|
480
612
|
|
613
|
+
#' Enveomics: Recruitment Plot (2) Peak Finder
|
614
|
+
#'
|
615
|
+
#' Identifies peaks in the population histogram potentially indicating
|
616
|
+
#' sub-population mixtures.
|
617
|
+
#'
|
618
|
+
#' @param x
|
619
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
620
|
+
#' @param method
|
621
|
+
#' Peak-finder method. This should be one of:
|
622
|
+
#' \itemize{
|
623
|
+
#' \item \strong{emauto}
|
624
|
+
#' (Expectation-Maximization with auto-selection of components)
|
625
|
+
#' \item \strong{em}
|
626
|
+
#' (Expectation-Maximization)
|
627
|
+
#' \item \strong{mower}
|
628
|
+
#' (Custom distribution-mowing method)
|
629
|
+
#' }
|
630
|
+
#' @param ...
|
631
|
+
#' Any additional parameters supported by
|
632
|
+
#' \code{\link{enve.recplot2.findPeaks}}.
|
633
|
+
#'
|
634
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
635
|
+
#'
|
636
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
637
|
+
#'
|
638
|
+
#' export
|
639
|
+
|
481
640
|
enve.recplot2.findPeaks <- function(
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
method="emauto",
|
487
|
-
### Peak-finder method. This should be one of:
|
488
|
-
### "emauto" (Expectation-Maximization with auto-selection of components),
|
489
|
-
### "em" (Expectation-Maximization),
|
490
|
-
### "mower" (Custom distribution-mowing method).
|
491
|
-
...
|
492
|
-
### Any additional parameters supported by
|
493
|
-
### `enve.recplot2.findPeaks.<method>`.
|
494
|
-
){
|
641
|
+
x,
|
642
|
+
method="emauto",
|
643
|
+
...
|
644
|
+
){
|
495
645
|
if(method == "emauto"){
|
496
646
|
peaks <- enve.recplot2.findPeaks.emauto(x, ...)
|
497
647
|
}else if(method == "em"){
|
@@ -502,29 +652,44 @@ enve.recplot2.findPeaks <- function(
|
|
502
652
|
stop("Invalid peak-finder method ", method)
|
503
653
|
}
|
504
654
|
return(peaks)
|
505
|
-
### Returns a list of `enve.RecPlot2.Peak` objects.
|
506
655
|
}
|
507
656
|
|
657
|
+
#' Enveomics: Recruitment Plot (2) Emauto Peak Finder
|
658
|
+
#'
|
659
|
+
#' Identifies peaks in the population histogram using a Gaussian Mixture
|
660
|
+
#' Model Expectation Maximization (GMM-EM) method with number of components
|
661
|
+
#' automatically detected.
|
662
|
+
#'
|
663
|
+
#' @param x
|
664
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
665
|
+
#' @param components
|
666
|
+
#' A vector of number of components to evaluate.
|
667
|
+
#' @param criterion
|
668
|
+
#' Criterion to use for components selection. Must be one of:
|
669
|
+
#' \code{aic} (Akaike Information Criterion),
|
670
|
+
#' \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).
|
671
|
+
#' @param merge.tol
|
672
|
+
#' When attempting to merge peaks with very similar sequencing depth, use
|
673
|
+
#' this number of significant digits (in log-scale).
|
674
|
+
#' @param verbose
|
675
|
+
#' Display (mostly debugging) information.
|
676
|
+
#' @param ...
|
677
|
+
#' Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.
|
678
|
+
#'
|
679
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
680
|
+
#'
|
681
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
682
|
+
#'
|
683
|
+
#' @export
|
684
|
+
|
508
685
|
enve.recplot2.findPeaks.emauto <- function(
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
criterion='aic',
|
517
|
-
### Criterion to use for components selection. Must be one of:
|
518
|
-
### 'aic' (Akaike Information Criterion),
|
519
|
-
### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
|
520
|
-
merge.tol=2L,
|
521
|
-
### When attempting to merge peaks with very similar sequencing depth, use
|
522
|
-
### this number of significant digits (in log-scale).
|
523
|
-
verbose=FALSE,
|
524
|
-
### Display (mostly debugging) information.
|
525
|
-
...
|
526
|
-
### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
|
527
|
-
){
|
686
|
+
x,
|
687
|
+
components=seq(1,10),
|
688
|
+
criterion='aic',
|
689
|
+
merge.tol=2L,
|
690
|
+
verbose=FALSE,
|
691
|
+
...
|
692
|
+
){
|
528
693
|
best <- list(crit=0, pstore=list())
|
529
694
|
if(criterion == 'aic'){
|
530
695
|
do_crit <- function(ll, k, n) 2*k - 2*ll
|
@@ -536,55 +701,72 @@ enve.recplot2.findPeaks.emauto <- function(
|
|
536
701
|
for(comp in components){
|
537
702
|
if(verbose) cat('Testing:',comp,'\n')
|
538
703
|
best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
|
539
|
-
|
704
|
+
verbose, ...)
|
540
705
|
}
|
541
706
|
if(length(best[['peaks']])==0) return(list())
|
542
707
|
|
543
708
|
seqdepths.r <- signif(log(sapply(best[['peaks']],
|
544
|
-
|
709
|
+
function(x) x$seq.depth)), merge.tol)
|
545
710
|
distinct <- length(unique(seqdepths.r))
|
546
711
|
if(distinct < length(best[['peaks']])){
|
547
712
|
if(verbose) cat('Attempting merge to', distinct, 'components\n')
|
548
713
|
init <- apply(sapply(best[['peaks']],
|
549
|
-
|
714
|
+
function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
|
550
715
|
init <- init[!duplicated(seqdepths.r),]
|
551
716
|
init <- list(mu=init[,'mean'], sd=init[,'sd'],
|
552
|
-
|
717
|
+
alpha=init[,'alpha']/sum(init[,'alpha']))
|
553
718
|
best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
|
554
|
-
|
719
|
+
verbose, ...)
|
555
720
|
}
|
556
721
|
return(best[['peaks']])
|
557
|
-
### Returns a list of `enve.RecPlot2.Peak` objects.
|
558
722
|
}
|
559
723
|
|
724
|
+
#' Enveomics: Recruitment Plot (2) Em Peak Finder
|
725
|
+
#'
|
726
|
+
#' Identifies peaks in the population histogram using a Gaussian Mixture
|
727
|
+
#' Model Expectation Maximization (GMM-EM) method.
|
728
|
+
#'
|
729
|
+
#' @param x
|
730
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
731
|
+
#' @param max.iter
|
732
|
+
#' Maximum number of EM iterations.
|
733
|
+
#' @param ll.diff.res
|
734
|
+
#' Maximum Log-Likelihood difference to be considered as convergent.
|
735
|
+
#' @param components
|
736
|
+
#' Number of distributions assumed in the mixture.
|
737
|
+
#' @param rm.top
|
738
|
+
#' Top-values to remove before finding peaks, as a quantile probability.
|
739
|
+
#' This step is useful to remove highly conserved regions, but can be
|
740
|
+
#' turned off by setting \code{rm.top=0}. The quantile is determined
|
741
|
+
#' \strong{after} removing zero-coverage windows.
|
742
|
+
#' @param verbose
|
743
|
+
#' Display (mostly debugging) information.
|
744
|
+
#' @param init
|
745
|
+
#' Initialization parameters. By default, these are derived from k-means
|
746
|
+
#' clustering. A named list with vectors for \code{mu}, \code{sd}, and
|
747
|
+
#' \code{alpha}, each of length \code{components}.
|
748
|
+
#' @param log
|
749
|
+
#' Logical value indicating if the estimations should be performed in
|
750
|
+
#' natural logarithm units. Do not change unless you know what you're
|
751
|
+
#' doing.
|
752
|
+
#'
|
753
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
754
|
+
#'
|
755
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
756
|
+
#'
|
757
|
+
#' @export
|
758
|
+
|
560
759
|
enve.recplot2.findPeaks.em <- function(
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
rm.top=0.05,
|
572
|
-
### Top-values to remove before finding peaks, as a quantile probability.
|
573
|
-
### This step is useful to remove highly conserved regions, but can be
|
574
|
-
### turned off by setting rm.top=0. The quantile is determined *after*
|
575
|
-
### removing zero-coverage windows.
|
576
|
-
verbose=FALSE,
|
577
|
-
### Display (mostly debugging) information.
|
578
|
-
init,
|
579
|
-
### Initialization parameters. By default, these are derived from k-means
|
580
|
-
### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
|
581
|
-
### of length `components`.
|
582
|
-
log=TRUE
|
583
|
-
### Logical value indicating if the estimations should be performed in
|
584
|
-
### natural logarithm units. Do not change unless you know what you're
|
585
|
-
### doing.
|
586
|
-
){
|
587
|
-
|
760
|
+
x,
|
761
|
+
max.iter=1000,
|
762
|
+
ll.diff.res=1e-8,
|
763
|
+
components=2,
|
764
|
+
rm.top=0.05,
|
765
|
+
verbose=FALSE,
|
766
|
+
init,
|
767
|
+
log=TRUE
|
768
|
+
){
|
769
|
+
|
588
770
|
# Essential vars
|
589
771
|
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
|
590
772
|
lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
|
@@ -603,7 +785,7 @@ enve.recplot2.findPeaks.em <- function(
|
|
603
785
|
m.step <- init
|
604
786
|
ll <- c()
|
605
787
|
cur.ll <- -Inf
|
606
|
-
|
788
|
+
|
607
789
|
for(i in 1:max.iter){
|
608
790
|
# 2/3. EM
|
609
791
|
e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
|
@@ -621,224 +803,297 @@ enve.recplot2.findPeaks.em <- function(
|
|
621
803
|
for(i in 1:components){
|
622
804
|
n.hat <- m.step[['alpha']][i]*length(lsd1)
|
623
805
|
peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
806
|
+
values.res=0, mode=m.step[['mu']][i],
|
807
|
+
param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
|
808
|
+
n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
|
809
|
+
merge.logdist=as.numeric(), log=log,
|
810
|
+
seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
|
629
811
|
}
|
630
812
|
return(peaks)
|
631
|
-
### Returns a list of `enve.RecPlot2.Peak` objects.
|
632
813
|
}
|
633
814
|
|
815
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder
|
816
|
+
#'
|
817
|
+
#' Identifies peaks in the population histogram potentially indicating
|
818
|
+
#' sub-population mixtures, using a custom distribution-mowing method.
|
819
|
+
#'
|
820
|
+
#' @param x
|
821
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
822
|
+
#' @param min.points
|
823
|
+
#' Minimum number of points in the quantile-estimation-range
|
824
|
+
#' \code{(quant.est)} to estimate a peak.
|
825
|
+
#' @param quant.est
|
826
|
+
#' Range of quantiles to be used in the estimation of a peak's
|
827
|
+
#' parameters.
|
828
|
+
#' @param mlv.opts
|
829
|
+
#' Ignored. For backwards compatibility.
|
830
|
+
#' @param fitdist.opts.sn
|
831
|
+
#' Options passed to \code{fitdist} to estimate the standard deviation if
|
832
|
+
#' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
|
833
|
+
#' ammended with \code{xi=estimated} mode for each peak.
|
834
|
+
#' @param fitdist.opts.norm
|
835
|
+
#' Options passed to \code{fitdist} to estimate the standard deviation if
|
836
|
+
#' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
|
837
|
+
#' ammended with \code{mean=estimated} mode for each peak.
|
838
|
+
#' @param rm.top
|
839
|
+
#' Top-values to remove before finding peaks, as a quantile probability.
|
840
|
+
#' This step is useful to remove highly conserved regions, but can be
|
841
|
+
#' turned off by setting \code{rm.top=0}. The quantile is determined
|
842
|
+
#' \strong{after} removing zero-coverage windows.
|
843
|
+
#' @param with.skewness
|
844
|
+
#' Allow skewness correction of the peaks. Typically, the
|
845
|
+
#' sequencing-depth distribution for a single peak is left-skewed, due
|
846
|
+
#' partly (but not exclusively) to fragmentation and mapping sensitivity.
|
847
|
+
#' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
|
848
|
+
#' alternative solution for the first problem (fragmentation) called
|
849
|
+
#' "tail distribution".
|
850
|
+
#' @param optim.rounds
|
851
|
+
#' Maximum rounds of peak optimization.
|
852
|
+
#' @param optim.epsilon
|
853
|
+
#' Trace change at which optimization stops (unless \code{optim.rounds} is
|
854
|
+
#' reached first). The trace change is estimated as the sum of square
|
855
|
+
#' differences between parameters in one round and those from two rounds
|
856
|
+
#' earlier (to avoid infinite loops from approximation).
|
857
|
+
#' @param merge.logdist
|
858
|
+
#' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
|
859
|
+
#' to attempt merging. The default of ~0.22 corresponds to a maximum
|
860
|
+
#' difference of 25\%.
|
861
|
+
#' @param verbose
|
862
|
+
#' Display (mostly debugging) information.
|
863
|
+
#' @param log
|
864
|
+
#' Logical value indicating if the estimations should be performed in
|
865
|
+
#' natural logarithm units. Do not change unless you know what you're
|
866
|
+
#' doing.
|
867
|
+
#'
|
868
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
869
|
+
#'
|
870
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
871
|
+
#'
|
872
|
+
#' @export
|
873
|
+
|
634
874
|
enve.recplot2.findPeaks.mower <- function(
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
fitdist.opts <- fitdist.opts.norm
|
698
|
-
}
|
699
|
-
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
700
|
-
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
701
|
-
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
702
|
-
n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
|
703
|
-
|
704
|
-
# Find seed peaks
|
705
|
-
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
706
|
-
peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
|
707
|
-
|
708
|
-
# Merge overlapping peaks
|
709
|
-
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
710
|
-
merged <- (length(peaks)>1)
|
711
|
-
while(merged){
|
712
|
-
merged <- FALSE
|
713
|
-
ignore <- c()
|
714
|
-
peaks2 <- list();
|
715
|
-
for(i in 1:length(peaks)){
|
716
|
-
if(i %in% ignore) next
|
717
|
-
p <- peaks[[ i ]]
|
718
|
-
j <- enve.recplot2.__whichClosestPeak(p, peaks)
|
719
|
-
p2 <- peaks[[ j ]]
|
720
|
-
dst.a <- p$param.hat[[ length(p$param.hat) ]]
|
721
|
-
dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
|
722
|
-
if( abs(log(dst.a/dst.b)) < merge.logdist ){
|
723
|
-
if(verbose) cat('==> Attempting a merge at',
|
724
|
-
p$param.hat[[ length(p$param.hat) ]],'&',
|
725
|
-
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
726
|
-
peaks.opts$lsd1 <- c(p$values, p2$values)
|
727
|
-
p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
|
728
|
-
if(length(p.new)==1){
|
729
|
-
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
730
|
-
ignore <- c(ignore, j)
|
731
|
-
merged <- TRUE
|
732
|
-
}
|
733
|
-
}
|
734
|
-
if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
|
875
|
+
x,
|
876
|
+
min.points=10,
|
877
|
+
quant.est=c(0.002, 0.998),
|
878
|
+
mlv.opts=list(method='parzen'),
|
879
|
+
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
880
|
+
start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
|
881
|
+
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
|
882
|
+
start=list(sd=1), lower=c(0, -Inf)),
|
883
|
+
rm.top=0.05,
|
884
|
+
with.skewness=TRUE,
|
885
|
+
optim.rounds=200,
|
886
|
+
optim.epsilon=1e-4,
|
887
|
+
merge.logdist=log(1.75),
|
888
|
+
verbose=FALSE,
|
889
|
+
log=TRUE
|
890
|
+
){
|
891
|
+
|
892
|
+
# Essential vars
|
893
|
+
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
|
894
|
+
seqdepth.in <- x$pos.counts.in/pos.binsize;
|
895
|
+
lsd1 <- seqdepth.in[seqdepth.in>0];
|
896
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
897
|
+
if(log) lsd1 <- log(lsd1)
|
898
|
+
if(with.skewness){
|
899
|
+
fitdist.opts <- fitdist.opts.sn
|
900
|
+
}else{
|
901
|
+
fitdist.opts <- fitdist.opts.norm
|
902
|
+
}
|
903
|
+
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
904
|
+
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
905
|
+
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
906
|
+
n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
|
907
|
+
|
908
|
+
# Find seed peaks
|
909
|
+
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
910
|
+
peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
|
911
|
+
|
912
|
+
# Merge overlapping peaks
|
913
|
+
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
914
|
+
merged <- (length(peaks)>1)
|
915
|
+
while(merged){
|
916
|
+
merged <- FALSE
|
917
|
+
ignore <- c()
|
918
|
+
peaks2 <- list();
|
919
|
+
for(i in 1:length(peaks)){
|
920
|
+
if(i %in% ignore) next
|
921
|
+
p <- peaks[[ i ]]
|
922
|
+
j <- enve.recplot2.__whichClosestPeak(p, peaks)
|
923
|
+
p2 <- peaks[[ j ]]
|
924
|
+
dst.a <- p$param.hat[[ length(p$param.hat) ]]
|
925
|
+
dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
|
926
|
+
if( abs(log(dst.a/dst.b)) < merge.logdist ){
|
927
|
+
if(verbose) cat('==> Attempting a merge at',
|
928
|
+
p$param.hat[[ length(p$param.hat) ]],'&',
|
929
|
+
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
930
|
+
peaks.opts$lsd1 <- c(p$values, p2$values)
|
931
|
+
p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
|
932
|
+
if(length(p.new)==1){
|
933
|
+
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
934
|
+
ignore <- c(ignore, j)
|
935
|
+
merged <- TRUE
|
936
|
+
}
|
735
937
|
}
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
938
|
+
if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
|
939
|
+
}
|
940
|
+
peaks <- peaks2
|
941
|
+
if(length(peaks)==1) break
|
942
|
+
}
|
943
|
+
|
944
|
+
if(verbose) cat('Found',length(peaks),'peak(s)\n')
|
945
|
+
return(peaks);
|
743
946
|
}
|
744
947
|
|
745
948
|
#==============> Define utils
|
949
|
+
|
950
|
+
#' Enveomics: Recruitment Plot (2) Core Peak Finder
|
951
|
+
#'
|
952
|
+
#' Finds the peak in a list of peaks that is most likely to represent the
|
953
|
+
#' "core genome" of a population.
|
954
|
+
#'
|
955
|
+
#' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
|
956
|
+
#'
|
957
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
958
|
+
#'
|
959
|
+
#' @export
|
960
|
+
|
746
961
|
enve.recplot2.corePeak <- function
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
|
771
|
-
}
|
772
|
-
return(corePeak)
|
962
|
+
(x
|
963
|
+
){
|
964
|
+
# Find the peak with maximum depth (centrality)
|
965
|
+
maxPeak <- x[[
|
966
|
+
which.max(as.numeric(lapply(x,
|
967
|
+
function(y) y$param.hat[[ length(y$param.hat) ]])))
|
968
|
+
]]
|
969
|
+
# If a "larger" peak (a peak explaining more bins of the genome) is within
|
970
|
+
# the default "merge.logdist" distance, take that one instead.
|
971
|
+
corePeak <- maxPeak
|
972
|
+
for(p in x){
|
973
|
+
p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
|
974
|
+
corePeak.len <- ifelse(
|
975
|
+
length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
|
976
|
+
sz.d <- log(p.len/corePeak.len)
|
977
|
+
if(is.nan(sz.d) || sz.d < 0) next
|
978
|
+
sq.d.a <- as.numeric(tail(p$param.hat, n=1))
|
979
|
+
sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
|
980
|
+
if(p$log) sq.d.a <- exp(sq.d.a)
|
981
|
+
if(corePeak$log) sq.d.b <- exp(sq.d.b)
|
982
|
+
if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
|
983
|
+
}
|
984
|
+
return(corePeak)
|
773
985
|
}
|
774
986
|
|
987
|
+
#' Enveomics: Recruitment Plot (2) Change Cutoff
|
988
|
+
#'
|
989
|
+
#' Change the intra-species cutoff of an existing recruitment plot.
|
990
|
+
#'
|
991
|
+
#' @param rp
|
992
|
+
#' \code{\link{enve.RecPlot2}} object.
|
993
|
+
#' @param new.cutoff
|
994
|
+
#' New cutoff to use.
|
995
|
+
#'
|
996
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
997
|
+
#'
|
998
|
+
#' @export
|
999
|
+
|
775
1000
|
enve.recplot2.changeCutoff <- function
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
attr(rp, "pos.counts.out") <- pos.counts.out
|
791
|
-
attr(rp, "call") <- match.call()
|
792
|
-
return(rp)
|
1001
|
+
(rp,
|
1002
|
+
new.cutoff=98
|
1003
|
+
){
|
1004
|
+
# Re-calculate vectors
|
1005
|
+
id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
|
1006
|
+
id.ingroup <- (id.mids > new.cutoff)
|
1007
|
+
pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
|
1008
|
+
pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
|
1009
|
+
# Update object
|
1010
|
+
attr(rp, "id.ingroup") <- id.ingroup
|
1011
|
+
attr(rp, "pos.counts.in") <- pos.counts.in
|
1012
|
+
attr(rp, "pos.counts.out") <- pos.counts.out
|
1013
|
+
attr(rp, "call") <- match.call()
|
1014
|
+
return(rp)
|
793
1015
|
}
|
794
1016
|
|
1017
|
+
#' Enveomics: Recruitment Plot (2) Window Depth Threshold
|
1018
|
+
#'
|
1019
|
+
#' Identifies the threshold below which windows should be identified as
|
1020
|
+
#' variable or absent.
|
1021
|
+
#'
|
1022
|
+
#' @param rp
|
1023
|
+
#' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
|
1024
|
+
#' @param peak
|
1025
|
+
#' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
|
1026
|
+
#' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
|
1027
|
+
#' used (see \code{\link{enve.recplot2.corePeak}}).
|
1028
|
+
#' @param lower.tail
|
1029
|
+
#' If \code{FALSE}, it returns windows significantly above the peak in
|
1030
|
+
#' sequencing depth.
|
1031
|
+
#' @param significance
|
1032
|
+
#' Significance threshold (alpha) to select windows.
|
1033
|
+
#'
|
1034
|
+
#' @return
|
1035
|
+
#' Returns a float. The units are depth if the peaks were estimated in
|
1036
|
+
#' linear scale, or log-depth otherwise (\code{peak$log}).
|
1037
|
+
#'
|
1038
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1039
|
+
#'
|
1040
|
+
#' @export
|
1041
|
+
|
795
1042
|
enve.recplot2.windowDepthThreshold <- function
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
### Significance threshold (alpha) to select windows.
|
809
|
-
){
|
810
|
-
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
811
|
-
par <- peak$param.hat
|
812
|
-
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
813
|
-
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
814
|
-
if(peak$log) thr <- exp(thr)
|
815
|
-
|
816
|
-
return(thr)
|
817
|
-
### Returns a float. The units are depth if the peaks were estimated in
|
818
|
-
### linear scale, or log-depth otherwise (`peak$log`).
|
1043
|
+
(rp,
|
1044
|
+
peak,
|
1045
|
+
lower.tail=TRUE,
|
1046
|
+
significance=0.05
|
1047
|
+
){
|
1048
|
+
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
1049
|
+
par <- peak$param.hat
|
1050
|
+
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
1051
|
+
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
1052
|
+
if(peak$log) thr <- exp(thr)
|
1053
|
+
|
1054
|
+
return(thr)
|
819
1055
|
}
|
820
1056
|
|
1057
|
+
#' Enveomics: Recruitment Plot (2) Extract Windows
|
1058
|
+
#'
|
1059
|
+
#' Extract windows significantly below (or above) the peak in sequencing
|
1060
|
+
#' depth.
|
1061
|
+
#'
|
1062
|
+
#' @param rp
|
1063
|
+
#' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
|
1064
|
+
#' @param peak
|
1065
|
+
#' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
|
1066
|
+
#' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
|
1067
|
+
#' used (see \code{\link{enve.recplot2.corePeak}}).
|
1068
|
+
#' @param lower.tail
|
1069
|
+
#' If \code{FALSE}, it returns windows significantly above the peak in
|
1070
|
+
#' sequencing depth.
|
1071
|
+
#' @param significance
|
1072
|
+
#' Significance threshold (alpha) to select windows.
|
1073
|
+
#' @param seq.names
|
1074
|
+
#' Returns subject sequence names instead of a vector of Booleans. If
|
1075
|
+
#' the recruitment plot was generated with named position bins (e.g, using
|
1076
|
+
#' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
|
1077
|
+
#' vector of characters (the sequence identifiers), otherwise it returns a
|
1078
|
+
#' data.frame with a name column and two columns of coordinates.
|
1079
|
+
#'
|
1080
|
+
#' @return
|
1081
|
+
#' Returns a vector of logicals if \code{seq.names = FALSE}.
|
1082
|
+
#' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
|
1083
|
+
#' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
|
1084
|
+
#' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
|
1085
|
+
#'
|
1086
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1087
|
+
#'
|
1088
|
+
#' @export
|
1089
|
+
|
821
1090
|
enve.recplot2.extractWindows <- function
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
|
829
|
-
### used (see `enve.recplot2.corePeak`).
|
830
|
-
lower.tail=TRUE,
|
831
|
-
### If FALSE, it returns windows significantly above the peak in
|
832
|
-
### sequencing depth.
|
833
|
-
significance=0.05,
|
834
|
-
### Significance threshold (alpha) to select windows.
|
835
|
-
seq.names=FALSE
|
836
|
-
### Returns subject sequence names instead of a vector of Booleans. If
|
837
|
-
### the recruitment plot was generated with named position bins (e.g, using
|
838
|
-
### `pos.breaks`=0 or a two-column `pos.breaks.tsv`), it returns a vector of
|
839
|
-
### characters (the sequence identifiers), otherwise it returns a data.frame
|
840
|
-
### with a name column and two columns of coordinates.
|
841
|
-
){
|
1091
|
+
(rp,
|
1092
|
+
peak,
|
1093
|
+
lower.tail = TRUE,
|
1094
|
+
significance = 0.05,
|
1095
|
+
seq.names = FALSE
|
1096
|
+
){
|
842
1097
|
# Determine the threshold
|
843
1098
|
thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
|
844
1099
|
|
@@ -850,43 +1105,54 @@ enve.recplot2.extractWindows <- function
|
|
850
1105
|
sel <- seqdepth.in > thr
|
851
1106
|
}
|
852
1107
|
|
853
|
-
# seq.names=FALSE
|
1108
|
+
# seq.names = FALSE
|
854
1109
|
if(!seq.names) return(sel)
|
855
|
-
# seq.names=TRUE
|
856
|
-
|
857
|
-
# seq.names=TRUE and pos.names undefined
|
858
|
-
return(enve.recplot2.coordinates(rp,sel))
|
859
|
-
### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
|
860
|
-
### it returns a vector of characters if the object has `pos.names` defined,
|
861
|
-
### or a data.frame with four columns otherwise:
|
862
|
-
### name.from, name.to, pos.from, and pos.to
|
863
|
-
### (see `enve.recplot2.coordinates`).
|
1110
|
+
# seq.names = TRUE
|
1111
|
+
return(enve.recplot2.coordinates(rp, sel))
|
864
1112
|
}
|
865
1113
|
|
1114
|
+
#' Enveomics: Recruitment Plot (2) Compare Identities
|
1115
|
+
#'
|
1116
|
+
#' Compare the distribution of identities between two
|
1117
|
+
#' \code{\link{enve.RecPlot2}} objects.
|
1118
|
+
#'
|
1119
|
+
#' @param x
|
1120
|
+
#' First \code{\link{enve.RecPlot2}} object.
|
1121
|
+
#' @param y
|
1122
|
+
#' Second \code{\link{enve.RecPlot2}} object.
|
1123
|
+
#' @param method
|
1124
|
+
#' Distance method to use. This should be (an unambiguous abbreviation of)
|
1125
|
+
#' one of:
|
1126
|
+
#' \itemize{
|
1127
|
+
#' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
|
1128
|
+
#' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
|
1129
|
+
#' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
|
1130
|
+
#' doi:10.1214/aoms/1177729694}), or}
|
1131
|
+
#' \item{"euclidean"}
|
1132
|
+
#' }
|
1133
|
+
#' @param smooth.par
|
1134
|
+
#' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
|
1135
|
+
#' Use \code{NULL} to automatically determine this value using leave-one-out
|
1136
|
+
#' cross-validation (see \code{smooth.spline} parameter \code{spar}).
|
1137
|
+
#' @param pseudocounts
|
1138
|
+
#' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
|
1139
|
+
#' 1 for add-one smoothing.
|
1140
|
+
#' @param max.deviation
|
1141
|
+
#' Maximum mean deviation between identity breaks tolerated (as percent
|
1142
|
+
#' identity). Difference in number of \code{id.breaks} is never tolerated.
|
1143
|
+
#'
|
1144
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1145
|
+
#'
|
1146
|
+
#' @export
|
1147
|
+
|
866
1148
|
enve.recplot2.compareIdentities <- function
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
### one of:
|
875
|
-
### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
|
876
|
-
### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
|
877
|
-
### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
|
878
|
-
### doi:10.1214/aoms/1177729694), or "euclidean".
|
879
|
-
smooth.par=NULL,
|
880
|
-
### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
|
881
|
-
### Use NULL to automatically determine this value using leave-one-out
|
882
|
-
### cross-validation (see `smooth.spline` parameter `spar`).
|
883
|
-
pseudocounts=0,
|
884
|
-
### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
|
885
|
-
### 1 for add-one smoothing.
|
886
|
-
max.deviation=0.75
|
887
|
-
### Maximum mean deviation between identity breaks tolerated (as percent
|
888
|
-
### identity). Difference in number of id.breaks is never tolerated.
|
889
|
-
){
|
1149
|
+
(x,
|
1150
|
+
y,
|
1151
|
+
method="hellinger",
|
1152
|
+
smooth.par=NULL,
|
1153
|
+
pseudocounts=0,
|
1154
|
+
max.deviation=0.75
|
1155
|
+
){
|
890
1156
|
METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
|
891
1157
|
i.meth <- pmatch(method, METHODS)
|
892
1158
|
if (is.na(i.meth)) stop("Invalid distance ", method)
|
@@ -899,7 +1165,7 @@ enve.recplot2.compareIdentities <- function
|
|
899
1165
|
dev <- mean(abs(x$id.breaks - y$id.breaks))
|
900
1166
|
if(dev > max.deviation)
|
901
1167
|
stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
|
902
|
-
|
1168
|
+
dev)
|
903
1169
|
x.cnt <- x$id.counts
|
904
1170
|
y.cnt <- y$id.counts
|
905
1171
|
if(is.null(smooth.par) || smooth.par > 0){
|
@@ -928,63 +1194,92 @@ enve.recplot2.compareIdentities <- function
|
|
928
1194
|
return(d)
|
929
1195
|
}
|
930
1196
|
|
1197
|
+
#' Enveomics: Recruitment Plot (2) Coordinates
|
1198
|
+
#'
|
1199
|
+
#' Returns the sequence name and coordinates of the requested position bins.
|
1200
|
+
#'
|
1201
|
+
#' @param x
|
1202
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1203
|
+
#' @param bins
|
1204
|
+
#' Vector of selected bins to return. It can be a vector of logical values
|
1205
|
+
#' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
|
1206
|
+
#' missing, returns the coordinates of all windows.
|
1207
|
+
#'
|
1208
|
+
#' @return
|
1209
|
+
#' Returns a data.frame with five columns: \code{name.from} (character),
|
1210
|
+
#' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
|
1211
|
+
#' (numeric), and \code{seq.name} (character).
|
1212
|
+
#' The first two correspond to sequence and position of the start point of the
|
1213
|
+
#' bin. The next two correspond to the sequence and position of the end point of
|
1214
|
+
#' the bin. The last one indicates the name of the sequence (if defined).
|
1215
|
+
#'
|
1216
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1217
|
+
#'
|
1218
|
+
#' @export
|
1219
|
+
|
931
1220
|
enve.recplot2.coordinates <- function
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
bins
|
936
|
-
### Vector of selected bins to return. It can be a vector of logical values
|
937
|
-
### with the same length as `x$pos.breaks`-1 or a vector of integers. If
|
938
|
-
### missing, returns the coordinates of all windows.
|
939
|
-
){
|
1221
|
+
(x,
|
1222
|
+
bins
|
1223
|
+
){
|
940
1224
|
if(!inherits(x, "enve.RecPlot2"))
|
941
1225
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
942
1226
|
if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
|
943
1227
|
if(!is.vector(bins)) stop("'bins' must be a vector")
|
944
1228
|
if(inherits(bins, "logical")) bins <- which(bins)
|
945
1229
|
|
946
|
-
y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
|
947
|
-
|
1230
|
+
y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
|
1231
|
+
|
948
1232
|
for(i in 1:length(bins)){
|
949
1233
|
j <- bins[i]
|
950
1234
|
# Concatenated coordinates
|
951
1235
|
cc <- x$pos.breaks[c(j, j+1)]
|
952
1236
|
# Find the corresponding `seq.breaks`
|
953
1237
|
sb.from <- which(
|
954
|
-
|
955
|
-
|
1238
|
+
cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
|
1239
|
+
cc[1] < x$seq.breaks[-1])
|
956
1240
|
sb.to <- which(
|
957
|
-
|
958
|
-
|
1241
|
+
cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
|
1242
|
+
cc[2] <= x$seq.breaks[-1])
|
959
1243
|
# Translate coordinates
|
960
1244
|
if(length(sb.from)==1 & length(sb.to)==1){
|
961
1245
|
y[i, 'name.from'] <- x$seq.names[sb.from]
|
962
|
-
y[i, 'pos.from']
|
1246
|
+
y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
|
963
1247
|
y[i, 'name.to'] <- x$seq.names[sb.to]
|
964
|
-
y[i, 'pos.to']
|
1248
|
+
y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
|
1249
|
+
y[i, 'seq.name'] <- x$pos.names[i]
|
965
1250
|
}
|
966
1251
|
}
|
967
1252
|
|
968
1253
|
return(y)
|
969
|
-
### Returns a data.frame with four columns: name.from (character), pos.from
|
970
|
-
### (numeric) name.to (character), and pos.to (numeric). The first two
|
971
|
-
### correspond to sequence and position of the start point of the bin, the
|
972
|
-
### last two correspond to the sequence and position of the end point of the
|
973
|
-
### bin.
|
974
1254
|
}
|
975
1255
|
|
1256
|
+
#' Enveomics: Recruitment Plot (2) Sequencing Depth
|
1257
|
+
#'
|
1258
|
+
#' Calculate the sequencing depth of the given window(s).
|
1259
|
+
#'
|
1260
|
+
#' @param x
|
1261
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1262
|
+
#' @param sel
|
1263
|
+
#' Window(s) for which the sequencing depth is to be calculated. If not
|
1264
|
+
#' passed, it returns the sequencing depth of all windows.
|
1265
|
+
#' @param low.identity
|
1266
|
+
#' A logical indicating if the sequencing depth is to be estimated only
|
1267
|
+
#' with low-identity matches. By default, only high-identity matches are
|
1268
|
+
#' used.
|
1269
|
+
#'
|
1270
|
+
#' @return
|
1271
|
+
#' Returns a numeric vector of sequencing depths (in bp/bp).
|
1272
|
+
#'
|
1273
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1274
|
+
#'
|
1275
|
+
#' @export
|
1276
|
+
|
976
1277
|
enve.recplot2.seqdepth <- function
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
### passed, it returns the sequencing depth of all windows
|
983
|
-
low.identity=FALSE
|
984
|
-
### A logical indicating if the sequencing depth is to be estimated only
|
985
|
-
### with low-identity matches. By default, only high-identity matches are
|
986
|
-
### used.
|
987
|
-
){
|
1278
|
+
|
1279
|
+
(x,
|
1280
|
+
sel,
|
1281
|
+
low.identity=FALSE
|
1282
|
+
){
|
988
1283
|
if(!inherits(x, "enve.RecPlot2"))
|
989
1284
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
990
1285
|
if(low.identity){
|
@@ -997,21 +1292,30 @@ enve.recplot2.seqdepth <- function
|
|
997
1292
|
seqdepth.in <- pos.cnts.in/pos.binsize
|
998
1293
|
if(missing(sel)) return(seqdepth.in)
|
999
1294
|
return(seqdepth.in[sel])
|
1000
|
-
### Returns a numeric vector of sequencing depths (in bp/bp).
|
1001
1295
|
}
|
1002
1296
|
|
1297
|
+
#' Enveomics: Recruitment Plot (2) ANI Estimate
|
1298
|
+
#'
|
1299
|
+
#' Estimate the Average Nucleotide Identity from reads (ANIr) from a
|
1300
|
+
#' recruitment plot.
|
1301
|
+
#'
|
1302
|
+
#' @param x
|
1303
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1304
|
+
#' @param range
|
1305
|
+
#' Range of identities to be considered. By default, the full range
|
1306
|
+
#' is used (note that the upper boundary is \code{Inf} and not 100 because
|
1307
|
+
#' recruitment plots can also be built with bit-scores). To use only
|
1308
|
+
#' intra-population matches (with identities), use c(95,100). To use only
|
1309
|
+
#' inter-population values, use c(0,95).
|
1310
|
+
#'
|
1311
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1312
|
+
#'
|
1313
|
+
#' @export
|
1314
|
+
|
1003
1315
|
enve.recplot2.ANIr <- function
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
### `enve.RecPlot2` object.
|
1008
|
-
range=c(0,Inf)
|
1009
|
-
### Range of identities to be considered. By default, the full range
|
1010
|
-
### is used (note that the upper boundary is `Inf` and not 100 because
|
1011
|
-
### recruitment plots can also be built with bit-scores). To use only
|
1012
|
-
### intra-population matches (with identities), use c(95,100). To use only
|
1013
|
-
### inter-population values, use c(0,95).
|
1014
|
-
){
|
1316
|
+
(x,
|
1317
|
+
range=c(0,Inf)
|
1318
|
+
){
|
1015
1319
|
if(!inherits(x, "enve.RecPlot2"))
|
1016
1320
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
1017
1321
|
id.b <- x$id.breaks
|
@@ -1023,35 +1327,82 @@ enve.recplot2.ANIr <- function
|
|
1023
1327
|
}
|
1024
1328
|
|
1025
1329
|
#==============> Define internal functions
|
1330
|
+
|
1331
|
+
#' Enveomics: Recruitment Plot (2) Internal Ancillary Function
|
1332
|
+
#'
|
1333
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2}}).
|
1334
|
+
#'
|
1335
|
+
#' @param x \code{\link{enve.RecPlot2}} object
|
1336
|
+
#' @param pos.breaks Position breaks
|
1337
|
+
#' @param id.breaks Identity breaks
|
1338
|
+
#' @param rec.idcol Identity column to use
|
1339
|
+
#'
|
1340
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1341
|
+
#' @author Kenji Gerhardt [aut]
|
1342
|
+
#'
|
1343
|
+
#' @export
|
1344
|
+
|
1026
1345
|
enve.recplot2.__counts <- function
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1346
|
+
(x, pos.breaks, id.breaks, rec.idcol) {
|
1347
|
+
rec2 <- x$rec
|
1348
|
+
verbose <- x$verbose
|
1349
|
+
|
1350
|
+
# get counts of how many occurrences of each genome pos.bin there are per read
|
1351
|
+
x.bins <- mapply(
|
1352
|
+
function(start, end) {
|
1353
|
+
list(rle(findInterval(start:end, pos.breaks, left.open = T)))
|
1354
|
+
}, rec2[, 1], rec2[, 2])
|
1355
|
+
|
1356
|
+
# find the single y bin for each row, replicates it at the correct places to
|
1357
|
+
# the number of distinct bins found in its row
|
1358
|
+
y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
|
1359
|
+
times = unlist(lapply(x.bins, function(a) length(a$lengths))))
|
1360
|
+
|
1361
|
+
# x.bins_counts is the number of occurrences of each bin a row contains,
|
1362
|
+
# per row, then unlisted
|
1363
|
+
x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
|
1364
|
+
|
1365
|
+
# these are the pos. in. genome bins that each count in x.bins_counts falls into
|
1366
|
+
x.bins <- unlist(lapply(x.bins, function(a) a$values))
|
1367
|
+
|
1368
|
+
# much more efficient counts implementation in R using lists instead of a matrix:
|
1369
|
+
counts <- lapply(
|
1370
|
+
1:(length(pos.breaks) - 1),
|
1371
|
+
function(col_len) rep(0, length(id.breaks) - 1))
|
1372
|
+
|
1373
|
+
# accesses the correct list in counts by x.bin, then
|
1374
|
+
# accesses the position in that row by y.bins and adds the new count
|
1375
|
+
for(i in 1:length(x.bins)){
|
1376
|
+
counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
|
1377
|
+
}
|
1378
|
+
|
1379
|
+
counts <- do.call(rbind, counts)
|
1380
|
+
return(counts)
|
1044
1381
|
}
|
1045
1382
|
|
1383
|
+
#' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
|
1384
|
+
#'
|
1385
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
|
1386
|
+
#'
|
1387
|
+
#' @param x \code{\link{enve.RecPlot2}} object
|
1388
|
+
#' @param comp Components
|
1389
|
+
#' @param do_crit Function estimating the criterion
|
1390
|
+
#' @param best Best solution thus far
|
1391
|
+
#' @param verbose If verbose
|
1392
|
+
#' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
|
1393
|
+
#'
|
1394
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1395
|
+
#'
|
1396
|
+
#' @export
|
1397
|
+
|
1046
1398
|
enve.recplot2.findPeaks.__emauto_one <- function
|
1047
|
-
|
1048
|
-
(x, comp, do_crit, best, verbose, ...){
|
1399
|
+
(x, comp, do_crit, best, verbose, ...){
|
1049
1400
|
peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
|
1050
1401
|
if(length(peaks)==0) return(best)
|
1051
1402
|
k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
|
1052
1403
|
crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
|
1053
1404
|
if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
|
1054
|
-
|
1405
|
+
ifelse(crit > best[['crit']], '*', ''), '\n')
|
1055
1406
|
if(crit > best[['crit']]){
|
1056
1407
|
best[['crit']] <- crit
|
1057
1408
|
best[['peaks']] <- peaks
|
@@ -1059,148 +1410,217 @@ enve.recplot2.findPeaks.__emauto_one <- function
|
|
1059
1410
|
best[['pstore']][[comp]] <- peaks
|
1060
1411
|
return(best)
|
1061
1412
|
}
|
1413
|
+
|
1414
|
+
#' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
|
1415
|
+
#'
|
1416
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
|
1417
|
+
#'
|
1418
|
+
#' @param x Vector of log-transformed sequencing depths
|
1419
|
+
#' @param theta Parameters list
|
1420
|
+
#'
|
1421
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1422
|
+
#'
|
1423
|
+
#' @export
|
1424
|
+
|
1062
1425
|
enve.recplot2.findPeaks.__em_e <- function
|
1063
|
-
|
1064
|
-
(x, theta){
|
1426
|
+
(x, theta){
|
1065
1427
|
components <- length(theta[['mu']])
|
1066
1428
|
product <- do.call(cbind,
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1429
|
+
lapply(1:components,
|
1430
|
+
function(i) dnorm(x, theta[['mu']][i],
|
1431
|
+
theta[['sd']][i])*theta[['alpha']][i]))
|
1070
1432
|
sum.of.components <- rowSums(product)
|
1071
1433
|
posterior <- product / sum.of.components
|
1072
|
-
|
1434
|
+
|
1073
1435
|
return(list(ll=sum(log(sum.of.components)), posterior=posterior))
|
1074
1436
|
}
|
1075
1437
|
|
1438
|
+
#' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
|
1439
|
+
#'
|
1440
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
|
1441
|
+
#'
|
1442
|
+
#' @param x Vector of log-transformed sequencing depths
|
1443
|
+
#' @param posterior Posterior probability
|
1444
|
+
#'
|
1445
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1446
|
+
#'
|
1447
|
+
#' @export
|
1448
|
+
|
1076
1449
|
enve.recplot2.findPeaks.__em_m <- function
|
1077
|
-
|
1078
|
-
(x, posterior){
|
1450
|
+
(x, posterior){
|
1079
1451
|
components <- ncol(posterior)
|
1080
1452
|
n <- colSums(posterior)
|
1081
1453
|
mu <- colSums(posterior * x) / n
|
1082
1454
|
sd <- sqrt( colSums(
|
1083
|
-
|
1455
|
+
posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
|
1084
1456
|
alpha <- n/length(x)
|
1085
1457
|
return(list(mu=mu, sd=sd, alpha=alpha))
|
1086
1458
|
}
|
1087
1459
|
|
1460
|
+
#' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
|
1461
|
+
#'
|
1462
|
+
#' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
|
1463
|
+
#'
|
1464
|
+
#' @param x \code{\link{enve.RecPlot2.Peak}} object
|
1465
|
+
#' @param mids Midpoints
|
1466
|
+
#' @param counts Counts
|
1467
|
+
#'
|
1468
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1469
|
+
#'
|
1470
|
+
#' @export
|
1471
|
+
|
1088
1472
|
enve.recplot2.__peakHist <- function
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
return(prob*x$n.hat/sum(prob))
|
1473
|
+
(x, mids, counts=TRUE){
|
1474
|
+
d.o <- x$param.hat
|
1475
|
+
if(length(x$log)==0) x$log <- FALSE
|
1476
|
+
if(x$log){
|
1477
|
+
d.o$x <- log(mids)
|
1478
|
+
}else{
|
1479
|
+
d.o$x <- mids
|
1480
|
+
}
|
1481
|
+
prob <- do.call(paste('d', x$dist, sep=''), d.o)
|
1482
|
+
if(!counts) return(prob)
|
1483
|
+
if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
|
1484
|
+
return(prob*x$n.hat/sum(prob))
|
1102
1485
|
}
|
1103
1486
|
|
1487
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
|
1488
|
+
#'
|
1489
|
+
#' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
|
1490
|
+
#'
|
1491
|
+
#' @param lsd1 Vector of log-transformed sequencing depths
|
1492
|
+
#' @param min.points Minimum number of points
|
1493
|
+
#' @param quant.est Quantile estimate
|
1494
|
+
#' @param mlv.opts List of options for \code{mlv}
|
1495
|
+
#' @param fitdist.opts List of options for \code{fitdist}
|
1496
|
+
#' @param with.skewness If skewed-normal should be used
|
1497
|
+
#' @param optim.rounds Maximum number of optimization rounds
|
1498
|
+
#' @param optim.epsilon Minimum difference considered negligible
|
1499
|
+
#' @param n.total Global number of windows
|
1500
|
+
#' @param merge.logdist Attempted \code{merge.logdist} parameter
|
1501
|
+
#' @param verbose If verbose
|
1502
|
+
#' @param log If log-transformed depths
|
1503
|
+
#'
|
1504
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1505
|
+
#'
|
1506
|
+
#' @export
|
1507
|
+
|
1104
1508
|
enve.recplot2.findPeaks.__mow_one <- function
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
if(
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
if(epsilon2 < optim.epsilon) break;
|
1142
|
-
}
|
1509
|
+
(lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
|
1510
|
+
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
|
1511
|
+
){
|
1512
|
+
dist <- ifelse(with.skewness, 'sn', 'norm');
|
1513
|
+
|
1514
|
+
# Find peak
|
1515
|
+
o <- mlv.opts; o$x = lsd1;
|
1516
|
+
mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
|
1517
|
+
if(verbose) cat('Anchoring at mode =',mode1,'\n')
|
1518
|
+
param.hat <- fitdist.opts$start; last.hat <- param.hat;
|
1519
|
+
lim <- NA;
|
1520
|
+
if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
|
1521
|
+
|
1522
|
+
# Refine peak parameters
|
1523
|
+
for(round in 1:optim.rounds){
|
1524
|
+
param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
|
1525
|
+
lim.o <- param.hat
|
1526
|
+
lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
|
1527
|
+
lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
|
1528
|
+
if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
|
1529
|
+
'and params =',as.numeric(param.hat),' \r')
|
1530
|
+
if(length(lsd1.pop) < min.points) break;
|
1531
|
+
o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
|
1532
|
+
last.last.hat <- last.hat
|
1533
|
+
last.hat <- param.hat
|
1534
|
+
param.hat <- as.list(do.call(fitdist, o)$estimate);
|
1535
|
+
if(any(is.na(param.hat))){
|
1536
|
+
if(round>1) param.hat <- last.hat;
|
1537
|
+
break;
|
1538
|
+
}
|
1539
|
+
if(round > 1){
|
1540
|
+
epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
|
1541
|
+
if(epsilon1 < optim.epsilon) break;
|
1542
|
+
if(round > 2){
|
1543
|
+
epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
|
1544
|
+
if(epsilon2 < optim.epsilon) break;
|
1143
1545
|
}
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1546
|
+
}
|
1547
|
+
}
|
1548
|
+
if(verbose) cat('\n')
|
1549
|
+
if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
|
1550
|
+
|
1551
|
+
# Mow distribution
|
1552
|
+
lsd2 <- c();
|
1553
|
+
lsd.pop <- c();
|
1554
|
+
n.hat <- length(lsd1.pop)/diff(quant.est)
|
1555
|
+
peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
|
1556
|
+
param.hat=param.hat, n.hat=n.hat, n.total=n.total,
|
1557
|
+
merge.logdist=merge.logdist, log=log)
|
1558
|
+
peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
|
1559
|
+
peak.cnt <- enve.recplot2.__peakHist(peak,
|
1560
|
+
(peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
|
1561
|
+
for(i in 2:length(peak.breaks)){
|
1562
|
+
values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
|
1563
|
+
n.exp <- peak.cnt[i-1]
|
1564
|
+
if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
|
1565
|
+
if(length(values)==0) next
|
1566
|
+
in.peak <- runif(length(values)) <= n.exp/length(values)
|
1567
|
+
lsd2 <- c(lsd2, values[!in.peak])
|
1568
|
+
lsd.pop <- c(lsd.pop, values[in.peak])
|
1569
|
+
}
|
1570
|
+
if(length(lsd.pop) < min.points) return(NULL)
|
1571
|
+
|
1572
|
+
# Return peak
|
1573
|
+
attr(peak, 'values') <- lsd.pop
|
1574
|
+
attr(peak, 'values.res') <- lsd2
|
1575
|
+
attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
|
1576
|
+
plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
|
1577
|
+
plot=FALSE)$counts)+1)/2
|
1578
|
+
mu <- tail(param.hat, n=1)
|
1579
|
+
attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
|
1580
|
+
if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
|
1581
|
+
'with expected n =',n.hat,'\n')
|
1582
|
+
return(peak)
|
1180
1583
|
}
|
1181
1584
|
|
1585
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
|
1586
|
+
#'
|
1587
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
|
1588
|
+
#'
|
1589
|
+
#' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
|
1590
|
+
#'
|
1591
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1592
|
+
#'
|
1593
|
+
#' @export
|
1594
|
+
|
1182
1595
|
enve.recplot2.findPeaks.__mower <- function
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
return(peaks)
|
1596
|
+
(peaks.opts){
|
1597
|
+
peaks <- list()
|
1598
|
+
while(length(peaks.opts$lsd1) > peaks.opts$min.points){
|
1599
|
+
peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
|
1600
|
+
if(is.null(peak)) break
|
1601
|
+
peaks[[ length(peaks)+1 ]] <- peak
|
1602
|
+
peaks.opts$lsd1 <- peak$values.res
|
1603
|
+
}
|
1604
|
+
return(peaks)
|
1193
1605
|
}
|
1194
1606
|
|
1607
|
+
#' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
|
1608
|
+
#'
|
1609
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
|
1610
|
+
#'
|
1611
|
+
#' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
|
1612
|
+
#' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
|
1613
|
+
#'
|
1614
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1615
|
+
#'
|
1616
|
+
#' @export
|
1195
1617
|
|
1196
1618
|
enve.recplot2.__whichClosestPeak <- function
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
return(which.min(dist))
|
1619
|
+
(peak, peaks){
|
1620
|
+
dist <- as.numeric(lapply(peaks,
|
1621
|
+
function(x)
|
1622
|
+
abs(log(x$param.hat[[ length(x$param.hat) ]] /
|
1623
|
+
peak$param.hat[[ length(peak$param.hat) ]] ))))
|
1624
|
+
dist[ dist==0 ] <- Inf
|
1625
|
+
return(which.min(dist))
|
1205
1626
|
}
|
1206
|
-
|