miga-base 0.4.3.0 → 0.5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -1,497 +1,647 @@
|
|
1
1
|
#==============> Define S4 classes
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
2
|
+
|
3
|
+
#' Enveomics: Recruitment Plot (2) - S4 Class
|
4
|
+
#'
|
5
|
+
#' Enve-omics representation of Recruitment plots. This object can
|
6
|
+
#' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
|
7
|
+
#'
|
8
|
+
#' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
|
9
|
+
#' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
|
10
|
+
#' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
|
11
|
+
#' @slot id.counts \code{(numeric)} Counts per ID bin.
|
12
|
+
#' @slot id.breaks \code{(numeric)} Breaks of identity bins.
|
13
|
+
#' @slot pos.breaks \code{(numeric)} Breaks of position bins.
|
14
|
+
#' @slot pos.names \code{(character)} Names of the position bins.
|
15
|
+
#' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
|
16
|
+
#' @slot peaks \code{(list)} Peaks identified in the recplot.
|
17
|
+
#' Limits of the subject sequences after concatenation.
|
18
|
+
#' @slot seq.names \code{(character}) Names of the subject sequences.
|
19
|
+
#' @slot id.metric \code{(character}) Metric used as 'identity'.
|
20
|
+
#' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
|
21
|
+
#' @slot call \code{(call)} Call producing this object.
|
22
|
+
#'
|
23
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
24
|
+
#'
|
25
|
+
#' @exportClass
|
26
|
+
|
27
|
+
enve.RecPlot2 <- setClass("enve.RecPlot2",
|
28
|
+
representation(
|
29
|
+
# slots = list(
|
30
|
+
counts='matrix',
|
31
|
+
pos.counts.in='numeric',
|
32
|
+
pos.counts.out='numeric',
|
33
|
+
id.counts='numeric',
|
34
|
+
id.breaks='numeric',
|
35
|
+
pos.breaks='numeric',
|
36
|
+
pos.names='character',
|
37
|
+
seq.breaks='numeric',
|
38
|
+
peaks='list',
|
39
|
+
seq.names='character',
|
40
|
+
id.metric='character',
|
41
|
+
id.ingroup='logical',
|
42
|
+
call='call')
|
43
|
+
,package='enveomics.R'
|
44
|
+
);
|
45
|
+
|
46
|
+
#' Enveomics: Recruitment Plot (2) Peak - S4 Class
|
47
|
+
#'
|
48
|
+
#' Enve-omics representation of a peak in the sequencing depth histogram
|
49
|
+
#' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
|
50
|
+
#'
|
51
|
+
#' @slot dist \code{(character)}
|
52
|
+
#' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
|
53
|
+
#' (skew-normal).
|
54
|
+
#' @slot values \code{(numeric)}
|
55
|
+
#' Sequencing depth values predicted to conform the peak.
|
56
|
+
#' @slot values.res \code{(numeric)}
|
57
|
+
#' Sequencing depth values not explained by this or previously identified
|
58
|
+
#' peaks.
|
59
|
+
#' @slot mode \code{(numeric)}
|
60
|
+
#' Seed-value of mode anchoring the peak.
|
61
|
+
#' @slot param.hat \code{(list)}
|
62
|
+
#' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
|
63
|
+
#' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
|
64
|
+
#' xi=location). Note that the "dispersion" parameter is always first and
|
65
|
+
#' the "location" parameter is always last.
|
66
|
+
#' @slot n.hat \code{(numeric)}
|
67
|
+
#' Number of bins estimated to be explained by this peak. This should
|
68
|
+
#' ideally be equal to the length of \code{values}, but it's not an integer.
|
69
|
+
#' @slot n.total \code{(numeric)}
|
70
|
+
#' Total number of bins from which the peak was extracted. I.e., total
|
71
|
+
#' number of position bins with non-zero sequencing depth in the recruitment
|
72
|
+
#' plot (regardless of peak count).
|
73
|
+
#' @slot err.res \code{(numeric)}
|
74
|
+
#' Error left after adding the peak (mower) or log-likelihood (em or emauto).
|
75
|
+
#' @slot merge.logdist \code{(numeric)}
|
76
|
+
#' Attempted \code{merge.logdist} parameter.
|
77
|
+
#' @slot seq.depth \code{(numeric)}
|
78
|
+
#' Best estimate available for the sequencing depth of the peak (centrality).
|
79
|
+
#' @slot log \code{(logical)}
|
80
|
+
#' Indicates if the estimation was performed in natural logarithm space.
|
81
|
+
#'
|
82
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
83
|
+
#'
|
84
|
+
#' @exportClass
|
85
|
+
|
86
|
+
enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
|
87
|
+
representation(
|
88
|
+
# slots = list(
|
89
|
+
dist='character',
|
90
|
+
values='numeric',
|
91
|
+
values.res='numeric',
|
92
|
+
mode='numeric',
|
93
|
+
param.hat='list',
|
94
|
+
n.hat='numeric',
|
95
|
+
n.total='numeric',
|
96
|
+
err.res='numeric',
|
97
|
+
merge.logdist='numeric',
|
98
|
+
seq.depth='numeric',
|
99
|
+
log='logical'
|
100
|
+
));
|
101
|
+
|
102
|
+
#' Attribute accessor
|
103
|
+
#'
|
104
|
+
#'
|
105
|
+
#' @param x Object
|
106
|
+
#' @param name Attribute name
|
57
107
|
setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
|
108
|
+
|
109
|
+
#' Attribute accessor
|
110
|
+
#'
|
111
|
+
#'
|
112
|
+
#' @param x Object
|
113
|
+
#' @param name Attribute name
|
58
114
|
setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
|
59
115
|
|
60
116
|
#==============> Define S4 methods
|
117
|
+
|
118
|
+
#' Enveomics: Recruitment Plot (2)
|
119
|
+
#'
|
120
|
+
#' Plots an \code{\link{enve.RecPlot2}} object.
|
121
|
+
#'
|
122
|
+
#' @param x
|
123
|
+
#' \code{\link{enve.RecPlot2}} object to plot.
|
124
|
+
#' @param layout
|
125
|
+
#' Matrix indicating the position of the different panels in the layout,
|
126
|
+
#' where:
|
127
|
+
#' \itemize{
|
128
|
+
#' \item 0: Empty space
|
129
|
+
#' \item 1: Counts matrix
|
130
|
+
#' \item 2: position histogram (sequencing depth)
|
131
|
+
#' \item 3: identity histogram
|
132
|
+
#' \item 4: Populations histogram (histogram of sequencing depths)
|
133
|
+
#' \item 5: Color scale for the counts matrix (vertical)
|
134
|
+
#' \item 6: Color scale of the counts matrix (horizontal)
|
135
|
+
#' }
|
136
|
+
#' Only panels indicated here will be plotted. To plot only one panel
|
137
|
+
#' simply set this to the number of the panel you want to plot.
|
138
|
+
#' @param panel.fun
|
139
|
+
#' List of functions to be executed after drawing each panel. Use the
|
140
|
+
#' indices in \code{layout} (as characters) as keys. Functions for indices
|
141
|
+
#' missing in \code{layout} are ignored. For example, to add a vertical line
|
142
|
+
#' at the 3Mbp mark in both the position histogram and the counts matrix:
|
143
|
+
#' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
|
144
|
+
#' Note that the X-axis in both panels is in Mbp by default. To change
|
145
|
+
#' this behavior, set \code{pos.units} accordingly.
|
146
|
+
#' @param widths
|
147
|
+
#' Relative widths of the columns of \code{layout}.
|
148
|
+
#' @param heights
|
149
|
+
#' Relative heights of the rows of \code{layout}.
|
150
|
+
#' @param palette
|
151
|
+
#' Colors to be used to represent the counts matrix, sorted from no hits
|
152
|
+
#' to the maximum sequencing depth.
|
153
|
+
#' @param underlay.group
|
154
|
+
#' If TRUE, it indicates the in-group and out-group areas couloured based
|
155
|
+
#' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
|
156
|
+
#' @param peaks.col
|
157
|
+
#' If not \code{NA}, it attempts to represent peaks in the population histogram
|
158
|
+
#' in the specified color. Set to \code{NA} to avoid peak-finding.
|
159
|
+
#' @param use.peaks
|
160
|
+
#' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
|
161
|
+
#' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
|
162
|
+
#' @param id.lim
|
163
|
+
#' Limits of identities to represent.
|
164
|
+
#' @param pos.lim
|
165
|
+
#' Limits of positions to represent (in bp, regardless of \code{pos.units}).
|
166
|
+
#' @param pos.units
|
167
|
+
#' Units in which the positions should be represented (powers of 1,000
|
168
|
+
#' base pairs).
|
169
|
+
#' @param mar
|
170
|
+
#' Margins of the panels as a list, with the character representation of
|
171
|
+
#' the number of the panel as index (see \code{layout}).
|
172
|
+
#' @param pos.splines
|
173
|
+
#' Smoothing parameter for the splines in the position histogram. Zero
|
174
|
+
#' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
|
175
|
+
#' cross-validation.
|
176
|
+
#' @param id.splines
|
177
|
+
#' Smoothing parameter for the splines in the identity histogram. Zero
|
178
|
+
#' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
|
179
|
+
#' cross-validation.
|
180
|
+
#' @param in.lwd
|
181
|
+
#' Line width for the sequencing depth of in-group matches.
|
182
|
+
#' @param out.lwd
|
183
|
+
#' Line width for the sequencing depth of out-group matches.
|
184
|
+
#' @param id.lwd
|
185
|
+
#' Line width for the identity histogram.
|
186
|
+
#' @param in.col
|
187
|
+
#' Color associated to in-group matches.
|
188
|
+
#' @param out.col
|
189
|
+
#' Color associated to out-group matches.
|
190
|
+
#' @param id.col
|
191
|
+
#' Color for the identity histogram.
|
192
|
+
#' @param breaks.col
|
193
|
+
#' Color of the vertical lines indicating sequence breaks.
|
194
|
+
#' @param peaks.opts
|
195
|
+
#' Options passed to \code{\link{enve.recplot2.findPeaks}},
|
196
|
+
#' if \code{peaks.col} is not \code{NA}.
|
197
|
+
#' @param ...
|
198
|
+
#' Any other graphic parameters (currently ignored).
|
199
|
+
#'
|
200
|
+
#' @return
|
201
|
+
#' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
|
202
|
+
#' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
|
203
|
+
#' \code{layout} doesn't include 4, returns \code{NA}.
|
204
|
+
#'
|
205
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
206
|
+
#'
|
207
|
+
#' @method plot enve.RecPlot2
|
208
|
+
#' @export
|
209
|
+
|
61
210
|
plot.enve.RecPlot2 <- function
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
211
|
+
(x,
|
212
|
+
layout=matrix(c(5,5,2,1,4,3), nrow=2),
|
213
|
+
panel.fun=list(),
|
214
|
+
widths=c(1,7,2),
|
215
|
+
heights=c(1,2),
|
216
|
+
palette=grey((100:0)/100),
|
217
|
+
underlay.group=TRUE,
|
218
|
+
peaks.col='darkred',
|
219
|
+
use.peaks,
|
220
|
+
id.lim=range(x$id.breaks),
|
221
|
+
pos.lim=range(x$pos.breaks),
|
222
|
+
pos.units=c('Mbp','Kbp','bp'),
|
223
|
+
mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
|
224
|
+
'3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
|
225
|
+
'4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
|
226
|
+
'5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
|
227
|
+
pos.splines=0,
|
228
|
+
id.splines=1/2,
|
229
|
+
in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
230
|
+
out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
231
|
+
id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
|
232
|
+
in.col='darkblue',
|
233
|
+
out.col='lightblue',
|
234
|
+
id.col='black',
|
235
|
+
breaks.col='#AAAAAA40',
|
236
|
+
peaks.opts=list(),
|
237
|
+
...
|
238
|
+
){
|
239
|
+
pos.units <- match.arg(pos.units);
|
240
|
+
pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
|
241
|
+
pos.lim <- pos.lim/pos.factor;
|
242
|
+
lmat <- layout;
|
243
|
+
for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
|
244
|
+
|
245
|
+
layout(lmat, widths=widths, heights=heights);
|
246
|
+
ori.mar <- par('mar');
|
247
|
+
|
248
|
+
# Essential vars
|
249
|
+
counts <- x$counts
|
250
|
+
|
251
|
+
id.ingroup <- x$id.ingroup
|
252
|
+
id.counts <- x$id.counts
|
253
|
+
id.breaks <- x$id.breaks
|
254
|
+
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
|
255
|
+
id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
|
256
|
+
|
257
|
+
pos.counts.in <- x$pos.counts.in
|
258
|
+
pos.counts.out <- x$pos.counts.out
|
259
|
+
pos.breaks <- x$pos.breaks/pos.factor
|
260
|
+
pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
|
261
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
|
262
|
+
|
263
|
+
seqdepth.in <- pos.counts.in/pos.binsize
|
264
|
+
seqdepth.out <- pos.counts.out/pos.binsize
|
265
|
+
seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
|
266
|
+
seqdepth.out[seqdepth.out>0]))*c(1/2,2)
|
267
|
+
|
268
|
+
if(underlay.group){
|
269
|
+
in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
|
270
|
+
list(maxColorValue=256, alpha=62)));
|
271
|
+
out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
|
272
|
+
list(maxColorValue=256, alpha=52)));
|
273
|
+
}
|
274
|
+
|
275
|
+
# [1] Counts matrix
|
276
|
+
if(any(layout==1)){
|
277
|
+
par(mar=mar[['1']]);
|
278
|
+
plot(1, t='n', bty='l',
|
279
|
+
xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
|
280
|
+
xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
|
281
|
+
if(underlay.group){
|
282
|
+
rect(pos.lim[1], id.lim[1], pos.lim[2],
|
283
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
284
|
+
rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
|
285
|
+
id.lim[2], col=in.bg, border=NA);
|
286
|
+
}
|
287
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col);
|
288
|
+
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
289
|
+
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
290
|
+
length.out=1+length(palette)), add=TRUE);
|
291
|
+
if(exists('1',panel.fun)) panel.fun[['1']]();
|
292
|
+
}
|
293
|
+
|
294
|
+
# [2] Position histogram
|
295
|
+
if(any(layout==2)){
|
296
|
+
par(mar=mar[['2']]);
|
297
|
+
if(any(layout==1)){
|
298
|
+
xlab=''
|
299
|
+
xaxt='n'
|
300
|
+
}else{
|
301
|
+
xlab=paste('Position in genome (',pos.units,')',sep='')
|
302
|
+
xaxt='s'
|
303
|
+
}
|
304
|
+
plot(1,t='n', bty='l', log='y',
|
305
|
+
xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
|
306
|
+
ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
|
307
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col)
|
308
|
+
pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
|
309
|
+
pos.f <- rep(seqdepth.in,each=2)
|
310
|
+
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
311
|
+
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
312
|
+
if(is.null(pos.splines) || pos.splines > 0){
|
313
|
+
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
314
|
+
spar=pos.splines)
|
315
|
+
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
316
|
+
}
|
317
|
+
if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
|
318
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
|
319
|
+
seqdepth.lim[1]*3/2, col=out.col, border=NA);
|
320
|
+
if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
|
321
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
|
322
|
+
seqdepth.lim[1]*3/2, col=in.col, border=NA);
|
323
|
+
if(exists('2',panel.fun)) panel.fun[['2']]();
|
324
|
+
}
|
325
|
+
|
326
|
+
# [3] Identity histogram
|
327
|
+
if(any(layout==3)){
|
328
|
+
par(mar=mar[['3']]);
|
329
|
+
if(any(layout==1)){
|
330
|
+
ylab=''
|
331
|
+
yaxt='n'
|
332
|
+
}else{
|
333
|
+
ylab=x$id.metric
|
334
|
+
yaxt='s'
|
335
|
+
}
|
336
|
+
if(sum(id.counts>0) >= 4){
|
337
|
+
id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
|
338
|
+
plot(1,t='n', bty='l', log='x',
|
339
|
+
xlim=id.counts.range, xlab='bps per bin', xaxs='i',
|
340
|
+
ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
|
183
341
|
if(underlay.group){
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
}
|
189
|
-
abline(v=x$seq.breaks/pos.factor, col=breaks.col);
|
190
|
-
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
191
|
-
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
192
|
-
length.out=1+length(palette)), add=TRUE);
|
193
|
-
if(exists('1',panel.fun)) panel.fun[['1']]();
|
194
|
-
}
|
195
|
-
|
196
|
-
# [2] Position histogram
|
197
|
-
if(any(layout==2)){
|
198
|
-
par(mar=mar[['2']]);
|
199
|
-
if(any(layout==1)){
|
200
|
-
xlab=''
|
201
|
-
xaxt='n'
|
202
|
-
}else{
|
203
|
-
xlab=paste('Position in genome (',pos.units,')',sep='')
|
204
|
-
xaxt='s'
|
205
|
-
}
|
206
|
-
plot(1,t='n', bty='l', log='y',
|
207
|
-
xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
|
208
|
-
ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
|
209
|
-
abline(v=x$seq.breaks/pos.factor, col=breaks.col)
|
210
|
-
pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
|
211
|
-
pos.f <- rep(seqdepth.in,each=2)
|
212
|
-
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
213
|
-
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
214
|
-
if(is.null(pos.splines) || pos.splines > 0){
|
215
|
-
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
216
|
-
spar=pos.splines)
|
217
|
-
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
342
|
+
rect(id.counts.range[1], id.lim[1], id.counts.range[2],
|
343
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
344
|
+
rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
|
345
|
+
id.counts.range[2], id.lim[2], col=in.bg, border=NA);
|
218
346
|
}
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
if(
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
}
|
227
|
-
|
228
|
-
# [3] Identity histogram
|
229
|
-
if(any(layout==3)){
|
230
|
-
par(mar=mar[['3']]);
|
231
|
-
if(any(layout==1)){
|
232
|
-
ylab=''
|
233
|
-
yaxt='n'
|
234
|
-
}else{
|
235
|
-
ylab=x$id.metric
|
236
|
-
yaxt='s'
|
237
|
-
}
|
238
|
-
if(sum(id.counts>0) >= 4){
|
239
|
-
id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
|
240
|
-
plot(1,t='n', bty='l', log='x',
|
241
|
-
xlim=id.counts.range, xlab='bps per bin', xaxs='i',
|
242
|
-
ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
|
243
|
-
if(underlay.group){
|
244
|
-
rect(id.counts.range[1], id.lim[1], id.counts.range[2],
|
245
|
-
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
246
|
-
rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
|
247
|
-
id.counts.range[2], id.lim[2], col=in.bg, border=NA);
|
248
|
-
}
|
249
|
-
id.f <- rep(id.counts,each=2)
|
250
|
-
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
251
|
-
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
252
|
-
if(is.null(id.splines) || id.splines > 0){
|
253
|
-
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
254
|
-
spar=id.splines)
|
255
|
-
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
256
|
-
}
|
257
|
-
}else{
|
258
|
-
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
259
|
-
text(1,1,labels='Insufficient data', srt=90)
|
347
|
+
id.f <- rep(id.counts,each=2)
|
348
|
+
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
349
|
+
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
350
|
+
if(is.null(id.splines) || id.splines > 0){
|
351
|
+
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
352
|
+
spar=id.splines)
|
353
|
+
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
260
354
|
}
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
355
|
+
}else{
|
356
|
+
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
357
|
+
text(1,1,labels='Insufficient data', srt=90)
|
358
|
+
}
|
359
|
+
if(exists('3',panel.fun)) panel.fun[['3']]();
|
360
|
+
}
|
361
|
+
|
362
|
+
# [4] Populations histogram
|
363
|
+
peaks <- NA;
|
364
|
+
if(any(layout==4)){
|
365
|
+
par(mar=mar[['4']]);
|
366
|
+
if(any(layout==2)){
|
367
|
+
ylab=''
|
368
|
+
yaxt='n'
|
369
|
+
}else{
|
370
|
+
ylab='Sequencing depth (X)'
|
371
|
+
yaxt='s'
|
372
|
+
}
|
373
|
+
h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
|
374
|
+
length.out=200);
|
375
|
+
h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
|
376
|
+
h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
|
377
|
+
plot(1, t='n', log='y',
|
378
|
+
xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
|
379
|
+
xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
|
380
|
+
yaxt=yaxt)
|
381
|
+
y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
382
|
+
y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
383
|
+
lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
|
384
|
+
y.tmp.out, col=out.col)
|
385
|
+
polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
|
386
|
+
y.tmp.in, border=NA, col=in.col)
|
387
|
+
if(!is.na(peaks.col)){
|
388
|
+
o <- peaks.opts; o$x = x;
|
389
|
+
if(missing(use.peaks)){
|
390
|
+
peaks <- do.call(enve.recplot2.findPeaks, o)
|
271
391
|
}else{
|
272
|
-
|
273
|
-
yaxt='s'
|
392
|
+
peaks <- use.peaks
|
274
393
|
}
|
275
|
-
h.
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
for(i in 1:length(peaks)){
|
300
|
-
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
301
|
-
lines(cnt, h.mids, col='red');
|
302
|
-
pf <- pf+cnt;
|
303
|
-
axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
|
304
|
-
}
|
305
|
-
lines(pf, h.mids, col='red',lwd=1.5);
|
306
|
-
dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
|
307
|
-
frx <- signif(100*as.numeric(
|
308
|
-
lapply(peaks,
|
309
|
-
function(x) ifelse(length(x$values)==0, x$n.hat,
|
310
|
-
length(x$values))/x$n.total)), 2)
|
311
|
-
if(peaks[[1]]$err.res < 0){
|
312
|
-
err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
|
313
|
-
}else{
|
314
|
-
err <- paste(', err:',
|
315
|
-
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
|
316
|
-
}
|
317
|
-
legend('topright', bty='n', cex=1/2,
|
318
|
-
legend=paste(letters[1:length(peaks)],'. ',
|
319
|
-
dpt,'X (', frx, '%', err, ')', sep=''))
|
320
|
-
}
|
394
|
+
h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
|
395
|
+
if(!is.null(peaks) & length(peaks)>0){
|
396
|
+
pf <- h.mids*0;
|
397
|
+
for(i in 1:length(peaks)){
|
398
|
+
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
399
|
+
lines(cnt, h.mids, col='red');
|
400
|
+
pf <- pf+cnt;
|
401
|
+
axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
|
402
|
+
}
|
403
|
+
lines(pf, h.mids, col='red',lwd=1.5);
|
404
|
+
dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
|
405
|
+
frx <- signif(100*as.numeric(
|
406
|
+
lapply(peaks,
|
407
|
+
function(x) ifelse(length(x$values)==0, x$n.hat,
|
408
|
+
length(x$values))/x$n.total)), 2)
|
409
|
+
if(peaks[[1]]$err.res < 0){
|
410
|
+
err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
|
411
|
+
}else{
|
412
|
+
err <- paste(', err:',
|
413
|
+
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
|
414
|
+
}
|
415
|
+
legend('topright', bty='n', cex=1/2,
|
416
|
+
legend=paste(letters[1:length(peaks)],'. ',
|
417
|
+
dpt,'X (', frx, '%', err, ')', sep=''))
|
321
418
|
}
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
### `enve.recplot2.findPeaks`). If `peaks.col`=NA or `layout` doesn't include
|
351
|
-
### 4, returns NA.
|
419
|
+
}
|
420
|
+
if(exists('4',panel.fun)) panel.fun[['4']]();
|
421
|
+
}
|
422
|
+
|
423
|
+
# [5] Color scale of the counts matrix (vertical)
|
424
|
+
count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
|
425
|
+
length.out=1+length(palette))
|
426
|
+
if(any(layout==5)){
|
427
|
+
par(mar=mar[['5']]);
|
428
|
+
plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
|
429
|
+
ylim=range(count.bins), yaxs='i', ylab='')
|
430
|
+
rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
|
431
|
+
border=NA)
|
432
|
+
if(exists('5',panel.fun)) panel.fun[['5']]();
|
433
|
+
}
|
434
|
+
|
435
|
+
# [6] Color scale of the coutnts matrix (horizontal)
|
436
|
+
if(any(layout==6)){
|
437
|
+
par(mar=mar[['6']]);
|
438
|
+
plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
|
439
|
+
xlim=range(count.bins), xaxs='i',xlab='');
|
440
|
+
rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
|
441
|
+
border=NA);
|
442
|
+
if(exists('6',panel.fun)) panel.fun[['6']]();
|
443
|
+
}
|
444
|
+
|
445
|
+
par(mar=ori.mar);
|
446
|
+
return(peaks);
|
352
447
|
}
|
353
448
|
|
354
449
|
#==============> Define core functions
|
450
|
+
|
451
|
+
#' Enveomics: Recruitment Plot (2)
|
452
|
+
#'
|
453
|
+
#' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
|
454
|
+
#' been previously executed.
|
455
|
+
#'
|
456
|
+
#' @param prefix
|
457
|
+
#' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
|
458
|
+
#' least the files .rec and .lim must exist with this prefix.
|
459
|
+
#' @param plot
|
460
|
+
#' Should the object be plotted?
|
461
|
+
#' @param pos.breaks
|
462
|
+
#' Breaks in the positions histogram. It can also be a vector of break
|
463
|
+
#' points, and values outside the range are ignored. If zero (0), it
|
464
|
+
#' uses the sequence breaks as defined in the .lim file, which means
|
465
|
+
#' one bin per contig (or gene, if the mapping is agains genes). Ignored
|
466
|
+
#' if `pos.breaks.tsv` is passed.
|
467
|
+
#' @param pos.breaks.tsv
|
468
|
+
#' Path to a list of (absolute) coordinates to use as position breaks.
|
469
|
+
#' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
|
470
|
+
#' must contain at least one column: coordinates of the break positions of
|
471
|
+
#' each position bin. If it has a second column, this is used as the name
|
472
|
+
#' of the position bin that ends at the given coordinate (the first row is
|
473
|
+
#' ignored). Any additional columns are currently ignored. If \code{NA},
|
474
|
+
#' position bins are determined by \code{pos.breaks}.
|
475
|
+
#' @param id.breaks
|
476
|
+
#' Breaks in the identity histogram. It can also be a vector of break
|
477
|
+
#' points, and values outside the range are ignored.
|
478
|
+
#' @param id.free.range
|
479
|
+
#' Indicates that the range should be freely set from the observed
|
480
|
+
#' values. Otherwise, 70-100\% is included in the identity histogram
|
481
|
+
#' (default).
|
482
|
+
#' @param id.metric
|
483
|
+
#' Metric of identity to be used (Y-axis). Corrected identity is only
|
484
|
+
#' supported if the original BLAST file included sequence lengths.
|
485
|
+
#' @param id.summary
|
486
|
+
#' Function summarizing the identity bins. Other recommended options
|
487
|
+
#' include: \code{median} to estimate the median instead of total bins, and
|
488
|
+
#' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
|
489
|
+
#' @param id.cutoff
|
490
|
+
#' Cutoff of identity metric above which the hits are considered
|
491
|
+
#' \code{in-group}. The 95\% identity corresponds to the expectation of
|
492
|
+
#' ANI<95\% within species.
|
493
|
+
#' @param threads
|
494
|
+
#' Number of threads to use.
|
495
|
+
#' @param verbose
|
496
|
+
#' Indicates if the function should report the advance.
|
497
|
+
#' @param ...
|
498
|
+
#' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
|
499
|
+
#'
|
500
|
+
#' @return Returns an object of class \code{\link{enve.RecPlot2}}.
|
501
|
+
#'
|
502
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
503
|
+
#' @author Kenji Gerhardt [aut]
|
504
|
+
#'
|
505
|
+
#' @export
|
506
|
+
|
355
507
|
enve.recplot2 <- function(
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
pos.breaks.tsv=NA,
|
370
|
-
### Path to a list of (absolute) coordinates to use as position breaks.
|
371
|
-
### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
|
372
|
-
### must contain at least one column: coordinates of the break positions of
|
373
|
-
### each position bin. If it has a second column, this is used as the name
|
374
|
-
### of the position bin that ends at the given coordinate (the first row is
|
375
|
-
### ignored). Any additional columns are currently ignored. If NA,
|
376
|
-
### position bins are determined by `pos.breaks`.
|
377
|
-
id.breaks=300,
|
378
|
-
### Breaks in the identity histogram. It can also be a vector of break
|
379
|
-
### points, and values outside the range are ignored.
|
380
|
-
id.free.range=FALSE,
|
381
|
-
### Indicates that the range should be freely set from the observed
|
382
|
-
### values. Otherwise, 70-100% is included in the identity histogram
|
383
|
-
### (default).
|
384
|
-
id.metric=c('identity', 'corrected identity', 'bit score'),
|
385
|
-
### Metric of identity to be used (Y-axis). Corrected identity is only
|
386
|
-
### supported if the original BLAST file included sequence lengths.
|
387
|
-
id.summary=sum,
|
388
|
-
### Function summarizing the identity bins. Other recommended options
|
389
|
-
### include: `median` to estimate the median instead of total bins, and
|
390
|
-
### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
|
391
|
-
id.cutoff=95,
|
392
|
-
### Cutoff of identity metric above which the hits are considered
|
393
|
-
### 'in-group'. The 95% identity corresponds to the expectation of
|
394
|
-
### ANI<95% within species.
|
395
|
-
threads=2,
|
396
|
-
### Number of threads to use.
|
397
|
-
verbose=TRUE,
|
398
|
-
### Indicates if the function should report the advance.
|
399
|
-
...
|
400
|
-
### Any additional parameters supported by `plot.enve.RecPlot2`.
|
401
|
-
){
|
508
|
+
prefix,
|
509
|
+
plot = TRUE,
|
510
|
+
pos.breaks = 1e3,
|
511
|
+
pos.breaks.tsv = NA,
|
512
|
+
id.breaks = 60,
|
513
|
+
id.free.range = FALSE,
|
514
|
+
id.metric = c('identity', 'corrected identity', 'bit score'),
|
515
|
+
id.summary = sum,
|
516
|
+
id.cutoff = 95,
|
517
|
+
threads = 2,
|
518
|
+
verbose = TRUE,
|
519
|
+
...
|
520
|
+
){
|
402
521
|
# Settings
|
403
522
|
id.metric <- match.arg(id.metric);
|
404
523
|
|
405
524
|
#Read files
|
406
|
-
if(verbose) cat("Reading files.\n")
|
407
|
-
rec <- read.table(paste(prefix, ".rec", sep
|
408
|
-
|
409
|
-
lim <- read.table(paste(prefix, ".lim", sep
|
410
|
-
|
525
|
+
if (verbose) cat("Reading files.\n")
|
526
|
+
rec <- read.table(paste(prefix, ".rec", sep = ""),
|
527
|
+
sep = "\t", comment.char = "", quote = "");
|
528
|
+
lim <- read.table(paste(prefix, ".lim", sep = ""),
|
529
|
+
sep = "\t", comment.char = "", quote = "", as.is = TRUE);
|
411
530
|
|
412
531
|
# Build matrix
|
413
|
-
if(verbose) cat("Building counts matrix.\n")
|
414
|
-
if(id.metric=="corrected identity" & ncol(rec)<6){
|
532
|
+
if (verbose) cat("Building counts matrix.\n")
|
533
|
+
if (id.metric == "corrected identity" & ncol(rec) < 6) {
|
415
534
|
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
416
535
|
}
|
417
|
-
rec.idcol <- ifelse(id.metric=="identity", 3,
|
418
|
-
|
536
|
+
rec.idcol <- ifelse(id.metric == "identity", 3,
|
537
|
+
ifelse(id.metric == "corrected identity", 6, 4))
|
419
538
|
pos.names <- as.character(NULL)
|
420
|
-
if(!is.na(pos.breaks.tsv)){
|
421
|
-
tmp <- read.table(pos.breaks.tsv, sep=
|
422
|
-
pos.breaks <- as.numeric(tmp[,1])
|
423
|
-
if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
|
424
|
-
}else if(length(pos.breaks)==1){
|
425
|
-
if(pos.breaks>0){
|
426
|
-
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1)
|
427
|
-
}else{
|
428
|
-
pos.breaks <- c(lim[1,2], lim[,3])
|
429
|
-
pos.names <- lim[,1]
|
539
|
+
if (!is.na(pos.breaks.tsv)){
|
540
|
+
tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
|
541
|
+
pos.breaks <- as.numeric(tmp[, 1])
|
542
|
+
if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
|
543
|
+
} else if (length(pos.breaks) == 1) {
|
544
|
+
if (pos.breaks > 0){
|
545
|
+
pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
|
546
|
+
} else {
|
547
|
+
pos.breaks <- c(lim[1, 2], lim[, 3])
|
548
|
+
pos.names <- lim[, 1]
|
430
549
|
}
|
431
550
|
}
|
432
|
-
if(length(id.breaks)==1){
|
433
|
-
id.range.v <- rec[,rec.idcol]
|
434
|
-
if(!id.free.range) id.range.v <- c(id.range.v,70,100)
|
551
|
+
if (length(id.breaks) == 1) {
|
552
|
+
id.range.v <- rec[, rec.idcol]
|
553
|
+
if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
|
435
554
|
id.range.v <- range(id.range.v)
|
436
|
-
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1)
|
555
|
+
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
|
437
556
|
}
|
438
557
|
|
439
558
|
# Run in parallel
|
440
|
-
|
441
|
-
|
442
|
-
rec
|
443
|
-
|
444
|
-
|
445
|
-
rec.l[[
|
446
|
-
|
447
|
-
|
559
|
+
# If they already set threads to 1 manually, there's no point in launching
|
560
|
+
# clusters, it's just slower. Ditto for small files.
|
561
|
+
if (nrow(rec) < 75000 | threads == 1) {
|
562
|
+
# Coerces rec into a form that __counts is happy about
|
563
|
+
rec.l <- list()
|
564
|
+
rec.l[[1]] <- list(rec = rec, verbose = FALSE)
|
565
|
+
|
566
|
+
# No need to make a temporary variable, there's only one return for sure
|
567
|
+
# and it's not a list because it isn't coming back from an apply
|
568
|
+
counts <- enve.recplot2.__counts(
|
569
|
+
rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
|
570
|
+
rec.idcol = rec.idcol)
|
571
|
+
} else {
|
572
|
+
cl <- makeCluster(threads)
|
573
|
+
rec.l <- list()
|
574
|
+
thl <- ceiling(nrow(rec)/threads)
|
575
|
+
for (i in 0:(threads - 1)) {
|
576
|
+
rec.l[[i + 1]] <- list(
|
577
|
+
rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
|
578
|
+
verbose = ifelse(i == 0, verbose, FALSE))
|
579
|
+
}
|
580
|
+
counts.l <- clusterApply(
|
581
|
+
cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
|
582
|
+
id.breaks = id.breaks, rec.idcol = rec.idcol)
|
583
|
+
stopCluster(cl) # No spooky ghost clusters
|
584
|
+
|
585
|
+
counts <- counts.l[[1]]
|
586
|
+
for (i in 2:threads) counts <- counts + counts.l[[i]]
|
448
587
|
}
|
449
|
-
counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
|
450
|
-
pos.breaks=pos.breaks, id.breaks=id.breaks,
|
451
|
-
rec.idcol=rec.idcol)
|
452
|
-
counts <- counts.l[[1]]
|
453
|
-
if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
|
454
|
-
stopCluster(cl)
|
455
588
|
|
456
589
|
# Estimate 1D histograms
|
457
|
-
if(verbose) cat("Building histograms.\n")
|
458
|
-
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
|
590
|
+
if (verbose) cat("Building histograms.\n")
|
591
|
+
id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
|
459
592
|
id.ingroup <- (id.mids > id.cutoff);
|
460
593
|
id.counts <- apply(counts, 2, id.summary);
|
461
|
-
pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
|
462
|
-
pos.counts.out <- apply(counts[
|
594
|
+
pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
|
595
|
+
pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
|
463
596
|
|
464
597
|
# Plot and return
|
465
598
|
recplot <- new('enve.RecPlot2',
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
if(plot){
|
473
|
-
if(verbose) cat("Plotting.\n")
|
599
|
+
counts = counts, id.counts = id.counts,
|
600
|
+
pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
|
601
|
+
id.breaks = id.breaks, pos.breaks = pos.breaks,
|
602
|
+
pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
|
603
|
+
seq.names = lim[, 1], id.ingroup = id.ingroup,
|
604
|
+
id.metric = id.metric, call = match.call());
|
605
|
+
if (plot) {
|
606
|
+
if (verbose) cat("Plotting.\n")
|
474
607
|
peaks <- plot(recplot, ...);
|
475
608
|
attr(recplot, "peaks") <- peaks
|
476
609
|
}
|
477
610
|
return(recplot);
|
478
|
-
### Returns an object of class `enve.RecPlot2`.
|
479
611
|
}
|
480
612
|
|
613
|
+
#' Enveomics: Recruitment Plot (2) Peak Finder
|
614
|
+
#'
|
615
|
+
#' Identifies peaks in the population histogram potentially indicating
|
616
|
+
#' sub-population mixtures.
|
617
|
+
#'
|
618
|
+
#' @param x
|
619
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
620
|
+
#' @param method
|
621
|
+
#' Peak-finder method. This should be one of:
|
622
|
+
#' \itemize{
|
623
|
+
#' \item \strong{emauto}
|
624
|
+
#' (Expectation-Maximization with auto-selection of components)
|
625
|
+
#' \item \strong{em}
|
626
|
+
#' (Expectation-Maximization)
|
627
|
+
#' \item \strong{mower}
|
628
|
+
#' (Custom distribution-mowing method)
|
629
|
+
#' }
|
630
|
+
#' @param ...
|
631
|
+
#' Any additional parameters supported by
|
632
|
+
#' \code{\link{enve.recplot2.findPeaks}}.
|
633
|
+
#'
|
634
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
635
|
+
#'
|
636
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
637
|
+
#'
|
638
|
+
#' export
|
639
|
+
|
481
640
|
enve.recplot2.findPeaks <- function(
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
method="emauto",
|
487
|
-
### Peak-finder method. This should be one of:
|
488
|
-
### "emauto" (Expectation-Maximization with auto-selection of components),
|
489
|
-
### "em" (Expectation-Maximization),
|
490
|
-
### "mower" (Custom distribution-mowing method).
|
491
|
-
...
|
492
|
-
### Any additional parameters supported by
|
493
|
-
### `enve.recplot2.findPeaks.<method>`.
|
494
|
-
){
|
641
|
+
x,
|
642
|
+
method="emauto",
|
643
|
+
...
|
644
|
+
){
|
495
645
|
if(method == "emauto"){
|
496
646
|
peaks <- enve.recplot2.findPeaks.emauto(x, ...)
|
497
647
|
}else if(method == "em"){
|
@@ -502,29 +652,44 @@ enve.recplot2.findPeaks <- function(
|
|
502
652
|
stop("Invalid peak-finder method ", method)
|
503
653
|
}
|
504
654
|
return(peaks)
|
505
|
-
### Returns a list of `enve.RecPlot2.Peak` objects.
|
506
655
|
}
|
507
656
|
|
657
|
+
#' Enveomics: Recruitment Plot (2) Emauto Peak Finder
|
658
|
+
#'
|
659
|
+
#' Identifies peaks in the population histogram using a Gaussian Mixture
|
660
|
+
#' Model Expectation Maximization (GMM-EM) method with number of components
|
661
|
+
#' automatically detected.
|
662
|
+
#'
|
663
|
+
#' @param x
|
664
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
665
|
+
#' @param components
|
666
|
+
#' A vector of number of components to evaluate.
|
667
|
+
#' @param criterion
|
668
|
+
#' Criterion to use for components selection. Must be one of:
|
669
|
+
#' \code{aic} (Akaike Information Criterion),
|
670
|
+
#' \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).
|
671
|
+
#' @param merge.tol
|
672
|
+
#' When attempting to merge peaks with very similar sequencing depth, use
|
673
|
+
#' this number of significant digits (in log-scale).
|
674
|
+
#' @param verbose
|
675
|
+
#' Display (mostly debugging) information.
|
676
|
+
#' @param ...
|
677
|
+
#' Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.
|
678
|
+
#'
|
679
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
680
|
+
#'
|
681
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
682
|
+
#'
|
683
|
+
#' @export
|
684
|
+
|
508
685
|
enve.recplot2.findPeaks.emauto <- function(
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
criterion='aic',
|
517
|
-
### Criterion to use for components selection. Must be one of:
|
518
|
-
### 'aic' (Akaike Information Criterion),
|
519
|
-
### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
|
520
|
-
merge.tol=2L,
|
521
|
-
### When attempting to merge peaks with very similar sequencing depth, use
|
522
|
-
### this number of significant digits (in log-scale).
|
523
|
-
verbose=FALSE,
|
524
|
-
### Display (mostly debugging) information.
|
525
|
-
...
|
526
|
-
### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
|
527
|
-
){
|
686
|
+
x,
|
687
|
+
components=seq(1,10),
|
688
|
+
criterion='aic',
|
689
|
+
merge.tol=2L,
|
690
|
+
verbose=FALSE,
|
691
|
+
...
|
692
|
+
){
|
528
693
|
best <- list(crit=0, pstore=list())
|
529
694
|
if(criterion == 'aic'){
|
530
695
|
do_crit <- function(ll, k, n) 2*k - 2*ll
|
@@ -536,55 +701,72 @@ enve.recplot2.findPeaks.emauto <- function(
|
|
536
701
|
for(comp in components){
|
537
702
|
if(verbose) cat('Testing:',comp,'\n')
|
538
703
|
best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
|
539
|
-
|
704
|
+
verbose, ...)
|
540
705
|
}
|
541
706
|
if(length(best[['peaks']])==0) return(list())
|
542
707
|
|
543
708
|
seqdepths.r <- signif(log(sapply(best[['peaks']],
|
544
|
-
|
709
|
+
function(x) x$seq.depth)), merge.tol)
|
545
710
|
distinct <- length(unique(seqdepths.r))
|
546
711
|
if(distinct < length(best[['peaks']])){
|
547
712
|
if(verbose) cat('Attempting merge to', distinct, 'components\n')
|
548
713
|
init <- apply(sapply(best[['peaks']],
|
549
|
-
|
714
|
+
function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
|
550
715
|
init <- init[!duplicated(seqdepths.r),]
|
551
716
|
init <- list(mu=init[,'mean'], sd=init[,'sd'],
|
552
|
-
|
717
|
+
alpha=init[,'alpha']/sum(init[,'alpha']))
|
553
718
|
best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
|
554
|
-
|
719
|
+
verbose, ...)
|
555
720
|
}
|
556
721
|
return(best[['peaks']])
|
557
|
-
### Returns a list of `enve.RecPlot2.Peak` objects.
|
558
722
|
}
|
559
723
|
|
724
|
+
#' Enveomics: Recruitment Plot (2) Em Peak Finder
|
725
|
+
#'
|
726
|
+
#' Identifies peaks in the population histogram using a Gaussian Mixture
|
727
|
+
#' Model Expectation Maximization (GMM-EM) method.
|
728
|
+
#'
|
729
|
+
#' @param x
|
730
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
731
|
+
#' @param max.iter
|
732
|
+
#' Maximum number of EM iterations.
|
733
|
+
#' @param ll.diff.res
|
734
|
+
#' Maximum Log-Likelihood difference to be considered as convergent.
|
735
|
+
#' @param components
|
736
|
+
#' Number of distributions assumed in the mixture.
|
737
|
+
#' @param rm.top
|
738
|
+
#' Top-values to remove before finding peaks, as a quantile probability.
|
739
|
+
#' This step is useful to remove highly conserved regions, but can be
|
740
|
+
#' turned off by setting \code{rm.top=0}. The quantile is determined
|
741
|
+
#' \strong{after} removing zero-coverage windows.
|
742
|
+
#' @param verbose
|
743
|
+
#' Display (mostly debugging) information.
|
744
|
+
#' @param init
|
745
|
+
#' Initialization parameters. By default, these are derived from k-means
|
746
|
+
#' clustering. A named list with vectors for \code{mu}, \code{sd}, and
|
747
|
+
#' \code{alpha}, each of length \code{components}.
|
748
|
+
#' @param log
|
749
|
+
#' Logical value indicating if the estimations should be performed in
|
750
|
+
#' natural logarithm units. Do not change unless you know what you're
|
751
|
+
#' doing.
|
752
|
+
#'
|
753
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
754
|
+
#'
|
755
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
756
|
+
#'
|
757
|
+
#' @export
|
758
|
+
|
560
759
|
enve.recplot2.findPeaks.em <- function(
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
rm.top=0.05,
|
572
|
-
### Top-values to remove before finding peaks, as a quantile probability.
|
573
|
-
### This step is useful to remove highly conserved regions, but can be
|
574
|
-
### turned off by setting rm.top=0. The quantile is determined *after*
|
575
|
-
### removing zero-coverage windows.
|
576
|
-
verbose=FALSE,
|
577
|
-
### Display (mostly debugging) information.
|
578
|
-
init,
|
579
|
-
### Initialization parameters. By default, these are derived from k-means
|
580
|
-
### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
|
581
|
-
### of length `components`.
|
582
|
-
log=TRUE
|
583
|
-
### Logical value indicating if the estimations should be performed in
|
584
|
-
### natural logarithm units. Do not change unless you know what you're
|
585
|
-
### doing.
|
586
|
-
){
|
587
|
-
|
760
|
+
x,
|
761
|
+
max.iter=1000,
|
762
|
+
ll.diff.res=1e-8,
|
763
|
+
components=2,
|
764
|
+
rm.top=0.05,
|
765
|
+
verbose=FALSE,
|
766
|
+
init,
|
767
|
+
log=TRUE
|
768
|
+
){
|
769
|
+
|
588
770
|
# Essential vars
|
589
771
|
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
|
590
772
|
lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
|
@@ -603,7 +785,7 @@ enve.recplot2.findPeaks.em <- function(
|
|
603
785
|
m.step <- init
|
604
786
|
ll <- c()
|
605
787
|
cur.ll <- -Inf
|
606
|
-
|
788
|
+
|
607
789
|
for(i in 1:max.iter){
|
608
790
|
# 2/3. EM
|
609
791
|
e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
|
@@ -621,224 +803,297 @@ enve.recplot2.findPeaks.em <- function(
|
|
621
803
|
for(i in 1:components){
|
622
804
|
n.hat <- m.step[['alpha']][i]*length(lsd1)
|
623
805
|
peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
806
|
+
values.res=0, mode=m.step[['mu']][i],
|
807
|
+
param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
|
808
|
+
n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
|
809
|
+
merge.logdist=as.numeric(), log=log,
|
810
|
+
seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
|
629
811
|
}
|
630
812
|
return(peaks)
|
631
|
-
### Returns a list of `enve.RecPlot2.Peak` objects.
|
632
813
|
}
|
633
814
|
|
815
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder
|
816
|
+
#'
|
817
|
+
#' Identifies peaks in the population histogram potentially indicating
|
818
|
+
#' sub-population mixtures, using a custom distribution-mowing method.
|
819
|
+
#'
|
820
|
+
#' @param x
|
821
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
822
|
+
#' @param min.points
|
823
|
+
#' Minimum number of points in the quantile-estimation-range
|
824
|
+
#' \code{(quant.est)} to estimate a peak.
|
825
|
+
#' @param quant.est
|
826
|
+
#' Range of quantiles to be used in the estimation of a peak's
|
827
|
+
#' parameters.
|
828
|
+
#' @param mlv.opts
|
829
|
+
#' Ignored. For backwards compatibility.
|
830
|
+
#' @param fitdist.opts.sn
|
831
|
+
#' Options passed to \code{fitdist} to estimate the standard deviation if
|
832
|
+
#' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
|
833
|
+
#' ammended with \code{xi=estimated} mode for each peak.
|
834
|
+
#' @param fitdist.opts.norm
|
835
|
+
#' Options passed to \code{fitdist} to estimate the standard deviation if
|
836
|
+
#' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
|
837
|
+
#' ammended with \code{mean=estimated} mode for each peak.
|
838
|
+
#' @param rm.top
|
839
|
+
#' Top-values to remove before finding peaks, as a quantile probability.
|
840
|
+
#' This step is useful to remove highly conserved regions, but can be
|
841
|
+
#' turned off by setting \code{rm.top=0}. The quantile is determined
|
842
|
+
#' \strong{after} removing zero-coverage windows.
|
843
|
+
#' @param with.skewness
|
844
|
+
#' Allow skewness correction of the peaks. Typically, the
|
845
|
+
#' sequencing-depth distribution for a single peak is left-skewed, due
|
846
|
+
#' partly (but not exclusively) to fragmentation and mapping sensitivity.
|
847
|
+
#' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
|
848
|
+
#' alternative solution for the first problem (fragmentation) called
|
849
|
+
#' "tail distribution".
|
850
|
+
#' @param optim.rounds
|
851
|
+
#' Maximum rounds of peak optimization.
|
852
|
+
#' @param optim.epsilon
|
853
|
+
#' Trace change at which optimization stops (unless \code{optim.rounds} is
|
854
|
+
#' reached first). The trace change is estimated as the sum of square
|
855
|
+
#' differences between parameters in one round and those from two rounds
|
856
|
+
#' earlier (to avoid infinite loops from approximation).
|
857
|
+
#' @param merge.logdist
|
858
|
+
#' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
|
859
|
+
#' to attempt merging. The default of ~0.22 corresponds to a maximum
|
860
|
+
#' difference of 25\%.
|
861
|
+
#' @param verbose
|
862
|
+
#' Display (mostly debugging) information.
|
863
|
+
#' @param log
|
864
|
+
#' Logical value indicating if the estimations should be performed in
|
865
|
+
#' natural logarithm units. Do not change unless you know what you're
|
866
|
+
#' doing.
|
867
|
+
#'
|
868
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
869
|
+
#'
|
870
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
871
|
+
#'
|
872
|
+
#' @export
|
873
|
+
|
634
874
|
enve.recplot2.findPeaks.mower <- function(
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
fitdist.opts <- fitdist.opts.norm
|
698
|
-
}
|
699
|
-
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
700
|
-
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
701
|
-
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
702
|
-
n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
|
703
|
-
|
704
|
-
# Find seed peaks
|
705
|
-
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
706
|
-
peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
|
707
|
-
|
708
|
-
# Merge overlapping peaks
|
709
|
-
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
710
|
-
merged <- (length(peaks)>1)
|
711
|
-
while(merged){
|
712
|
-
merged <- FALSE
|
713
|
-
ignore <- c()
|
714
|
-
peaks2 <- list();
|
715
|
-
for(i in 1:length(peaks)){
|
716
|
-
if(i %in% ignore) next
|
717
|
-
p <- peaks[[ i ]]
|
718
|
-
j <- enve.recplot2.__whichClosestPeak(p, peaks)
|
719
|
-
p2 <- peaks[[ j ]]
|
720
|
-
dst.a <- p$param.hat[[ length(p$param.hat) ]]
|
721
|
-
dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
|
722
|
-
if( abs(log(dst.a/dst.b)) < merge.logdist ){
|
723
|
-
if(verbose) cat('==> Attempting a merge at',
|
724
|
-
p$param.hat[[ length(p$param.hat) ]],'&',
|
725
|
-
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
726
|
-
peaks.opts$lsd1 <- c(p$values, p2$values)
|
727
|
-
p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
|
728
|
-
if(length(p.new)==1){
|
729
|
-
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
730
|
-
ignore <- c(ignore, j)
|
731
|
-
merged <- TRUE
|
732
|
-
}
|
733
|
-
}
|
734
|
-
if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
|
875
|
+
x,
|
876
|
+
min.points=10,
|
877
|
+
quant.est=c(0.002, 0.998),
|
878
|
+
mlv.opts=list(method='parzen'),
|
879
|
+
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
880
|
+
start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
|
881
|
+
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
|
882
|
+
start=list(sd=1), lower=c(0, -Inf)),
|
883
|
+
rm.top=0.05,
|
884
|
+
with.skewness=TRUE,
|
885
|
+
optim.rounds=200,
|
886
|
+
optim.epsilon=1e-4,
|
887
|
+
merge.logdist=log(1.75),
|
888
|
+
verbose=FALSE,
|
889
|
+
log=TRUE
|
890
|
+
){
|
891
|
+
|
892
|
+
# Essential vars
|
893
|
+
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
|
894
|
+
seqdepth.in <- x$pos.counts.in/pos.binsize;
|
895
|
+
lsd1 <- seqdepth.in[seqdepth.in>0];
|
896
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
897
|
+
if(log) lsd1 <- log(lsd1)
|
898
|
+
if(with.skewness){
|
899
|
+
fitdist.opts <- fitdist.opts.sn
|
900
|
+
}else{
|
901
|
+
fitdist.opts <- fitdist.opts.norm
|
902
|
+
}
|
903
|
+
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
904
|
+
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
905
|
+
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
906
|
+
n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
|
907
|
+
|
908
|
+
# Find seed peaks
|
909
|
+
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
910
|
+
peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
|
911
|
+
|
912
|
+
# Merge overlapping peaks
|
913
|
+
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
914
|
+
merged <- (length(peaks)>1)
|
915
|
+
while(merged){
|
916
|
+
merged <- FALSE
|
917
|
+
ignore <- c()
|
918
|
+
peaks2 <- list();
|
919
|
+
for(i in 1:length(peaks)){
|
920
|
+
if(i %in% ignore) next
|
921
|
+
p <- peaks[[ i ]]
|
922
|
+
j <- enve.recplot2.__whichClosestPeak(p, peaks)
|
923
|
+
p2 <- peaks[[ j ]]
|
924
|
+
dst.a <- p$param.hat[[ length(p$param.hat) ]]
|
925
|
+
dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
|
926
|
+
if( abs(log(dst.a/dst.b)) < merge.logdist ){
|
927
|
+
if(verbose) cat('==> Attempting a merge at',
|
928
|
+
p$param.hat[[ length(p$param.hat) ]],'&',
|
929
|
+
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
930
|
+
peaks.opts$lsd1 <- c(p$values, p2$values)
|
931
|
+
p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
|
932
|
+
if(length(p.new)==1){
|
933
|
+
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
934
|
+
ignore <- c(ignore, j)
|
935
|
+
merged <- TRUE
|
936
|
+
}
|
735
937
|
}
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
938
|
+
if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
|
939
|
+
}
|
940
|
+
peaks <- peaks2
|
941
|
+
if(length(peaks)==1) break
|
942
|
+
}
|
943
|
+
|
944
|
+
if(verbose) cat('Found',length(peaks),'peak(s)\n')
|
945
|
+
return(peaks);
|
743
946
|
}
|
744
947
|
|
745
948
|
#==============> Define utils
|
949
|
+
|
950
|
+
#' Enveomics: Recruitment Plot (2) Core Peak Finder
|
951
|
+
#'
|
952
|
+
#' Finds the peak in a list of peaks that is most likely to represent the
|
953
|
+
#' "core genome" of a population.
|
954
|
+
#'
|
955
|
+
#' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
|
956
|
+
#'
|
957
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
958
|
+
#'
|
959
|
+
#' @export
|
960
|
+
|
746
961
|
enve.recplot2.corePeak <- function
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
|
771
|
-
}
|
772
|
-
return(corePeak)
|
962
|
+
(x
|
963
|
+
){
|
964
|
+
# Find the peak with maximum depth (centrality)
|
965
|
+
maxPeak <- x[[
|
966
|
+
which.max(as.numeric(lapply(x,
|
967
|
+
function(y) y$param.hat[[ length(y$param.hat) ]])))
|
968
|
+
]]
|
969
|
+
# If a "larger" peak (a peak explaining more bins of the genome) is within
|
970
|
+
# the default "merge.logdist" distance, take that one instead.
|
971
|
+
corePeak <- maxPeak
|
972
|
+
for(p in x){
|
973
|
+
p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
|
974
|
+
corePeak.len <- ifelse(
|
975
|
+
length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
|
976
|
+
sz.d <- log(p.len/corePeak.len)
|
977
|
+
if(is.nan(sz.d) || sz.d < 0) next
|
978
|
+
sq.d.a <- as.numeric(tail(p$param.hat, n=1))
|
979
|
+
sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
|
980
|
+
if(p$log) sq.d.a <- exp(sq.d.a)
|
981
|
+
if(corePeak$log) sq.d.b <- exp(sq.d.b)
|
982
|
+
if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
|
983
|
+
}
|
984
|
+
return(corePeak)
|
773
985
|
}
|
774
986
|
|
987
|
+
#' Enveomics: Recruitment Plot (2) Change Cutoff
|
988
|
+
#'
|
989
|
+
#' Change the intra-species cutoff of an existing recruitment plot.
|
990
|
+
#'
|
991
|
+
#' @param rp
|
992
|
+
#' \code{\link{enve.RecPlot2}} object.
|
993
|
+
#' @param new.cutoff
|
994
|
+
#' New cutoff to use.
|
995
|
+
#'
|
996
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
997
|
+
#'
|
998
|
+
#' @export
|
999
|
+
|
775
1000
|
enve.recplot2.changeCutoff <- function
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
attr(rp, "pos.counts.out") <- pos.counts.out
|
791
|
-
attr(rp, "call") <- match.call()
|
792
|
-
return(rp)
|
1001
|
+
(rp,
|
1002
|
+
new.cutoff=98
|
1003
|
+
){
|
1004
|
+
# Re-calculate vectors
|
1005
|
+
id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
|
1006
|
+
id.ingroup <- (id.mids > new.cutoff)
|
1007
|
+
pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
|
1008
|
+
pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
|
1009
|
+
# Update object
|
1010
|
+
attr(rp, "id.ingroup") <- id.ingroup
|
1011
|
+
attr(rp, "pos.counts.in") <- pos.counts.in
|
1012
|
+
attr(rp, "pos.counts.out") <- pos.counts.out
|
1013
|
+
attr(rp, "call") <- match.call()
|
1014
|
+
return(rp)
|
793
1015
|
}
|
794
1016
|
|
1017
|
+
#' Enveomics: Recruitment Plot (2) Window Depth Threshold
|
1018
|
+
#'
|
1019
|
+
#' Identifies the threshold below which windows should be identified as
|
1020
|
+
#' variable or absent.
|
1021
|
+
#'
|
1022
|
+
#' @param rp
|
1023
|
+
#' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
|
1024
|
+
#' @param peak
|
1025
|
+
#' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
|
1026
|
+
#' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
|
1027
|
+
#' used (see \code{\link{enve.recplot2.corePeak}}).
|
1028
|
+
#' @param lower.tail
|
1029
|
+
#' If \code{FALSE}, it returns windows significantly above the peak in
|
1030
|
+
#' sequencing depth.
|
1031
|
+
#' @param significance
|
1032
|
+
#' Significance threshold (alpha) to select windows.
|
1033
|
+
#'
|
1034
|
+
#' @return
|
1035
|
+
#' Returns a float. The units are depth if the peaks were estimated in
|
1036
|
+
#' linear scale, or log-depth otherwise (\code{peak$log}).
|
1037
|
+
#'
|
1038
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1039
|
+
#'
|
1040
|
+
#' @export
|
1041
|
+
|
795
1042
|
enve.recplot2.windowDepthThreshold <- function
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
### Significance threshold (alpha) to select windows.
|
809
|
-
){
|
810
|
-
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
811
|
-
par <- peak$param.hat
|
812
|
-
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
813
|
-
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
814
|
-
if(peak$log) thr <- exp(thr)
|
815
|
-
|
816
|
-
return(thr)
|
817
|
-
### Returns a float. The units are depth if the peaks were estimated in
|
818
|
-
### linear scale, or log-depth otherwise (`peak$log`).
|
1043
|
+
(rp,
|
1044
|
+
peak,
|
1045
|
+
lower.tail=TRUE,
|
1046
|
+
significance=0.05
|
1047
|
+
){
|
1048
|
+
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
1049
|
+
par <- peak$param.hat
|
1050
|
+
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
1051
|
+
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
1052
|
+
if(peak$log) thr <- exp(thr)
|
1053
|
+
|
1054
|
+
return(thr)
|
819
1055
|
}
|
820
1056
|
|
1057
|
+
#' Enveomics: Recruitment Plot (2) Extract Windows
|
1058
|
+
#'
|
1059
|
+
#' Extract windows significantly below (or above) the peak in sequencing
|
1060
|
+
#' depth.
|
1061
|
+
#'
|
1062
|
+
#' @param rp
|
1063
|
+
#' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
|
1064
|
+
#' @param peak
|
1065
|
+
#' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
|
1066
|
+
#' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
|
1067
|
+
#' used (see \code{\link{enve.recplot2.corePeak}}).
|
1068
|
+
#' @param lower.tail
|
1069
|
+
#' If \code{FALSE}, it returns windows significantly above the peak in
|
1070
|
+
#' sequencing depth.
|
1071
|
+
#' @param significance
|
1072
|
+
#' Significance threshold (alpha) to select windows.
|
1073
|
+
#' @param seq.names
|
1074
|
+
#' Returns subject sequence names instead of a vector of Booleans. If
|
1075
|
+
#' the recruitment plot was generated with named position bins (e.g, using
|
1076
|
+
#' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
|
1077
|
+
#' vector of characters (the sequence identifiers), otherwise it returns a
|
1078
|
+
#' data.frame with a name column and two columns of coordinates.
|
1079
|
+
#'
|
1080
|
+
#' @return
|
1081
|
+
#' Returns a vector of logicals if \code{seq.names = FALSE}.
|
1082
|
+
#' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
|
1083
|
+
#' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
|
1084
|
+
#' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
|
1085
|
+
#'
|
1086
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1087
|
+
#'
|
1088
|
+
#' @export
|
1089
|
+
|
821
1090
|
enve.recplot2.extractWindows <- function
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
|
829
|
-
### used (see `enve.recplot2.corePeak`).
|
830
|
-
lower.tail=TRUE,
|
831
|
-
### If FALSE, it returns windows significantly above the peak in
|
832
|
-
### sequencing depth.
|
833
|
-
significance=0.05,
|
834
|
-
### Significance threshold (alpha) to select windows.
|
835
|
-
seq.names=FALSE
|
836
|
-
### Returns subject sequence names instead of a vector of Booleans. If
|
837
|
-
### the recruitment plot was generated with named position bins (e.g, using
|
838
|
-
### `pos.breaks`=0 or a two-column `pos.breaks.tsv`), it returns a vector of
|
839
|
-
### characters (the sequence identifiers), otherwise it returns a data.frame
|
840
|
-
### with a name column and two columns of coordinates.
|
841
|
-
){
|
1091
|
+
(rp,
|
1092
|
+
peak,
|
1093
|
+
lower.tail = TRUE,
|
1094
|
+
significance = 0.05,
|
1095
|
+
seq.names = FALSE
|
1096
|
+
){
|
842
1097
|
# Determine the threshold
|
843
1098
|
thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
|
844
1099
|
|
@@ -850,43 +1105,54 @@ enve.recplot2.extractWindows <- function
|
|
850
1105
|
sel <- seqdepth.in > thr
|
851
1106
|
}
|
852
1107
|
|
853
|
-
# seq.names=FALSE
|
1108
|
+
# seq.names = FALSE
|
854
1109
|
if(!seq.names) return(sel)
|
855
|
-
# seq.names=TRUE
|
856
|
-
|
857
|
-
# seq.names=TRUE and pos.names undefined
|
858
|
-
return(enve.recplot2.coordinates(rp,sel))
|
859
|
-
### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
|
860
|
-
### it returns a vector of characters if the object has `pos.names` defined,
|
861
|
-
### or a data.frame with four columns otherwise:
|
862
|
-
### name.from, name.to, pos.from, and pos.to
|
863
|
-
### (see `enve.recplot2.coordinates`).
|
1110
|
+
# seq.names = TRUE
|
1111
|
+
return(enve.recplot2.coordinates(rp, sel))
|
864
1112
|
}
|
865
1113
|
|
1114
|
+
#' Enveomics: Recruitment Plot (2) Compare Identities
|
1115
|
+
#'
|
1116
|
+
#' Compare the distribution of identities between two
|
1117
|
+
#' \code{\link{enve.RecPlot2}} objects.
|
1118
|
+
#'
|
1119
|
+
#' @param x
|
1120
|
+
#' First \code{\link{enve.RecPlot2}} object.
|
1121
|
+
#' @param y
|
1122
|
+
#' Second \code{\link{enve.RecPlot2}} object.
|
1123
|
+
#' @param method
|
1124
|
+
#' Distance method to use. This should be (an unambiguous abbreviation of)
|
1125
|
+
#' one of:
|
1126
|
+
#' \itemize{
|
1127
|
+
#' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
|
1128
|
+
#' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
|
1129
|
+
#' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
|
1130
|
+
#' doi:10.1214/aoms/1177729694}), or}
|
1131
|
+
#' \item{"euclidean"}
|
1132
|
+
#' }
|
1133
|
+
#' @param smooth.par
|
1134
|
+
#' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
|
1135
|
+
#' Use \code{NULL} to automatically determine this value using leave-one-out
|
1136
|
+
#' cross-validation (see \code{smooth.spline} parameter \code{spar}).
|
1137
|
+
#' @param pseudocounts
|
1138
|
+
#' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
|
1139
|
+
#' 1 for add-one smoothing.
|
1140
|
+
#' @param max.deviation
|
1141
|
+
#' Maximum mean deviation between identity breaks tolerated (as percent
|
1142
|
+
#' identity). Difference in number of \code{id.breaks} is never tolerated.
|
1143
|
+
#'
|
1144
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1145
|
+
#'
|
1146
|
+
#' @export
|
1147
|
+
|
866
1148
|
enve.recplot2.compareIdentities <- function
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
### one of:
|
875
|
-
### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
|
876
|
-
### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
|
877
|
-
### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
|
878
|
-
### doi:10.1214/aoms/1177729694), or "euclidean".
|
879
|
-
smooth.par=NULL,
|
880
|
-
### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
|
881
|
-
### Use NULL to automatically determine this value using leave-one-out
|
882
|
-
### cross-validation (see `smooth.spline` parameter `spar`).
|
883
|
-
pseudocounts=0,
|
884
|
-
### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
|
885
|
-
### 1 for add-one smoothing.
|
886
|
-
max.deviation=0.75
|
887
|
-
### Maximum mean deviation between identity breaks tolerated (as percent
|
888
|
-
### identity). Difference in number of id.breaks is never tolerated.
|
889
|
-
){
|
1149
|
+
(x,
|
1150
|
+
y,
|
1151
|
+
method="hellinger",
|
1152
|
+
smooth.par=NULL,
|
1153
|
+
pseudocounts=0,
|
1154
|
+
max.deviation=0.75
|
1155
|
+
){
|
890
1156
|
METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
|
891
1157
|
i.meth <- pmatch(method, METHODS)
|
892
1158
|
if (is.na(i.meth)) stop("Invalid distance ", method)
|
@@ -899,7 +1165,7 @@ enve.recplot2.compareIdentities <- function
|
|
899
1165
|
dev <- mean(abs(x$id.breaks - y$id.breaks))
|
900
1166
|
if(dev > max.deviation)
|
901
1167
|
stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
|
902
|
-
|
1168
|
+
dev)
|
903
1169
|
x.cnt <- x$id.counts
|
904
1170
|
y.cnt <- y$id.counts
|
905
1171
|
if(is.null(smooth.par) || smooth.par > 0){
|
@@ -928,63 +1194,92 @@ enve.recplot2.compareIdentities <- function
|
|
928
1194
|
return(d)
|
929
1195
|
}
|
930
1196
|
|
1197
|
+
#' Enveomics: Recruitment Plot (2) Coordinates
|
1198
|
+
#'
|
1199
|
+
#' Returns the sequence name and coordinates of the requested position bins.
|
1200
|
+
#'
|
1201
|
+
#' @param x
|
1202
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1203
|
+
#' @param bins
|
1204
|
+
#' Vector of selected bins to return. It can be a vector of logical values
|
1205
|
+
#' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
|
1206
|
+
#' missing, returns the coordinates of all windows.
|
1207
|
+
#'
|
1208
|
+
#' @return
|
1209
|
+
#' Returns a data.frame with five columns: \code{name.from} (character),
|
1210
|
+
#' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
|
1211
|
+
#' (numeric), and \code{seq.name} (character).
|
1212
|
+
#' The first two correspond to sequence and position of the start point of the
|
1213
|
+
#' bin. The next two correspond to the sequence and position of the end point of
|
1214
|
+
#' the bin. The last one indicates the name of the sequence (if defined).
|
1215
|
+
#'
|
1216
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1217
|
+
#'
|
1218
|
+
#' @export
|
1219
|
+
|
931
1220
|
enve.recplot2.coordinates <- function
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
bins
|
936
|
-
### Vector of selected bins to return. It can be a vector of logical values
|
937
|
-
### with the same length as `x$pos.breaks`-1 or a vector of integers. If
|
938
|
-
### missing, returns the coordinates of all windows.
|
939
|
-
){
|
1221
|
+
(x,
|
1222
|
+
bins
|
1223
|
+
){
|
940
1224
|
if(!inherits(x, "enve.RecPlot2"))
|
941
1225
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
942
1226
|
if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
|
943
1227
|
if(!is.vector(bins)) stop("'bins' must be a vector")
|
944
1228
|
if(inherits(bins, "logical")) bins <- which(bins)
|
945
1229
|
|
946
|
-
y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
|
947
|
-
|
1230
|
+
y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
|
1231
|
+
|
948
1232
|
for(i in 1:length(bins)){
|
949
1233
|
j <- bins[i]
|
950
1234
|
# Concatenated coordinates
|
951
1235
|
cc <- x$pos.breaks[c(j, j+1)]
|
952
1236
|
# Find the corresponding `seq.breaks`
|
953
1237
|
sb.from <- which(
|
954
|
-
|
955
|
-
|
1238
|
+
cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
|
1239
|
+
cc[1] < x$seq.breaks[-1])
|
956
1240
|
sb.to <- which(
|
957
|
-
|
958
|
-
|
1241
|
+
cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
|
1242
|
+
cc[2] <= x$seq.breaks[-1])
|
959
1243
|
# Translate coordinates
|
960
1244
|
if(length(sb.from)==1 & length(sb.to)==1){
|
961
1245
|
y[i, 'name.from'] <- x$seq.names[sb.from]
|
962
|
-
y[i, 'pos.from']
|
1246
|
+
y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
|
963
1247
|
y[i, 'name.to'] <- x$seq.names[sb.to]
|
964
|
-
y[i, 'pos.to']
|
1248
|
+
y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
|
1249
|
+
y[i, 'seq.name'] <- x$pos.names[i]
|
965
1250
|
}
|
966
1251
|
}
|
967
1252
|
|
968
1253
|
return(y)
|
969
|
-
### Returns a data.frame with four columns: name.from (character), pos.from
|
970
|
-
### (numeric) name.to (character), and pos.to (numeric). The first two
|
971
|
-
### correspond to sequence and position of the start point of the bin, the
|
972
|
-
### last two correspond to the sequence and position of the end point of the
|
973
|
-
### bin.
|
974
1254
|
}
|
975
1255
|
|
1256
|
+
#' Enveomics: Recruitment Plot (2) Sequencing Depth
|
1257
|
+
#'
|
1258
|
+
#' Calculate the sequencing depth of the given window(s).
|
1259
|
+
#'
|
1260
|
+
#' @param x
|
1261
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1262
|
+
#' @param sel
|
1263
|
+
#' Window(s) for which the sequencing depth is to be calculated. If not
|
1264
|
+
#' passed, it returns the sequencing depth of all windows.
|
1265
|
+
#' @param low.identity
|
1266
|
+
#' A logical indicating if the sequencing depth is to be estimated only
|
1267
|
+
#' with low-identity matches. By default, only high-identity matches are
|
1268
|
+
#' used.
|
1269
|
+
#'
|
1270
|
+
#' @return
|
1271
|
+
#' Returns a numeric vector of sequencing depths (in bp/bp).
|
1272
|
+
#'
|
1273
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1274
|
+
#'
|
1275
|
+
#' @export
|
1276
|
+
|
976
1277
|
enve.recplot2.seqdepth <- function
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
### passed, it returns the sequencing depth of all windows
|
983
|
-
low.identity=FALSE
|
984
|
-
### A logical indicating if the sequencing depth is to be estimated only
|
985
|
-
### with low-identity matches. By default, only high-identity matches are
|
986
|
-
### used.
|
987
|
-
){
|
1278
|
+
|
1279
|
+
(x,
|
1280
|
+
sel,
|
1281
|
+
low.identity=FALSE
|
1282
|
+
){
|
988
1283
|
if(!inherits(x, "enve.RecPlot2"))
|
989
1284
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
990
1285
|
if(low.identity){
|
@@ -997,21 +1292,30 @@ enve.recplot2.seqdepth <- function
|
|
997
1292
|
seqdepth.in <- pos.cnts.in/pos.binsize
|
998
1293
|
if(missing(sel)) return(seqdepth.in)
|
999
1294
|
return(seqdepth.in[sel])
|
1000
|
-
### Returns a numeric vector of sequencing depths (in bp/bp).
|
1001
1295
|
}
|
1002
1296
|
|
1297
|
+
#' Enveomics: Recruitment Plot (2) ANI Estimate
|
1298
|
+
#'
|
1299
|
+
#' Estimate the Average Nucleotide Identity from reads (ANIr) from a
|
1300
|
+
#' recruitment plot.
|
1301
|
+
#'
|
1302
|
+
#' @param x
|
1303
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1304
|
+
#' @param range
|
1305
|
+
#' Range of identities to be considered. By default, the full range
|
1306
|
+
#' is used (note that the upper boundary is \code{Inf} and not 100 because
|
1307
|
+
#' recruitment plots can also be built with bit-scores). To use only
|
1308
|
+
#' intra-population matches (with identities), use c(95,100). To use only
|
1309
|
+
#' inter-population values, use c(0,95).
|
1310
|
+
#'
|
1311
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1312
|
+
#'
|
1313
|
+
#' @export
|
1314
|
+
|
1003
1315
|
enve.recplot2.ANIr <- function
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
### `enve.RecPlot2` object.
|
1008
|
-
range=c(0,Inf)
|
1009
|
-
### Range of identities to be considered. By default, the full range
|
1010
|
-
### is used (note that the upper boundary is `Inf` and not 100 because
|
1011
|
-
### recruitment plots can also be built with bit-scores). To use only
|
1012
|
-
### intra-population matches (with identities), use c(95,100). To use only
|
1013
|
-
### inter-population values, use c(0,95).
|
1014
|
-
){
|
1316
|
+
(x,
|
1317
|
+
range=c(0,Inf)
|
1318
|
+
){
|
1015
1319
|
if(!inherits(x, "enve.RecPlot2"))
|
1016
1320
|
stop("'x' must inherit from class `enve.RecPlot2`")
|
1017
1321
|
id.b <- x$id.breaks
|
@@ -1023,35 +1327,82 @@ enve.recplot2.ANIr <- function
|
|
1023
1327
|
}
|
1024
1328
|
|
1025
1329
|
#==============> Define internal functions
|
1330
|
+
|
1331
|
+
#' Enveomics: Recruitment Plot (2) Internal Ancillary Function
|
1332
|
+
#'
|
1333
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2}}).
|
1334
|
+
#'
|
1335
|
+
#' @param x \code{\link{enve.RecPlot2}} object
|
1336
|
+
#' @param pos.breaks Position breaks
|
1337
|
+
#' @param id.breaks Identity breaks
|
1338
|
+
#' @param rec.idcol Identity column to use
|
1339
|
+
#'
|
1340
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1341
|
+
#' @author Kenji Gerhardt [aut]
|
1342
|
+
#'
|
1343
|
+
#' @export
|
1344
|
+
|
1026
1345
|
enve.recplot2.__counts <- function
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1346
|
+
(x, pos.breaks, id.breaks, rec.idcol) {
|
1347
|
+
rec2 <- x$rec
|
1348
|
+
verbose <- x$verbose
|
1349
|
+
|
1350
|
+
# get counts of how many occurrences of each genome pos.bin there are per read
|
1351
|
+
x.bins <- mapply(
|
1352
|
+
function(start, end) {
|
1353
|
+
list(rle(findInterval(start:end, pos.breaks, left.open = T)))
|
1354
|
+
}, rec2[, 1], rec2[, 2])
|
1355
|
+
|
1356
|
+
# find the single y bin for each row, replicates it at the correct places to
|
1357
|
+
# the number of distinct bins found in its row
|
1358
|
+
y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
|
1359
|
+
times = unlist(lapply(x.bins, function(a) length(a$lengths))))
|
1360
|
+
|
1361
|
+
# x.bins_counts is the number of occurrences of each bin a row contains,
|
1362
|
+
# per row, then unlisted
|
1363
|
+
x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
|
1364
|
+
|
1365
|
+
# these are the pos. in. genome bins that each count in x.bins_counts falls into
|
1366
|
+
x.bins <- unlist(lapply(x.bins, function(a) a$values))
|
1367
|
+
|
1368
|
+
# much more efficient counts implementation in R using lists instead of a matrix:
|
1369
|
+
counts <- lapply(
|
1370
|
+
1:(length(pos.breaks) - 1),
|
1371
|
+
function(col_len) rep(0, length(id.breaks) - 1))
|
1372
|
+
|
1373
|
+
# accesses the correct list in counts by x.bin, then
|
1374
|
+
# accesses the position in that row by y.bins and adds the new count
|
1375
|
+
for(i in 1:length(x.bins)){
|
1376
|
+
counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
|
1377
|
+
}
|
1378
|
+
|
1379
|
+
counts <- do.call(rbind, counts)
|
1380
|
+
return(counts)
|
1044
1381
|
}
|
1045
1382
|
|
1383
|
+
#' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
|
1384
|
+
#'
|
1385
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
|
1386
|
+
#'
|
1387
|
+
#' @param x \code{\link{enve.RecPlot2}} object
|
1388
|
+
#' @param comp Components
|
1389
|
+
#' @param do_crit Function estimating the criterion
|
1390
|
+
#' @param best Best solution thus far
|
1391
|
+
#' @param verbose If verbose
|
1392
|
+
#' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
|
1393
|
+
#'
|
1394
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1395
|
+
#'
|
1396
|
+
#' @export
|
1397
|
+
|
1046
1398
|
enve.recplot2.findPeaks.__emauto_one <- function
|
1047
|
-
|
1048
|
-
(x, comp, do_crit, best, verbose, ...){
|
1399
|
+
(x, comp, do_crit, best, verbose, ...){
|
1049
1400
|
peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
|
1050
1401
|
if(length(peaks)==0) return(best)
|
1051
1402
|
k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
|
1052
1403
|
crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
|
1053
1404
|
if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
|
1054
|
-
|
1405
|
+
ifelse(crit > best[['crit']], '*', ''), '\n')
|
1055
1406
|
if(crit > best[['crit']]){
|
1056
1407
|
best[['crit']] <- crit
|
1057
1408
|
best[['peaks']] <- peaks
|
@@ -1059,148 +1410,217 @@ enve.recplot2.findPeaks.__emauto_one <- function
|
|
1059
1410
|
best[['pstore']][[comp]] <- peaks
|
1060
1411
|
return(best)
|
1061
1412
|
}
|
1413
|
+
|
1414
|
+
#' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
|
1415
|
+
#'
|
1416
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
|
1417
|
+
#'
|
1418
|
+
#' @param x Vector of log-transformed sequencing depths
|
1419
|
+
#' @param theta Parameters list
|
1420
|
+
#'
|
1421
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1422
|
+
#'
|
1423
|
+
#' @export
|
1424
|
+
|
1062
1425
|
enve.recplot2.findPeaks.__em_e <- function
|
1063
|
-
|
1064
|
-
(x, theta){
|
1426
|
+
(x, theta){
|
1065
1427
|
components <- length(theta[['mu']])
|
1066
1428
|
product <- do.call(cbind,
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1429
|
+
lapply(1:components,
|
1430
|
+
function(i) dnorm(x, theta[['mu']][i],
|
1431
|
+
theta[['sd']][i])*theta[['alpha']][i]))
|
1070
1432
|
sum.of.components <- rowSums(product)
|
1071
1433
|
posterior <- product / sum.of.components
|
1072
|
-
|
1434
|
+
|
1073
1435
|
return(list(ll=sum(log(sum.of.components)), posterior=posterior))
|
1074
1436
|
}
|
1075
1437
|
|
1438
|
+
#' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
|
1439
|
+
#'
|
1440
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
|
1441
|
+
#'
|
1442
|
+
#' @param x Vector of log-transformed sequencing depths
|
1443
|
+
#' @param posterior Posterior probability
|
1444
|
+
#'
|
1445
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1446
|
+
#'
|
1447
|
+
#' @export
|
1448
|
+
|
1076
1449
|
enve.recplot2.findPeaks.__em_m <- function
|
1077
|
-
|
1078
|
-
(x, posterior){
|
1450
|
+
(x, posterior){
|
1079
1451
|
components <- ncol(posterior)
|
1080
1452
|
n <- colSums(posterior)
|
1081
1453
|
mu <- colSums(posterior * x) / n
|
1082
1454
|
sd <- sqrt( colSums(
|
1083
|
-
|
1455
|
+
posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
|
1084
1456
|
alpha <- n/length(x)
|
1085
1457
|
return(list(mu=mu, sd=sd, alpha=alpha))
|
1086
1458
|
}
|
1087
1459
|
|
1460
|
+
#' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
|
1461
|
+
#'
|
1462
|
+
#' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
|
1463
|
+
#'
|
1464
|
+
#' @param x \code{\link{enve.RecPlot2.Peak}} object
|
1465
|
+
#' @param mids Midpoints
|
1466
|
+
#' @param counts Counts
|
1467
|
+
#'
|
1468
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1469
|
+
#'
|
1470
|
+
#' @export
|
1471
|
+
|
1088
1472
|
enve.recplot2.__peakHist <- function
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
return(prob*x$n.hat/sum(prob))
|
1473
|
+
(x, mids, counts=TRUE){
|
1474
|
+
d.o <- x$param.hat
|
1475
|
+
if(length(x$log)==0) x$log <- FALSE
|
1476
|
+
if(x$log){
|
1477
|
+
d.o$x <- log(mids)
|
1478
|
+
}else{
|
1479
|
+
d.o$x <- mids
|
1480
|
+
}
|
1481
|
+
prob <- do.call(paste('d', x$dist, sep=''), d.o)
|
1482
|
+
if(!counts) return(prob)
|
1483
|
+
if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
|
1484
|
+
return(prob*x$n.hat/sum(prob))
|
1102
1485
|
}
|
1103
1486
|
|
1487
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
|
1488
|
+
#'
|
1489
|
+
#' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
|
1490
|
+
#'
|
1491
|
+
#' @param lsd1 Vector of log-transformed sequencing depths
|
1492
|
+
#' @param min.points Minimum number of points
|
1493
|
+
#' @param quant.est Quantile estimate
|
1494
|
+
#' @param mlv.opts List of options for \code{mlv}
|
1495
|
+
#' @param fitdist.opts List of options for \code{fitdist}
|
1496
|
+
#' @param with.skewness If skewed-normal should be used
|
1497
|
+
#' @param optim.rounds Maximum number of optimization rounds
|
1498
|
+
#' @param optim.epsilon Minimum difference considered negligible
|
1499
|
+
#' @param n.total Global number of windows
|
1500
|
+
#' @param merge.logdist Attempted \code{merge.logdist} parameter
|
1501
|
+
#' @param verbose If verbose
|
1502
|
+
#' @param log If log-transformed depths
|
1503
|
+
#'
|
1504
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1505
|
+
#'
|
1506
|
+
#' @export
|
1507
|
+
|
1104
1508
|
enve.recplot2.findPeaks.__mow_one <- function
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
if(
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
if(epsilon2 < optim.epsilon) break;
|
1142
|
-
}
|
1509
|
+
(lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
|
1510
|
+
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
|
1511
|
+
){
|
1512
|
+
dist <- ifelse(with.skewness, 'sn', 'norm');
|
1513
|
+
|
1514
|
+
# Find peak
|
1515
|
+
o <- mlv.opts; o$x = lsd1;
|
1516
|
+
mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
|
1517
|
+
if(verbose) cat('Anchoring at mode =',mode1,'\n')
|
1518
|
+
param.hat <- fitdist.opts$start; last.hat <- param.hat;
|
1519
|
+
lim <- NA;
|
1520
|
+
if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
|
1521
|
+
|
1522
|
+
# Refine peak parameters
|
1523
|
+
for(round in 1:optim.rounds){
|
1524
|
+
param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
|
1525
|
+
lim.o <- param.hat
|
1526
|
+
lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
|
1527
|
+
lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
|
1528
|
+
if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
|
1529
|
+
'and params =',as.numeric(param.hat),' \r')
|
1530
|
+
if(length(lsd1.pop) < min.points) break;
|
1531
|
+
o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
|
1532
|
+
last.last.hat <- last.hat
|
1533
|
+
last.hat <- param.hat
|
1534
|
+
param.hat <- as.list(do.call(fitdist, o)$estimate);
|
1535
|
+
if(any(is.na(param.hat))){
|
1536
|
+
if(round>1) param.hat <- last.hat;
|
1537
|
+
break;
|
1538
|
+
}
|
1539
|
+
if(round > 1){
|
1540
|
+
epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
|
1541
|
+
if(epsilon1 < optim.epsilon) break;
|
1542
|
+
if(round > 2){
|
1543
|
+
epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
|
1544
|
+
if(epsilon2 < optim.epsilon) break;
|
1143
1545
|
}
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1546
|
+
}
|
1547
|
+
}
|
1548
|
+
if(verbose) cat('\n')
|
1549
|
+
if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
|
1550
|
+
|
1551
|
+
# Mow distribution
|
1552
|
+
lsd2 <- c();
|
1553
|
+
lsd.pop <- c();
|
1554
|
+
n.hat <- length(lsd1.pop)/diff(quant.est)
|
1555
|
+
peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
|
1556
|
+
param.hat=param.hat, n.hat=n.hat, n.total=n.total,
|
1557
|
+
merge.logdist=merge.logdist, log=log)
|
1558
|
+
peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
|
1559
|
+
peak.cnt <- enve.recplot2.__peakHist(peak,
|
1560
|
+
(peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
|
1561
|
+
for(i in 2:length(peak.breaks)){
|
1562
|
+
values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
|
1563
|
+
n.exp <- peak.cnt[i-1]
|
1564
|
+
if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
|
1565
|
+
if(length(values)==0) next
|
1566
|
+
in.peak <- runif(length(values)) <= n.exp/length(values)
|
1567
|
+
lsd2 <- c(lsd2, values[!in.peak])
|
1568
|
+
lsd.pop <- c(lsd.pop, values[in.peak])
|
1569
|
+
}
|
1570
|
+
if(length(lsd.pop) < min.points) return(NULL)
|
1571
|
+
|
1572
|
+
# Return peak
|
1573
|
+
attr(peak, 'values') <- lsd.pop
|
1574
|
+
attr(peak, 'values.res') <- lsd2
|
1575
|
+
attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
|
1576
|
+
plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
|
1577
|
+
plot=FALSE)$counts)+1)/2
|
1578
|
+
mu <- tail(param.hat, n=1)
|
1579
|
+
attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
|
1580
|
+
if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
|
1581
|
+
'with expected n =',n.hat,'\n')
|
1582
|
+
return(peak)
|
1180
1583
|
}
|
1181
1584
|
|
1585
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
|
1586
|
+
#'
|
1587
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
|
1588
|
+
#'
|
1589
|
+
#' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
|
1590
|
+
#'
|
1591
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1592
|
+
#'
|
1593
|
+
#' @export
|
1594
|
+
|
1182
1595
|
enve.recplot2.findPeaks.__mower <- function
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
return(peaks)
|
1596
|
+
(peaks.opts){
|
1597
|
+
peaks <- list()
|
1598
|
+
while(length(peaks.opts$lsd1) > peaks.opts$min.points){
|
1599
|
+
peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
|
1600
|
+
if(is.null(peak)) break
|
1601
|
+
peaks[[ length(peaks)+1 ]] <- peak
|
1602
|
+
peaks.opts$lsd1 <- peak$values.res
|
1603
|
+
}
|
1604
|
+
return(peaks)
|
1193
1605
|
}
|
1194
1606
|
|
1607
|
+
#' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
|
1608
|
+
#'
|
1609
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
|
1610
|
+
#'
|
1611
|
+
#' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
|
1612
|
+
#' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
|
1613
|
+
#'
|
1614
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1615
|
+
#'
|
1616
|
+
#' @export
|
1195
1617
|
|
1196
1618
|
enve.recplot2.__whichClosestPeak <- function
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
return(which.min(dist))
|
1619
|
+
(peak, peaks){
|
1620
|
+
dist <- as.numeric(lapply(peaks,
|
1621
|
+
function(x)
|
1622
|
+
abs(log(x$param.hat[[ length(x$param.hat) ]] /
|
1623
|
+
peak$param.hat[[ length(peak$param.hat) ]] ))))
|
1624
|
+
dist[ dist==0 ] <- Inf
|
1625
|
+
return(which.min(dist))
|
1205
1626
|
}
|
1206
|
-
|