miga-base 0.4.3.0 → 0.5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -1,497 +1,647 @@
1
1
  #==============> Define S4 classes
2
- setClass("enve.RecPlot2",
3
- ### Enve-omics representation of Recruitment plots. This object can
4
- ### be produced by `enve.recplot2` and supports S4 method plot.
5
- representation(
6
- counts='matrix', ##<< Counts as a two-dimensional histogram.
7
- pos.counts.in='numeric', ##<< Counts of in-group hits per position bin.
8
- pos.counts.out='numeric', ##<< Counts of out-group hits per position bin.
9
- id.counts='numeric', ##<< Counts per ID bin.
10
- id.breaks='numeric', ##<< Breaks of identity bins.
11
- pos.breaks='numeric', ##<< Breaks of position bins.
12
- pos.names='character', ##<< Names of the position bins.
13
- seq.breaks='numeric', ##<< Breaks of input sequences.
14
- peaks='list', ##<< Peaks identified in the recplot.
15
- ### Limits of the subject sequences after concatenation.
16
- seq.names='character', ##<< Names of the subject sequences.
17
- id.metric='character', ##<< Metric used as 'identity'.
18
- id.ingroup='logical', ##<< Identity bins considered in-group.
19
- call='call') ##<< Call producing this object.
20
- ,package='enveomics.R'
21
- );
22
- setClass("enve.RecPlot2.Peak",
23
- ### Enve-omics representation of a peak in the sequencing depth histogram
24
- ### of a Recruitment plot (see `enve.recplot2.findPeaks`).
25
- representation(
26
- dist='character',
27
- ### Distribution of the peak. Currently supported: 'norm' (normal) and 'sn'
28
- ### (skew-normal).
29
- values='numeric',
30
- ### Sequencing depth values predicted to conform the peak.
31
- values.res='numeric',
32
- ### Sequencing depth values not explained by this or previously identified
33
- ### peaks.
34
- mode='numeric',
35
- ### Seed-value of mode anchoring the peak.
36
- param.hat='list',
37
- ### Parameters of the distribution. A list of two values if dist='norm' (sd
38
- ### and mean), or three values if dist='sn' (omega=scale, alpha=shape, and
39
- ### xi=location). Note that the "dispersion" parameter is always first and
40
- ### the "location" parameter is always last.
41
- n.hat='numeric',
42
- ### Number of bins estimated to be explained by this peak. This should
43
- ### ideally be equal to the length of `values`, but it's not and integer.
44
- n.total='numeric',
45
- ### Total number of bins from which the peak was extracted. I.e., total
46
- ### number of position bins with non-zero sequencing depth in the recruitment
47
- ### plot (regardless of peak count).
48
- err.res='numeric',
49
- ### Error left after adding the peak (mower) or log-likelihood (em or emauto).
50
- merge.logdist='numeric',
51
- ### Attempted `merge.logdist` parameter.
52
- seq.depth='numeric',
53
- ### Best estimate available for the sequencing depth of the peak (centrality).
54
- log='logical'
55
- ### Indicates if the estimation was performed in natural logarithm space
56
- ));
2
+
3
+ #' Enveomics: Recruitment Plot (2) - S4 Class
4
+ #'
5
+ #' Enve-omics representation of Recruitment plots. This object can
6
+ #' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
7
+ #'
8
+ #' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
9
+ #' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
10
+ #' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
11
+ #' @slot id.counts \code{(numeric)} Counts per ID bin.
12
+ #' @slot id.breaks \code{(numeric)} Breaks of identity bins.
13
+ #' @slot pos.breaks \code{(numeric)} Breaks of position bins.
14
+ #' @slot pos.names \code{(character)} Names of the position bins.
15
+ #' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
16
+ #' @slot peaks \code{(list)} Peaks identified in the recplot.
17
+ #' Limits of the subject sequences after concatenation.
18
+ #' @slot seq.names \code{(character}) Names of the subject sequences.
19
+ #' @slot id.metric \code{(character}) Metric used as 'identity'.
20
+ #' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
21
+ #' @slot call \code{(call)} Call producing this object.
22
+ #'
23
+ #' @author Luis M. Rodriguez-R [aut, cre]
24
+ #'
25
+ #' @exportClass
26
+
27
+ enve.RecPlot2 <- setClass("enve.RecPlot2",
28
+ representation(
29
+ # slots = list(
30
+ counts='matrix',
31
+ pos.counts.in='numeric',
32
+ pos.counts.out='numeric',
33
+ id.counts='numeric',
34
+ id.breaks='numeric',
35
+ pos.breaks='numeric',
36
+ pos.names='character',
37
+ seq.breaks='numeric',
38
+ peaks='list',
39
+ seq.names='character',
40
+ id.metric='character',
41
+ id.ingroup='logical',
42
+ call='call')
43
+ ,package='enveomics.R'
44
+ );
45
+
46
+ #' Enveomics: Recruitment Plot (2) Peak - S4 Class
47
+ #'
48
+ #' Enve-omics representation of a peak in the sequencing depth histogram
49
+ #' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
50
+ #'
51
+ #' @slot dist \code{(character)}
52
+ #' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
53
+ #' (skew-normal).
54
+ #' @slot values \code{(numeric)}
55
+ #' Sequencing depth values predicted to conform the peak.
56
+ #' @slot values.res \code{(numeric)}
57
+ #' Sequencing depth values not explained by this or previously identified
58
+ #' peaks.
59
+ #' @slot mode \code{(numeric)}
60
+ #' Seed-value of mode anchoring the peak.
61
+ #' @slot param.hat \code{(list)}
62
+ #' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
63
+ #' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
64
+ #' xi=location). Note that the "dispersion" parameter is always first and
65
+ #' the "location" parameter is always last.
66
+ #' @slot n.hat \code{(numeric)}
67
+ #' Number of bins estimated to be explained by this peak. This should
68
+ #' ideally be equal to the length of \code{values}, but it's not an integer.
69
+ #' @slot n.total \code{(numeric)}
70
+ #' Total number of bins from which the peak was extracted. I.e., total
71
+ #' number of position bins with non-zero sequencing depth in the recruitment
72
+ #' plot (regardless of peak count).
73
+ #' @slot err.res \code{(numeric)}
74
+ #' Error left after adding the peak (mower) or log-likelihood (em or emauto).
75
+ #' @slot merge.logdist \code{(numeric)}
76
+ #' Attempted \code{merge.logdist} parameter.
77
+ #' @slot seq.depth \code{(numeric)}
78
+ #' Best estimate available for the sequencing depth of the peak (centrality).
79
+ #' @slot log \code{(logical)}
80
+ #' Indicates if the estimation was performed in natural logarithm space.
81
+ #'
82
+ #' @author Luis M. Rodriguez-R [aut, cre]
83
+ #'
84
+ #' @exportClass
85
+
86
+ enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
87
+ representation(
88
+ # slots = list(
89
+ dist='character',
90
+ values='numeric',
91
+ values.res='numeric',
92
+ mode='numeric',
93
+ param.hat='list',
94
+ n.hat='numeric',
95
+ n.total='numeric',
96
+ err.res='numeric',
97
+ merge.logdist='numeric',
98
+ seq.depth='numeric',
99
+ log='logical'
100
+ ));
101
+
102
+ #' Attribute accessor
103
+ #'
104
+ #'
105
+ #' @param x Object
106
+ #' @param name Attribute name
57
107
  setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
108
+
109
+ #' Attribute accessor
110
+ #'
111
+ #'
112
+ #' @param x Object
113
+ #' @param name Attribute name
58
114
  setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
59
115
 
60
116
  #==============> Define S4 methods
117
+
118
+ #' Enveomics: Recruitment Plot (2)
119
+ #'
120
+ #' Plots an \code{\link{enve.RecPlot2}} object.
121
+ #'
122
+ #' @param x
123
+ #' \code{\link{enve.RecPlot2}} object to plot.
124
+ #' @param layout
125
+ #' Matrix indicating the position of the different panels in the layout,
126
+ #' where:
127
+ #' \itemize{
128
+ #' \item 0: Empty space
129
+ #' \item 1: Counts matrix
130
+ #' \item 2: position histogram (sequencing depth)
131
+ #' \item 3: identity histogram
132
+ #' \item 4: Populations histogram (histogram of sequencing depths)
133
+ #' \item 5: Color scale for the counts matrix (vertical)
134
+ #' \item 6: Color scale of the counts matrix (horizontal)
135
+ #' }
136
+ #' Only panels indicated here will be plotted. To plot only one panel
137
+ #' simply set this to the number of the panel you want to plot.
138
+ #' @param panel.fun
139
+ #' List of functions to be executed after drawing each panel. Use the
140
+ #' indices in \code{layout} (as characters) as keys. Functions for indices
141
+ #' missing in \code{layout} are ignored. For example, to add a vertical line
142
+ #' at the 3Mbp mark in both the position histogram and the counts matrix:
143
+ #' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
144
+ #' Note that the X-axis in both panels is in Mbp by default. To change
145
+ #' this behavior, set \code{pos.units} accordingly.
146
+ #' @param widths
147
+ #' Relative widths of the columns of \code{layout}.
148
+ #' @param heights
149
+ #' Relative heights of the rows of \code{layout}.
150
+ #' @param palette
151
+ #' Colors to be used to represent the counts matrix, sorted from no hits
152
+ #' to the maximum sequencing depth.
153
+ #' @param underlay.group
154
+ #' If TRUE, it indicates the in-group and out-group areas couloured based
155
+ #' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
156
+ #' @param peaks.col
157
+ #' If not \code{NA}, it attempts to represent peaks in the population histogram
158
+ #' in the specified color. Set to \code{NA} to avoid peak-finding.
159
+ #' @param use.peaks
160
+ #' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
161
+ #' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
162
+ #' @param id.lim
163
+ #' Limits of identities to represent.
164
+ #' @param pos.lim
165
+ #' Limits of positions to represent (in bp, regardless of \code{pos.units}).
166
+ #' @param pos.units
167
+ #' Units in which the positions should be represented (powers of 1,000
168
+ #' base pairs).
169
+ #' @param mar
170
+ #' Margins of the panels as a list, with the character representation of
171
+ #' the number of the panel as index (see \code{layout}).
172
+ #' @param pos.splines
173
+ #' Smoothing parameter for the splines in the position histogram. Zero
174
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
175
+ #' cross-validation.
176
+ #' @param id.splines
177
+ #' Smoothing parameter for the splines in the identity histogram. Zero
178
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
179
+ #' cross-validation.
180
+ #' @param in.lwd
181
+ #' Line width for the sequencing depth of in-group matches.
182
+ #' @param out.lwd
183
+ #' Line width for the sequencing depth of out-group matches.
184
+ #' @param id.lwd
185
+ #' Line width for the identity histogram.
186
+ #' @param in.col
187
+ #' Color associated to in-group matches.
188
+ #' @param out.col
189
+ #' Color associated to out-group matches.
190
+ #' @param id.col
191
+ #' Color for the identity histogram.
192
+ #' @param breaks.col
193
+ #' Color of the vertical lines indicating sequence breaks.
194
+ #' @param peaks.opts
195
+ #' Options passed to \code{\link{enve.recplot2.findPeaks}},
196
+ #' if \code{peaks.col} is not \code{NA}.
197
+ #' @param ...
198
+ #' Any other graphic parameters (currently ignored).
199
+ #'
200
+ #' @return
201
+ #' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
202
+ #' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
203
+ #' \code{layout} doesn't include 4, returns \code{NA}.
204
+ #'
205
+ #' @author Luis M. Rodriguez-R [aut, cre]
206
+ #'
207
+ #' @method plot enve.RecPlot2
208
+ #' @export
209
+
61
210
  plot.enve.RecPlot2 <- function
62
- ### Plots an `enve.RecPlot2` object.
63
- (x,
64
- ### `enve.RecPlot2` object to plot.
65
- layout=matrix(c(5,5,2,1,4,3), nrow=2),
66
- ### Matrix indicating the position of the different panels in the layout,
67
- ### where:
68
- ### 0: Empty space,
69
- ### 1: Counts matrix,
70
- ### 2: position histogram (sequencing depth),
71
- ### 3: identity histogram,
72
- ### 4: Populations histogram (histogram of sequencing depths),
73
- ### 5: Color scale for the counts matrix (vertical),
74
- ### 6: Color scale of the counts matrix (horizontal)
75
- ### Only panels indicated here will be plotted. To plot only one panel
76
- ### simply set this to the number of the panel you want to plot.
77
- panel.fun=list(),
78
- ### List of functions to be executed after drawing each panel. Use the
79
- ### indices in `layout` (as characters) as keys. Functions for indices
80
- ### missing in `layout` are ignored. For example, to add a vertical line
81
- ### at the 3Mbp mark in both the position histogram and the counts matrix:
82
- ### `list('1'=function() abline(v=3), '2'=function() abline(v=3))`.
83
- ### Note that the X-axis in both panels is in Mbp by default. To change
84
- ### this behavior, set `pos.units` accordingly.
85
- widths=c(1,7,2),
86
- ### Relative widths of the columns of `layout`.
87
- heights=c(1,2),
88
- ### Relative heights of the rows of `layout`.
89
- palette=grey((100:0)/100),
90
- ### Colors to be used to represent the counts matrix, sorted from no hits
91
- ### to the maximum sequencing depth.
92
- underlay.group=TRUE,
93
- ### If TRUE, it indicates the in-group and out-group areas couloured based
94
- ### on `in.col` and `out.col`. Requires support for semi-transparency.
95
- peaks.col='darkred',
96
- ### If not NA, it attempts to represent peaks in the population histogram
97
- ### in the specified color. Set to NA to avoid peak-finding.
98
- use.peaks,
99
- ### A list of `enve.RecPlot2.Peak` objects, as returned by
100
- ### `enve.recplot2.findPeaks`. If passed, `peaks.opts` is ignored.
101
- id.lim=range(x$id.breaks),
102
- ### Limits of identities to represent.
103
- pos.lim=range(x$pos.breaks),
104
- ### Limits of positions to represent (in bp, regardless of `pos.units`).
105
- pos.units=c('Mbp','Kbp','bp'),
106
- ### Units in which the positions should be represented (powers of 1,000
107
- ### base pairs).
108
- mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
109
- '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
110
- '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
111
- '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
112
- ### Margins of the panels as a list, with the character representation of
113
- ### the number of the panel as index (see `layout`).
114
- pos.splines=0,
115
- ### Smoothing parameter for the splines in the position histogram. Zero
116
- ### (0) for no splines. Use NULL to automatically detect by leave-one-out
117
- ### cross-validation.
118
- id.splines=1/2,
119
- ### Smoothing parameter for the splines in the identity histogram. Zero
120
- ### (0) for no splines. Use NULL to automatically detect by leave-one-out
121
- ### cross-validation.
122
- in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
123
- ### Line width for the sequencing depth of in-group matches.
124
- out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
125
- ### Line width for the sequencing depth of out-group matches.
126
- id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
127
- ### Line width for the identity histogram.
128
- in.col='darkblue',
129
- ### Color associated to in-group matches.
130
- out.col='lightblue',
131
- ### Color associated to out-group matches.
132
- id.col='black',
133
- ### Color for the identity histogram.
134
- breaks.col='#AAAAAA40',
135
- ### Color of the vertical lines indicating sequence breaks.
136
- peaks.opts=list(),
137
- ### Options passed to `enve.recplot2.findPeaks`, if `peaks.col` is not NA.
138
- ...
139
- ### Any other graphic parameters (currently ignored).
140
- ){
141
- pos.units <- match.arg(pos.units);
142
- pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
143
- pos.lim <- pos.lim/pos.factor;
144
- lmat <- layout;
145
- for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
146
-
147
- layout(lmat, widths=widths, heights=heights);
148
- ori.mar <- par('mar');
149
-
150
- # Essential vars
151
- counts <- x$counts
152
-
153
- id.ingroup <- x$id.ingroup
154
- id.counts <- x$id.counts
155
- id.breaks <- x$id.breaks
156
- id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
157
- id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
158
-
159
- pos.counts.in <- x$pos.counts.in
160
- pos.counts.out <- x$pos.counts.out
161
- pos.breaks <- x$pos.breaks/pos.factor
162
- pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
163
- pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
164
-
165
- seqdepth.in <- pos.counts.in/pos.binsize
166
- seqdepth.out <- pos.counts.out/pos.binsize
167
- seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
168
- seqdepth.out[seqdepth.out>0]))*c(1/2,2)
169
-
170
- if(underlay.group){
171
- in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
172
- list(maxColorValue=256, alpha=62)));
173
- out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
174
- list(maxColorValue=256, alpha=52)));
175
- }
176
-
177
- # [1] Counts matrix
178
- if(any(layout==1)){
179
- par(mar=mar[['1']]);
180
- plot(1, t='n', bty='l',
181
- xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
182
- xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
211
+ (x,
212
+ layout=matrix(c(5,5,2,1,4,3), nrow=2),
213
+ panel.fun=list(),
214
+ widths=c(1,7,2),
215
+ heights=c(1,2),
216
+ palette=grey((100:0)/100),
217
+ underlay.group=TRUE,
218
+ peaks.col='darkred',
219
+ use.peaks,
220
+ id.lim=range(x$id.breaks),
221
+ pos.lim=range(x$pos.breaks),
222
+ pos.units=c('Mbp','Kbp','bp'),
223
+ mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
224
+ '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
225
+ '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
226
+ '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
227
+ pos.splines=0,
228
+ id.splines=1/2,
229
+ in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
230
+ out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
231
+ id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
232
+ in.col='darkblue',
233
+ out.col='lightblue',
234
+ id.col='black',
235
+ breaks.col='#AAAAAA40',
236
+ peaks.opts=list(),
237
+ ...
238
+ ){
239
+ pos.units <- match.arg(pos.units);
240
+ pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
241
+ pos.lim <- pos.lim/pos.factor;
242
+ lmat <- layout;
243
+ for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
244
+
245
+ layout(lmat, widths=widths, heights=heights);
246
+ ori.mar <- par('mar');
247
+
248
+ # Essential vars
249
+ counts <- x$counts
250
+
251
+ id.ingroup <- x$id.ingroup
252
+ id.counts <- x$id.counts
253
+ id.breaks <- x$id.breaks
254
+ id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
255
+ id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
256
+
257
+ pos.counts.in <- x$pos.counts.in
258
+ pos.counts.out <- x$pos.counts.out
259
+ pos.breaks <- x$pos.breaks/pos.factor
260
+ pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
261
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
262
+
263
+ seqdepth.in <- pos.counts.in/pos.binsize
264
+ seqdepth.out <- pos.counts.out/pos.binsize
265
+ seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
266
+ seqdepth.out[seqdepth.out>0]))*c(1/2,2)
267
+
268
+ if(underlay.group){
269
+ in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
270
+ list(maxColorValue=256, alpha=62)));
271
+ out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
272
+ list(maxColorValue=256, alpha=52)));
273
+ }
274
+
275
+ # [1] Counts matrix
276
+ if(any(layout==1)){
277
+ par(mar=mar[['1']]);
278
+ plot(1, t='n', bty='l',
279
+ xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
280
+ xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
281
+ if(underlay.group){
282
+ rect(pos.lim[1], id.lim[1], pos.lim[2],
283
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
284
+ rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
285
+ id.lim[2], col=in.bg, border=NA);
286
+ }
287
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col);
288
+ image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
289
+ bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
290
+ length.out=1+length(palette)), add=TRUE);
291
+ if(exists('1',panel.fun)) panel.fun[['1']]();
292
+ }
293
+
294
+ # [2] Position histogram
295
+ if(any(layout==2)){
296
+ par(mar=mar[['2']]);
297
+ if(any(layout==1)){
298
+ xlab=''
299
+ xaxt='n'
300
+ }else{
301
+ xlab=paste('Position in genome (',pos.units,')',sep='')
302
+ xaxt='s'
303
+ }
304
+ plot(1,t='n', bty='l', log='y',
305
+ xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
306
+ ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
307
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col)
308
+ pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
309
+ pos.f <- rep(seqdepth.in,each=2)
310
+ lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
311
+ lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
312
+ if(is.null(pos.splines) || pos.splines > 0){
313
+ pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
314
+ spar=pos.splines)
315
+ lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
316
+ }
317
+ if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
318
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
319
+ seqdepth.lim[1]*3/2, col=out.col, border=NA);
320
+ if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
321
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
322
+ seqdepth.lim[1]*3/2, col=in.col, border=NA);
323
+ if(exists('2',panel.fun)) panel.fun[['2']]();
324
+ }
325
+
326
+ # [3] Identity histogram
327
+ if(any(layout==3)){
328
+ par(mar=mar[['3']]);
329
+ if(any(layout==1)){
330
+ ylab=''
331
+ yaxt='n'
332
+ }else{
333
+ ylab=x$id.metric
334
+ yaxt='s'
335
+ }
336
+ if(sum(id.counts>0) >= 4){
337
+ id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
338
+ plot(1,t='n', bty='l', log='x',
339
+ xlim=id.counts.range, xlab='bps per bin', xaxs='i',
340
+ ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
183
341
  if(underlay.group){
184
- rect(pos.lim[1], id.lim[1], pos.lim[2],
185
- min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
186
- rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
187
- id.lim[2], col=in.bg, border=NA);
188
- }
189
- abline(v=x$seq.breaks/pos.factor, col=breaks.col);
190
- image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
191
- bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
192
- length.out=1+length(palette)), add=TRUE);
193
- if(exists('1',panel.fun)) panel.fun[['1']]();
194
- }
195
-
196
- # [2] Position histogram
197
- if(any(layout==2)){
198
- par(mar=mar[['2']]);
199
- if(any(layout==1)){
200
- xlab=''
201
- xaxt='n'
202
- }else{
203
- xlab=paste('Position in genome (',pos.units,')',sep='')
204
- xaxt='s'
205
- }
206
- plot(1,t='n', bty='l', log='y',
207
- xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
208
- ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
209
- abline(v=x$seq.breaks/pos.factor, col=breaks.col)
210
- pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
211
- pos.f <- rep(seqdepth.in,each=2)
212
- lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
213
- lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
214
- if(is.null(pos.splines) || pos.splines > 0){
215
- pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
216
- spar=pos.splines)
217
- lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
342
+ rect(id.counts.range[1], id.lim[1], id.counts.range[2],
343
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
344
+ rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
345
+ id.counts.range[2], id.lim[2], col=in.bg, border=NA);
218
346
  }
219
- if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
220
- seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
221
- seqdepth.lim[1]*3/2, col=out.col, border=NA);
222
- if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
223
- seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
224
- seqdepth.lim[1]*3/2, col=in.col, border=NA);
225
- if(exists('2',panel.fun)) panel.fun[['2']]();
226
- }
227
-
228
- # [3] Identity histogram
229
- if(any(layout==3)){
230
- par(mar=mar[['3']]);
231
- if(any(layout==1)){
232
- ylab=''
233
- yaxt='n'
234
- }else{
235
- ylab=x$id.metric
236
- yaxt='s'
237
- }
238
- if(sum(id.counts>0) >= 4){
239
- id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
240
- plot(1,t='n', bty='l', log='x',
241
- xlim=id.counts.range, xlab='bps per bin', xaxs='i',
242
- ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
243
- if(underlay.group){
244
- rect(id.counts.range[1], id.lim[1], id.counts.range[2],
245
- min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
246
- rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
247
- id.counts.range[2], id.lim[2], col=in.bg, border=NA);
248
- }
249
- id.f <- rep(id.counts,each=2)
250
- id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
251
- lines(id.f, id.x, lwd=id.lwd, col=id.col);
252
- if(is.null(id.splines) || id.splines > 0){
253
- id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
254
- spar=id.splines)
255
- lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
256
- }
257
- }else{
258
- plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
259
- text(1,1,labels='Insufficient data', srt=90)
347
+ id.f <- rep(id.counts,each=2)
348
+ id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
349
+ lines(id.f, id.x, lwd=id.lwd, col=id.col);
350
+ if(is.null(id.splines) || id.splines > 0){
351
+ id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
352
+ spar=id.splines)
353
+ lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
260
354
  }
261
- if(exists('3',panel.fun)) panel.fun[['3']]();
262
- }
263
-
264
- # [4] Populations histogram
265
- peaks <- NA;
266
- if(any(layout==4)){
267
- par(mar=mar[['4']]);
268
- if(any(layout==2)){
269
- ylab=''
270
- yaxt='n'
355
+ }else{
356
+ plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
357
+ text(1,1,labels='Insufficient data', srt=90)
358
+ }
359
+ if(exists('3',panel.fun)) panel.fun[['3']]();
360
+ }
361
+
362
+ # [4] Populations histogram
363
+ peaks <- NA;
364
+ if(any(layout==4)){
365
+ par(mar=mar[['4']]);
366
+ if(any(layout==2)){
367
+ ylab=''
368
+ yaxt='n'
369
+ }else{
370
+ ylab='Sequencing depth (X)'
371
+ yaxt='s'
372
+ }
373
+ h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
374
+ length.out=200);
375
+ h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
376
+ h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
377
+ plot(1, t='n', log='y',
378
+ xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
379
+ xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
380
+ yaxt=yaxt)
381
+ y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
382
+ y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
383
+ lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
384
+ y.tmp.out, col=out.col)
385
+ polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
386
+ y.tmp.in, border=NA, col=in.col)
387
+ if(!is.na(peaks.col)){
388
+ o <- peaks.opts; o$x = x;
389
+ if(missing(use.peaks)){
390
+ peaks <- do.call(enve.recplot2.findPeaks, o)
271
391
  }else{
272
- ylab='Sequencing depth (X)'
273
- yaxt='s'
392
+ peaks <- use.peaks
274
393
  }
275
- h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
276
- length.out=200);
277
- h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
278
- h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
279
- plot(1, t='n', log='y',
280
- xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
281
- xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
282
- yaxt=yaxt)
283
- y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
284
- y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
285
- lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
286
- y.tmp.out, col=out.col)
287
- polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
288
- y.tmp.in, border=NA, col=in.col)
289
- if(!is.na(peaks.col)){
290
- o <- peaks.opts; o$x = x;
291
- if(missing(use.peaks)){
292
- peaks <- do.call(enve.recplot2.findPeaks, o)
293
- }else{
294
- peaks <- use.peaks
295
- }
296
- h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
297
- if(!is.null(peaks) & length(peaks)>0){
298
- pf <- h.mids*0;
299
- for(i in 1:length(peaks)){
300
- cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
301
- lines(cnt, h.mids, col='red');
302
- pf <- pf+cnt;
303
- axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
304
- }
305
- lines(pf, h.mids, col='red',lwd=1.5);
306
- dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
307
- frx <- signif(100*as.numeric(
308
- lapply(peaks,
309
- function(x) ifelse(length(x$values)==0, x$n.hat,
310
- length(x$values))/x$n.total)), 2)
311
- if(peaks[[1]]$err.res < 0){
312
- err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
313
- }else{
314
- err <- paste(', err:',
315
- signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
316
- }
317
- legend('topright', bty='n', cex=1/2,
318
- legend=paste(letters[1:length(peaks)],'. ',
319
- dpt,'X (', frx, '%', err, ')', sep=''))
320
- }
394
+ h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
395
+ if(!is.null(peaks) & length(peaks)>0){
396
+ pf <- h.mids*0;
397
+ for(i in 1:length(peaks)){
398
+ cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
399
+ lines(cnt, h.mids, col='red');
400
+ pf <- pf+cnt;
401
+ axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
402
+ }
403
+ lines(pf, h.mids, col='red',lwd=1.5);
404
+ dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
405
+ frx <- signif(100*as.numeric(
406
+ lapply(peaks,
407
+ function(x) ifelse(length(x$values)==0, x$n.hat,
408
+ length(x$values))/x$n.total)), 2)
409
+ if(peaks[[1]]$err.res < 0){
410
+ err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
411
+ }else{
412
+ err <- paste(', err:',
413
+ signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
414
+ }
415
+ legend('topright', bty='n', cex=1/2,
416
+ legend=paste(letters[1:length(peaks)],'. ',
417
+ dpt,'X (', frx, '%', err, ')', sep=''))
321
418
  }
322
- if(exists('4',panel.fun)) panel.fun[['4']]();
323
- }
324
-
325
- # [5] Color scale of the counts matrix (vertical)
326
- count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
327
- length.out=1+length(palette))
328
- if(any(layout==5)){
329
- par(mar=mar[['5']]);
330
- plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
331
- ylim=range(count.bins), yaxs='i', ylab='')
332
- rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
333
- border=NA)
334
- if(exists('5',panel.fun)) panel.fun[['5']]();
335
- }
336
-
337
- # [6] Color scale of the coutnts matrix (horizontal)
338
- if(any(layout==6)){
339
- par(mar=mar[['6']]);
340
- plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
341
- xlim=range(count.bins), xaxs='i',xlab='');
342
- rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
343
- border=NA);
344
- if(exists('6',panel.fun)) panel.fun[['6']]();
345
- }
346
-
347
- par(mar=ori.mar);
348
- return(peaks);
349
- ### Returns a list of `enve.RecPlot2.Peak` objects (see
350
- ### `enve.recplot2.findPeaks`). If `peaks.col`=NA or `layout` doesn't include
351
- ### 4, returns NA.
419
+ }
420
+ if(exists('4',panel.fun)) panel.fun[['4']]();
421
+ }
422
+
423
+ # [5] Color scale of the counts matrix (vertical)
424
+ count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
425
+ length.out=1+length(palette))
426
+ if(any(layout==5)){
427
+ par(mar=mar[['5']]);
428
+ plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
429
+ ylim=range(count.bins), yaxs='i', ylab='')
430
+ rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
431
+ border=NA)
432
+ if(exists('5',panel.fun)) panel.fun[['5']]();
433
+ }
434
+
435
+ # [6] Color scale of the coutnts matrix (horizontal)
436
+ if(any(layout==6)){
437
+ par(mar=mar[['6']]);
438
+ plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
439
+ xlim=range(count.bins), xaxs='i',xlab='');
440
+ rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
441
+ border=NA);
442
+ if(exists('6',panel.fun)) panel.fun[['6']]();
443
+ }
444
+
445
+ par(mar=ori.mar);
446
+ return(peaks);
352
447
  }
353
448
 
354
449
  #==============> Define core functions
450
+
451
+ #' Enveomics: Recruitment Plot (2)
452
+ #'
453
+ #' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
454
+ #' been previously executed.
455
+ #'
456
+ #' @param prefix
457
+ #' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
458
+ #' least the files .rec and .lim must exist with this prefix.
459
+ #' @param plot
460
+ #' Should the object be plotted?
461
+ #' @param pos.breaks
462
+ #' Breaks in the positions histogram. It can also be a vector of break
463
+ #' points, and values outside the range are ignored. If zero (0), it
464
+ #' uses the sequence breaks as defined in the .lim file, which means
465
+ #' one bin per contig (or gene, if the mapping is agains genes). Ignored
466
+ #' if `pos.breaks.tsv` is passed.
467
+ #' @param pos.breaks.tsv
468
+ #' Path to a list of (absolute) coordinates to use as position breaks.
469
+ #' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
470
+ #' must contain at least one column: coordinates of the break positions of
471
+ #' each position bin. If it has a second column, this is used as the name
472
+ #' of the position bin that ends at the given coordinate (the first row is
473
+ #' ignored). Any additional columns are currently ignored. If \code{NA},
474
+ #' position bins are determined by \code{pos.breaks}.
475
+ #' @param id.breaks
476
+ #' Breaks in the identity histogram. It can also be a vector of break
477
+ #' points, and values outside the range are ignored.
478
+ #' @param id.free.range
479
+ #' Indicates that the range should be freely set from the observed
480
+ #' values. Otherwise, 70-100\% is included in the identity histogram
481
+ #' (default).
482
+ #' @param id.metric
483
+ #' Metric of identity to be used (Y-axis). Corrected identity is only
484
+ #' supported if the original BLAST file included sequence lengths.
485
+ #' @param id.summary
486
+ #' Function summarizing the identity bins. Other recommended options
487
+ #' include: \code{median} to estimate the median instead of total bins, and
488
+ #' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
489
+ #' @param id.cutoff
490
+ #' Cutoff of identity metric above which the hits are considered
491
+ #' \code{in-group}. The 95\% identity corresponds to the expectation of
492
+ #' ANI<95\% within species.
493
+ #' @param threads
494
+ #' Number of threads to use.
495
+ #' @param verbose
496
+ #' Indicates if the function should report the advance.
497
+ #' @param ...
498
+ #' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
499
+ #'
500
+ #' @return Returns an object of class \code{\link{enve.RecPlot2}}.
501
+ #'
502
+ #' @author Luis M. Rodriguez-R [aut, cre]
503
+ #' @author Kenji Gerhardt [aut]
504
+ #'
505
+ #' @export
506
+
355
507
  enve.recplot2 <- function(
356
- ### Produces recruitment plots provided that BlastTab.catsbj.pl has
357
- ### been previously executed.
358
- prefix,
359
- ### Path to the prefix of the BlastTab.catsbj.pl output files. At
360
- ### least the files .rec and .lim must exist with this prefix.
361
- plot=TRUE,
362
- ### Should the object be plotted?
363
- pos.breaks=1e3,
364
- ### Breaks in the positions histogram. It can also be a vector of break
365
- ### points, and values outside the range are ignored. If zero (0), it
366
- ### uses the sequence breaks as defined in the .lim file, which means
367
- ### one bin per contig (or gene, if the mapping is agains genes). Ignored
368
- ### if `pos.breaks.tsv` is passed.
369
- pos.breaks.tsv=NA,
370
- ### Path to a list of (absolute) coordinates to use as position breaks.
371
- ### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
372
- ### must contain at least one column: coordinates of the break positions of
373
- ### each position bin. If it has a second column, this is used as the name
374
- ### of the position bin that ends at the given coordinate (the first row is
375
- ### ignored). Any additional columns are currently ignored. If NA,
376
- ### position bins are determined by `pos.breaks`.
377
- id.breaks=300,
378
- ### Breaks in the identity histogram. It can also be a vector of break
379
- ### points, and values outside the range are ignored.
380
- id.free.range=FALSE,
381
- ### Indicates that the range should be freely set from the observed
382
- ### values. Otherwise, 70-100% is included in the identity histogram
383
- ### (default).
384
- id.metric=c('identity', 'corrected identity', 'bit score'),
385
- ### Metric of identity to be used (Y-axis). Corrected identity is only
386
- ### supported if the original BLAST file included sequence lengths.
387
- id.summary=sum,
388
- ### Function summarizing the identity bins. Other recommended options
389
- ### include: `median` to estimate the median instead of total bins, and
390
- ### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
391
- id.cutoff=95,
392
- ### Cutoff of identity metric above which the hits are considered
393
- ### 'in-group'. The 95% identity corresponds to the expectation of
394
- ### ANI<95% within species.
395
- threads=2,
396
- ### Number of threads to use.
397
- verbose=TRUE,
398
- ### Indicates if the function should report the advance.
399
- ...
400
- ### Any additional parameters supported by `plot.enve.RecPlot2`.
401
- ){
508
+ prefix,
509
+ plot = TRUE,
510
+ pos.breaks = 1e3,
511
+ pos.breaks.tsv = NA,
512
+ id.breaks = 60,
513
+ id.free.range = FALSE,
514
+ id.metric = c('identity', 'corrected identity', 'bit score'),
515
+ id.summary = sum,
516
+ id.cutoff = 95,
517
+ threads = 2,
518
+ verbose = TRUE,
519
+ ...
520
+ ){
402
521
  # Settings
403
522
  id.metric <- match.arg(id.metric);
404
523
 
405
524
  #Read files
406
- if(verbose) cat("Reading files.\n")
407
- rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
408
- quote="");
409
- lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
410
- quote="", as.is=TRUE);
525
+ if (verbose) cat("Reading files.\n")
526
+ rec <- read.table(paste(prefix, ".rec", sep = ""),
527
+ sep = "\t", comment.char = "", quote = "");
528
+ lim <- read.table(paste(prefix, ".lim", sep = ""),
529
+ sep = "\t", comment.char = "", quote = "", as.is = TRUE);
411
530
 
412
531
  # Build matrix
413
- if(verbose) cat("Building counts matrix.\n")
414
- if(id.metric=="corrected identity" & ncol(rec)<6){
532
+ if (verbose) cat("Building counts matrix.\n")
533
+ if (id.metric == "corrected identity" & ncol(rec) < 6) {
415
534
  stop("Requesting corr. identity, but .rec file doesn't have 6th column")
416
535
  }
417
- rec.idcol <- ifelse(id.metric=="identity", 3,
418
- ifelse(id.metric=="corrected identity", 6, 4))
536
+ rec.idcol <- ifelse(id.metric == "identity", 3,
537
+ ifelse(id.metric == "corrected identity", 6, 4))
419
538
  pos.names <- as.character(NULL)
420
- if(!is.na(pos.breaks.tsv)){
421
- tmp <- read.table(pos.breaks.tsv, sep='\t', header=FALSE, as.is=TRUE)
422
- pos.breaks <- as.numeric(tmp[,1])
423
- if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
424
- }else if(length(pos.breaks)==1){
425
- if(pos.breaks>0){
426
- pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
427
- }else{
428
- pos.breaks <- c(lim[1,2], lim[,3])
429
- pos.names <- lim[,1]
539
+ if (!is.na(pos.breaks.tsv)){
540
+ tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
541
+ pos.breaks <- as.numeric(tmp[, 1])
542
+ if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
543
+ } else if (length(pos.breaks) == 1) {
544
+ if (pos.breaks > 0){
545
+ pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
546
+ } else {
547
+ pos.breaks <- c(lim[1, 2], lim[, 3])
548
+ pos.names <- lim[, 1]
430
549
  }
431
550
  }
432
- if(length(id.breaks)==1){
433
- id.range.v <- rec[,rec.idcol]
434
- if(!id.free.range) id.range.v <- c(id.range.v,70,100)
551
+ if (length(id.breaks) == 1) {
552
+ id.range.v <- rec[, rec.idcol]
553
+ if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
435
554
  id.range.v <- range(id.range.v)
436
- id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
555
+ id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
437
556
  }
438
557
 
439
558
  # Run in parallel
440
- if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
441
- cl <- makeCluster(threads)
442
- rec.l <- list()
443
- thl <- ceiling(nrow(rec)/threads)
444
- for(i in 0:(threads-1)){
445
- rec.l[[i+1]] <- list(
446
- rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
447
- verbose=ifelse(i==0, verbose, FALSE))
559
+ # If they already set threads to 1 manually, there's no point in launching
560
+ # clusters, it's just slower. Ditto for small files.
561
+ if (nrow(rec) < 75000 | threads == 1) {
562
+ # Coerces rec into a form that __counts is happy about
563
+ rec.l <- list()
564
+ rec.l[[1]] <- list(rec = rec, verbose = FALSE)
565
+
566
+ # No need to make a temporary variable, there's only one return for sure
567
+ # and it's not a list because it isn't coming back from an apply
568
+ counts <- enve.recplot2.__counts(
569
+ rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
570
+ rec.idcol = rec.idcol)
571
+ } else {
572
+ cl <- makeCluster(threads)
573
+ rec.l <- list()
574
+ thl <- ceiling(nrow(rec)/threads)
575
+ for (i in 0:(threads - 1)) {
576
+ rec.l[[i + 1]] <- list(
577
+ rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
578
+ verbose = ifelse(i == 0, verbose, FALSE))
579
+ }
580
+ counts.l <- clusterApply(
581
+ cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
582
+ id.breaks = id.breaks, rec.idcol = rec.idcol)
583
+ stopCluster(cl) # No spooky ghost clusters
584
+
585
+ counts <- counts.l[[1]]
586
+ for (i in 2:threads) counts <- counts + counts.l[[i]]
448
587
  }
449
- counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
450
- pos.breaks=pos.breaks, id.breaks=id.breaks,
451
- rec.idcol=rec.idcol)
452
- counts <- counts.l[[1]]
453
- if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
454
- stopCluster(cl)
455
588
 
456
589
  # Estimate 1D histograms
457
- if(verbose) cat("Building histograms.\n")
458
- id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
590
+ if (verbose) cat("Building histograms.\n")
591
+ id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
459
592
  id.ingroup <- (id.mids > id.cutoff);
460
593
  id.counts <- apply(counts, 2, id.summary);
461
- pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
462
- pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
594
+ pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
595
+ pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
463
596
 
464
597
  # Plot and return
465
598
  recplot <- new('enve.RecPlot2',
466
- counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
467
- pos.counts.out=pos.counts.out,
468
- id.breaks=id.breaks, pos.breaks=pos.breaks, pos.names=pos.names,
469
- seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
470
- id.ingroup=id.ingroup,id.metric=id.metric,
471
- call=match.call());
472
- if(plot){
473
- if(verbose) cat("Plotting.\n")
599
+ counts = counts, id.counts = id.counts,
600
+ pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
601
+ id.breaks = id.breaks, pos.breaks = pos.breaks,
602
+ pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
603
+ seq.names = lim[, 1], id.ingroup = id.ingroup,
604
+ id.metric = id.metric, call = match.call());
605
+ if (plot) {
606
+ if (verbose) cat("Plotting.\n")
474
607
  peaks <- plot(recplot, ...);
475
608
  attr(recplot, "peaks") <- peaks
476
609
  }
477
610
  return(recplot);
478
- ### Returns an object of class `enve.RecPlot2`.
479
611
  }
480
612
 
613
+ #' Enveomics: Recruitment Plot (2) Peak Finder
614
+ #'
615
+ #' Identifies peaks in the population histogram potentially indicating
616
+ #' sub-population mixtures.
617
+ #'
618
+ #' @param x
619
+ #' An \code{\link{enve.RecPlot2}} object.
620
+ #' @param method
621
+ #' Peak-finder method. This should be one of:
622
+ #' \itemize{
623
+ #' \item \strong{emauto}
624
+ #' (Expectation-Maximization with auto-selection of components)
625
+ #' \item \strong{em}
626
+ #' (Expectation-Maximization)
627
+ #' \item \strong{mower}
628
+ #' (Custom distribution-mowing method)
629
+ #' }
630
+ #' @param ...
631
+ #' Any additional parameters supported by
632
+ #' \code{\link{enve.recplot2.findPeaks}}.
633
+ #'
634
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
635
+ #'
636
+ #' @author Luis M. Rodriguez-R [aut, cre]
637
+ #'
638
+ #' export
639
+
481
640
  enve.recplot2.findPeaks <- function(
482
- ### Identifies peaks in the population histogram potentially indicating
483
- ### sub-population mixtures
484
- x,
485
- ### An `enve.RecPlot2` object.
486
- method="emauto",
487
- ### Peak-finder method. This should be one of:
488
- ### "emauto" (Expectation-Maximization with auto-selection of components),
489
- ### "em" (Expectation-Maximization),
490
- ### "mower" (Custom distribution-mowing method).
491
- ...
492
- ### Any additional parameters supported by
493
- ### `enve.recplot2.findPeaks.<method>`.
494
- ){
641
+ x,
642
+ method="emauto",
643
+ ...
644
+ ){
495
645
  if(method == "emauto"){
496
646
  peaks <- enve.recplot2.findPeaks.emauto(x, ...)
497
647
  }else if(method == "em"){
@@ -502,29 +652,44 @@ enve.recplot2.findPeaks <- function(
502
652
  stop("Invalid peak-finder method ", method)
503
653
  }
504
654
  return(peaks)
505
- ### Returns a list of `enve.RecPlot2.Peak` objects.
506
655
  }
507
656
 
657
+ #' Enveomics: Recruitment Plot (2) Emauto Peak Finder
658
+ #'
659
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
660
+ #' Model Expectation Maximization (GMM-EM) method with number of components
661
+ #' automatically detected.
662
+ #'
663
+ #' @param x
664
+ #' An \code{\link{enve.RecPlot2}} object.
665
+ #' @param components
666
+ #' A vector of number of components to evaluate.
667
+ #' @param criterion
668
+ #' Criterion to use for components selection. Must be one of:
669
+ #' \code{aic} (Akaike Information Criterion),
670
+ #' \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).
671
+ #' @param merge.tol
672
+ #' When attempting to merge peaks with very similar sequencing depth, use
673
+ #' this number of significant digits (in log-scale).
674
+ #' @param verbose
675
+ #' Display (mostly debugging) information.
676
+ #' @param ...
677
+ #' Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.
678
+ #'
679
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
680
+ #'
681
+ #' @author Luis M. Rodriguez-R [aut, cre]
682
+ #'
683
+ #' @export
684
+
508
685
  enve.recplot2.findPeaks.emauto <- function(
509
- ### Identifies peaks in the population histogram using a Gaussian Mixture
510
- ### Model Expectation Maximization (GMM-EM) method with number of components
511
- ### automatically detected.
512
- x,
513
- ### An `enve.RecPlot2` object.
514
- components=seq(1,10),
515
- ### A vector of number of components to evaluate.
516
- criterion='aic',
517
- ### Criterion to use for components selection. Must be one of:
518
- ### 'aic' (Akaike Information Criterion),
519
- ### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
520
- merge.tol=2L,
521
- ### When attempting to merge peaks with very similar sequencing depth, use
522
- ### this number of significant digits (in log-scale).
523
- verbose=FALSE,
524
- ### Display (mostly debugging) information.
525
- ...
526
- ### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
527
- ){
686
+ x,
687
+ components=seq(1,10),
688
+ criterion='aic',
689
+ merge.tol=2L,
690
+ verbose=FALSE,
691
+ ...
692
+ ){
528
693
  best <- list(crit=0, pstore=list())
529
694
  if(criterion == 'aic'){
530
695
  do_crit <- function(ll, k, n) 2*k - 2*ll
@@ -536,55 +701,72 @@ enve.recplot2.findPeaks.emauto <- function(
536
701
  for(comp in components){
537
702
  if(verbose) cat('Testing:',comp,'\n')
538
703
  best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
539
- verbose, ...)
704
+ verbose, ...)
540
705
  }
541
706
  if(length(best[['peaks']])==0) return(list())
542
707
 
543
708
  seqdepths.r <- signif(log(sapply(best[['peaks']],
544
- function(x) x$seq.depth)), merge.tol)
709
+ function(x) x$seq.depth)), merge.tol)
545
710
  distinct <- length(unique(seqdepths.r))
546
711
  if(distinct < length(best[['peaks']])){
547
712
  if(verbose) cat('Attempting merge to', distinct, 'components\n')
548
713
  init <- apply(sapply(best[['peaks']],
549
- function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
714
+ function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
550
715
  init <- init[!duplicated(seqdepths.r),]
551
716
  init <- list(mu=init[,'mean'], sd=init[,'sd'],
552
- alpha=init[,'alpha']/sum(init[,'alpha']))
717
+ alpha=init[,'alpha']/sum(init[,'alpha']))
553
718
  best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
554
- verbose, ...)
719
+ verbose, ...)
555
720
  }
556
721
  return(best[['peaks']])
557
- ### Returns a list of `enve.RecPlot2.Peak` objects.
558
722
  }
559
723
 
724
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder
725
+ #'
726
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
727
+ #' Model Expectation Maximization (GMM-EM) method.
728
+ #'
729
+ #' @param x
730
+ #' An \code{\link{enve.RecPlot2}} object.
731
+ #' @param max.iter
732
+ #' Maximum number of EM iterations.
733
+ #' @param ll.diff.res
734
+ #' Maximum Log-Likelihood difference to be considered as convergent.
735
+ #' @param components
736
+ #' Number of distributions assumed in the mixture.
737
+ #' @param rm.top
738
+ #' Top-values to remove before finding peaks, as a quantile probability.
739
+ #' This step is useful to remove highly conserved regions, but can be
740
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
741
+ #' \strong{after} removing zero-coverage windows.
742
+ #' @param verbose
743
+ #' Display (mostly debugging) information.
744
+ #' @param init
745
+ #' Initialization parameters. By default, these are derived from k-means
746
+ #' clustering. A named list with vectors for \code{mu}, \code{sd}, and
747
+ #' \code{alpha}, each of length \code{components}.
748
+ #' @param log
749
+ #' Logical value indicating if the estimations should be performed in
750
+ #' natural logarithm units. Do not change unless you know what you're
751
+ #' doing.
752
+ #'
753
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
754
+ #'
755
+ #' @author Luis M. Rodriguez-R [aut, cre]
756
+ #'
757
+ #' @export
758
+
560
759
  enve.recplot2.findPeaks.em <- function(
561
- ### Identifies peaks in the population histogram using a Gaussian Mixture
562
- ### Model Expectation Maximization (GMM-EM) method.
563
- x,
564
- ### An `enve.RecPlot2` object.
565
- max.iter=1000,
566
- ### Maximum number of EM iterations.
567
- ll.diff.res=1e-8,
568
- ### Maximum Log-Likelihood difference to be considered as convergent.
569
- components=2,
570
- ### Number of distributions assumed in the mixture.
571
- rm.top=0.05,
572
- ### Top-values to remove before finding peaks, as a quantile probability.
573
- ### This step is useful to remove highly conserved regions, but can be
574
- ### turned off by setting rm.top=0. The quantile is determined *after*
575
- ### removing zero-coverage windows.
576
- verbose=FALSE,
577
- ### Display (mostly debugging) information.
578
- init,
579
- ### Initialization parameters. By default, these are derived from k-means
580
- ### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
581
- ### of length `components`.
582
- log=TRUE
583
- ### Logical value indicating if the estimations should be performed in
584
- ### natural logarithm units. Do not change unless you know what you're
585
- ### doing.
586
- ){
587
-
760
+ x,
761
+ max.iter=1000,
762
+ ll.diff.res=1e-8,
763
+ components=2,
764
+ rm.top=0.05,
765
+ verbose=FALSE,
766
+ init,
767
+ log=TRUE
768
+ ){
769
+
588
770
  # Essential vars
589
771
  pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
590
772
  lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
@@ -603,7 +785,7 @@ enve.recplot2.findPeaks.em <- function(
603
785
  m.step <- init
604
786
  ll <- c()
605
787
  cur.ll <- -Inf
606
-
788
+
607
789
  for(i in 1:max.iter){
608
790
  # 2/3. EM
609
791
  e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
@@ -621,224 +803,297 @@ enve.recplot2.findPeaks.em <- function(
621
803
  for(i in 1:components){
622
804
  n.hat <- m.step[['alpha']][i]*length(lsd1)
623
805
  peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
624
- values.res=0, mode=m.step[['mu']][i],
625
- param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
626
- n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
627
- merge.logdist=as.numeric(), log=log,
628
- seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
806
+ values.res=0, mode=m.step[['mu']][i],
807
+ param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
808
+ n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
809
+ merge.logdist=as.numeric(), log=log,
810
+ seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
629
811
  }
630
812
  return(peaks)
631
- ### Returns a list of `enve.RecPlot2.Peak` objects.
632
813
  }
633
814
 
815
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder
816
+ #'
817
+ #' Identifies peaks in the population histogram potentially indicating
818
+ #' sub-population mixtures, using a custom distribution-mowing method.
819
+ #'
820
+ #' @param x
821
+ #' An \code{\link{enve.RecPlot2}} object.
822
+ #' @param min.points
823
+ #' Minimum number of points in the quantile-estimation-range
824
+ #' \code{(quant.est)} to estimate a peak.
825
+ #' @param quant.est
826
+ #' Range of quantiles to be used in the estimation of a peak's
827
+ #' parameters.
828
+ #' @param mlv.opts
829
+ #' Ignored. For backwards compatibility.
830
+ #' @param fitdist.opts.sn
831
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
832
+ #' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
833
+ #' ammended with \code{xi=estimated} mode for each peak.
834
+ #' @param fitdist.opts.norm
835
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
836
+ #' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
837
+ #' ammended with \code{mean=estimated} mode for each peak.
838
+ #' @param rm.top
839
+ #' Top-values to remove before finding peaks, as a quantile probability.
840
+ #' This step is useful to remove highly conserved regions, but can be
841
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
842
+ #' \strong{after} removing zero-coverage windows.
843
+ #' @param with.skewness
844
+ #' Allow skewness correction of the peaks. Typically, the
845
+ #' sequencing-depth distribution for a single peak is left-skewed, due
846
+ #' partly (but not exclusively) to fragmentation and mapping sensitivity.
847
+ #' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
848
+ #' alternative solution for the first problem (fragmentation) called
849
+ #' "tail distribution".
850
+ #' @param optim.rounds
851
+ #' Maximum rounds of peak optimization.
852
+ #' @param optim.epsilon
853
+ #' Trace change at which optimization stops (unless \code{optim.rounds} is
854
+ #' reached first). The trace change is estimated as the sum of square
855
+ #' differences between parameters in one round and those from two rounds
856
+ #' earlier (to avoid infinite loops from approximation).
857
+ #' @param merge.logdist
858
+ #' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
859
+ #' to attempt merging. The default of ~0.22 corresponds to a maximum
860
+ #' difference of 25\%.
861
+ #' @param verbose
862
+ #' Display (mostly debugging) information.
863
+ #' @param log
864
+ #' Logical value indicating if the estimations should be performed in
865
+ #' natural logarithm units. Do not change unless you know what you're
866
+ #' doing.
867
+ #'
868
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
869
+ #'
870
+ #' @author Luis M. Rodriguez-R [aut, cre]
871
+ #'
872
+ #' @export
873
+
634
874
  enve.recplot2.findPeaks.mower <- function(
635
- ### Identifies peaks in the population histogram potentially indicating
636
- ### sub-population mixtures, using a custom distribution-mowing method.
637
- x,
638
- ### An `enve.RecPlot2` object.
639
- min.points=10,
640
- ### Minimum number of points in the quantile-estimation-range
641
- ### (`quant.est`) to estimate a peak.
642
- quant.est=c(0.002, 0.998),
643
- ### Range of quantiles to be used in the estimation of a peak's
644
- ### parameters.
645
- mlv.opts=list(method='parzen'),
646
- ### Ignored. For backwards compatibility.
647
- fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
648
- start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
649
- ### Options passed to `fitdist` to estimate the standard deviation if
650
- ### with.skewness=TRUE. Note that the `start` parameter will be ammended
651
- ### with xi=estimated mode for each peak.
652
- fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
653
- start=list(sd=1), lower=c(0, -Inf)),
654
- ### Options passed to `fitdist` to estimate the standard deviation if
655
- ### with.skewness=FALSE. Note that the `start` parameter will be ammended
656
- ### with mean=estimated mode for each peak.
657
- rm.top=0.05,
658
- ### Top-values to remove before finding peaks, as a quantile probability.
659
- ### This step is useful to remove highly conserved regions, but can be
660
- ### turned off by setting rm.top=0. The quantile is determined *after*
661
- ### removing zero-coverage windows.
662
- with.skewness=TRUE,
663
- ### Allow skewness correction of the peaks. Typically, the
664
- ### sequencing-depth distribution for a single peak is left-skewed, due
665
- ### partly (but not exclusively) to fragmentation and mapping sensitivity.
666
- ### See Lindner et al 2013, Bioinformatics 29(10):1260-7 for an
667
- ### alternative solution for the first problem (fragmentation) called
668
- ### "tail distribution".
669
- optim.rounds=200,
670
- ### Maximum rounds of peak optimization.
671
- optim.epsilon=1e-4,
672
- ### Trace change at which optimization stops (unless `optim.rounds` is
673
- ### reached first). The trace change is estimated as the sum of square
674
- ### differences between parameters in one round and those from two rounds
675
- ### earlier (to avoid infinite loops from approximation).
676
- merge.logdist=log(1.75),
677
- ### Maximum value of |log-ratio| between centrality parameters in peaks to
678
- ### attempt merging. The default of ~0.22 corresponds to a maximum
679
- ### difference of 25%.
680
- verbose=FALSE,
681
- ### Display (mostly debugging) information.
682
- log=TRUE
683
- ### Logical value indicating if the estimations should be performed in
684
- ### natural logarithm units. Do not change unless you know what you're
685
- ### doing.
686
- ){
687
-
688
- # Essential vars
689
- pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
690
- seqdepth.in <- x$pos.counts.in/pos.binsize;
691
- lsd1 <- seqdepth.in[seqdepth.in>0];
692
- lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
693
- if(log) lsd1 <- log(lsd1)
694
- if(with.skewness){
695
- fitdist.opts <- fitdist.opts.sn
696
- }else{
697
- fitdist.opts <- fitdist.opts.norm
698
- }
699
- peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
700
- mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
701
- optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
702
- n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
703
-
704
- # Find seed peaks
705
- if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
706
- peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
707
-
708
- # Merge overlapping peaks
709
- if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
710
- merged <- (length(peaks)>1)
711
- while(merged){
712
- merged <- FALSE
713
- ignore <- c()
714
- peaks2 <- list();
715
- for(i in 1:length(peaks)){
716
- if(i %in% ignore) next
717
- p <- peaks[[ i ]]
718
- j <- enve.recplot2.__whichClosestPeak(p, peaks)
719
- p2 <- peaks[[ j ]]
720
- dst.a <- p$param.hat[[ length(p$param.hat) ]]
721
- dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
722
- if( abs(log(dst.a/dst.b)) < merge.logdist ){
723
- if(verbose) cat('==> Attempting a merge at',
724
- p$param.hat[[ length(p$param.hat) ]],'&',
725
- p2$param.hat[[ length(p2$param.hat) ]],'X\n');
726
- peaks.opts$lsd1 <- c(p$values, p2$values)
727
- p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
728
- if(length(p.new)==1){
729
- peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
730
- ignore <- c(ignore, j)
731
- merged <- TRUE
732
- }
733
- }
734
- if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
875
+ x,
876
+ min.points=10,
877
+ quant.est=c(0.002, 0.998),
878
+ mlv.opts=list(method='parzen'),
879
+ fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
880
+ start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
881
+ fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
882
+ start=list(sd=1), lower=c(0, -Inf)),
883
+ rm.top=0.05,
884
+ with.skewness=TRUE,
885
+ optim.rounds=200,
886
+ optim.epsilon=1e-4,
887
+ merge.logdist=log(1.75),
888
+ verbose=FALSE,
889
+ log=TRUE
890
+ ){
891
+
892
+ # Essential vars
893
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
894
+ seqdepth.in <- x$pos.counts.in/pos.binsize;
895
+ lsd1 <- seqdepth.in[seqdepth.in>0];
896
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
897
+ if(log) lsd1 <- log(lsd1)
898
+ if(with.skewness){
899
+ fitdist.opts <- fitdist.opts.sn
900
+ }else{
901
+ fitdist.opts <- fitdist.opts.norm
902
+ }
903
+ peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
904
+ mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
905
+ optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
906
+ n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
907
+
908
+ # Find seed peaks
909
+ if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
910
+ peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
911
+
912
+ # Merge overlapping peaks
913
+ if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
914
+ merged <- (length(peaks)>1)
915
+ while(merged){
916
+ merged <- FALSE
917
+ ignore <- c()
918
+ peaks2 <- list();
919
+ for(i in 1:length(peaks)){
920
+ if(i %in% ignore) next
921
+ p <- peaks[[ i ]]
922
+ j <- enve.recplot2.__whichClosestPeak(p, peaks)
923
+ p2 <- peaks[[ j ]]
924
+ dst.a <- p$param.hat[[ length(p$param.hat) ]]
925
+ dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
926
+ if( abs(log(dst.a/dst.b)) < merge.logdist ){
927
+ if(verbose) cat('==> Attempting a merge at',
928
+ p$param.hat[[ length(p$param.hat) ]],'&',
929
+ p2$param.hat[[ length(p2$param.hat) ]],'X\n');
930
+ peaks.opts$lsd1 <- c(p$values, p2$values)
931
+ p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
932
+ if(length(p.new)==1){
933
+ peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
934
+ ignore <- c(ignore, j)
935
+ merged <- TRUE
936
+ }
735
937
  }
736
- peaks <- peaks2
737
- if(length(peaks)==1) break
738
- }
739
-
740
- if(verbose) cat('Found',length(peaks),'peak(s)\n')
741
- return(peaks);
742
- ### Returns a list of `enve.RecPlot2.Peak` objects.
938
+ if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
939
+ }
940
+ peaks <- peaks2
941
+ if(length(peaks)==1) break
942
+ }
943
+
944
+ if(verbose) cat('Found',length(peaks),'peak(s)\n')
945
+ return(peaks);
743
946
  }
744
947
 
745
948
  #==============> Define utils
949
+
950
+ #' Enveomics: Recruitment Plot (2) Core Peak Finder
951
+ #'
952
+ #' Finds the peak in a list of peaks that is most likely to represent the
953
+ #' "core genome" of a population.
954
+ #'
955
+ #' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
956
+ #'
957
+ #' @author Luis M. Rodriguez-R [aut, cre]
958
+ #'
959
+ #' @export
960
+
746
961
  enve.recplot2.corePeak <- function
747
- ### Finds the peak in a list of peaks that is most likely to represent the
748
- ### "core genome" of a population.
749
- (x
750
- ### `list` of `enve.RecPlot2.Peak` objects.
751
- ){
752
- # Find the peak with maximum depth (centrality)
753
- maxPeak <- x[[
754
- which.max(as.numeric(lapply(x,
755
- function(y) y$param.hat[[ length(y$param.hat) ]])))
756
- ]]
757
- # If a "larger" peak (a peak explaining more bins of the genome) is within
758
- # the default "merge.logdist" distance, take that one instead.
759
- corePeak <- maxPeak
760
- for(p in x){
761
- p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
762
- corePeak.len <- ifelse(
763
- length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
764
- sz.d <- log(p.len/corePeak.len)
765
- if(is.nan(sz.d) || sz.d < 0) next
766
- sq.d.a <- as.numeric(tail(p$param.hat, n=1))
767
- sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
768
- if(p$log) sq.d.a <- exp(sq.d.a)
769
- if(corePeak$log) sq.d.b <- exp(sq.d.b)
770
- if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
771
- }
772
- return(corePeak)
962
+ (x
963
+ ){
964
+ # Find the peak with maximum depth (centrality)
965
+ maxPeak <- x[[
966
+ which.max(as.numeric(lapply(x,
967
+ function(y) y$param.hat[[ length(y$param.hat) ]])))
968
+ ]]
969
+ # If a "larger" peak (a peak explaining more bins of the genome) is within
970
+ # the default "merge.logdist" distance, take that one instead.
971
+ corePeak <- maxPeak
972
+ for(p in x){
973
+ p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
974
+ corePeak.len <- ifelse(
975
+ length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
976
+ sz.d <- log(p.len/corePeak.len)
977
+ if(is.nan(sz.d) || sz.d < 0) next
978
+ sq.d.a <- as.numeric(tail(p$param.hat, n=1))
979
+ sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
980
+ if(p$log) sq.d.a <- exp(sq.d.a)
981
+ if(corePeak$log) sq.d.b <- exp(sq.d.b)
982
+ if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
983
+ }
984
+ return(corePeak)
773
985
  }
774
986
 
987
+ #' Enveomics: Recruitment Plot (2) Change Cutoff
988
+ #'
989
+ #' Change the intra-species cutoff of an existing recruitment plot.
990
+ #'
991
+ #' @param rp
992
+ #' \code{\link{enve.RecPlot2}} object.
993
+ #' @param new.cutoff
994
+ #' New cutoff to use.
995
+ #'
996
+ #' @author Luis M. Rodriguez-R [aut, cre]
997
+ #'
998
+ #' @export
999
+
775
1000
  enve.recplot2.changeCutoff <- function
776
- ### Change the intra-species cutoff of an existing recruitment plot.
777
- (rp,
778
- ### enve.RecPlot2 object.
779
- new.cutoff=98
780
- ### New cutoff to use.
781
- ){
782
- # Re-calculate vectors
783
- id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
784
- id.ingroup <- (id.mids > new.cutoff)
785
- pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
786
- pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
787
- # Update object
788
- attr(rp, "id.ingroup") <- id.ingroup
789
- attr(rp, "pos.counts.in") <- pos.counts.in
790
- attr(rp, "pos.counts.out") <- pos.counts.out
791
- attr(rp, "call") <- match.call()
792
- return(rp)
1001
+ (rp,
1002
+ new.cutoff=98
1003
+ ){
1004
+ # Re-calculate vectors
1005
+ id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
1006
+ id.ingroup <- (id.mids > new.cutoff)
1007
+ pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
1008
+ pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
1009
+ # Update object
1010
+ attr(rp, "id.ingroup") <- id.ingroup
1011
+ attr(rp, "pos.counts.in") <- pos.counts.in
1012
+ attr(rp, "pos.counts.out") <- pos.counts.out
1013
+ attr(rp, "call") <- match.call()
1014
+ return(rp)
793
1015
  }
794
1016
 
1017
+ #' Enveomics: Recruitment Plot (2) Window Depth Threshold
1018
+ #'
1019
+ #' Identifies the threshold below which windows should be identified as
1020
+ #' variable or absent.
1021
+ #'
1022
+ #' @param rp
1023
+ #' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
1024
+ #' @param peak
1025
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1026
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1027
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1028
+ #' @param lower.tail
1029
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1030
+ #' sequencing depth.
1031
+ #' @param significance
1032
+ #' Significance threshold (alpha) to select windows.
1033
+ #'
1034
+ #' @return
1035
+ #' Returns a float. The units are depth if the peaks were estimated in
1036
+ #' linear scale, or log-depth otherwise (\code{peak$log}).
1037
+ #'
1038
+ #' @author Luis M. Rodriguez-R [aut, cre]
1039
+ #'
1040
+ #' @export
1041
+
795
1042
  enve.recplot2.windowDepthThreshold <- function
796
- ### Identifies the threshold below which windows should be identified as
797
- ### variable or absent.
798
- (rp,
799
- ### Recruitment plot, an `enve.RecPlot2` object.
800
- peak,
801
- ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
802
- ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
803
- ### used (see `enve.recplot2.corePeak`).
804
- lower.tail=TRUE,
805
- ### If FALSE, it returns windows significantly above the peak in
806
- ### sequencing depth.
807
- significance=0.05
808
- ### Significance threshold (alpha) to select windows.
809
- ){
810
- if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
811
- par <- peak$param.hat
812
- par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
813
- thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
814
- if(peak$log) thr <- exp(thr)
815
-
816
- return(thr)
817
- ### Returns a float. The units are depth if the peaks were estimated in
818
- ### linear scale, or log-depth otherwise (`peak$log`).
1043
+ (rp,
1044
+ peak,
1045
+ lower.tail=TRUE,
1046
+ significance=0.05
1047
+ ){
1048
+ if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
1049
+ par <- peak$param.hat
1050
+ par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
1051
+ thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
1052
+ if(peak$log) thr <- exp(thr)
1053
+
1054
+ return(thr)
819
1055
  }
820
1056
 
1057
+ #' Enveomics: Recruitment Plot (2) Extract Windows
1058
+ #'
1059
+ #' Extract windows significantly below (or above) the peak in sequencing
1060
+ #' depth.
1061
+ #'
1062
+ #' @param rp
1063
+ #' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
1064
+ #' @param peak
1065
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1066
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1067
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1068
+ #' @param lower.tail
1069
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1070
+ #' sequencing depth.
1071
+ #' @param significance
1072
+ #' Significance threshold (alpha) to select windows.
1073
+ #' @param seq.names
1074
+ #' Returns subject sequence names instead of a vector of Booleans. If
1075
+ #' the recruitment plot was generated with named position bins (e.g, using
1076
+ #' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
1077
+ #' vector of characters (the sequence identifiers), otherwise it returns a
1078
+ #' data.frame with a name column and two columns of coordinates.
1079
+ #'
1080
+ #' @return
1081
+ #' Returns a vector of logicals if \code{seq.names = FALSE}.
1082
+ #' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
1083
+ #' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
1084
+ #' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
1085
+ #'
1086
+ #' @author Luis M. Rodriguez-R [aut, cre]
1087
+ #'
1088
+ #' @export
1089
+
821
1090
  enve.recplot2.extractWindows <- function
822
- ### Extract windows significantly below (or above) the peak in sequencing
823
- ### depth.
824
- (rp,
825
- ### Recruitment plot, a `enve.RecPlot2` object.
826
- peak,
827
- ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
828
- ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
829
- ### used (see `enve.recplot2.corePeak`).
830
- lower.tail=TRUE,
831
- ### If FALSE, it returns windows significantly above the peak in
832
- ### sequencing depth.
833
- significance=0.05,
834
- ### Significance threshold (alpha) to select windows.
835
- seq.names=FALSE
836
- ### Returns subject sequence names instead of a vector of Booleans. If
837
- ### the recruitment plot was generated with named position bins (e.g, using
838
- ### `pos.breaks`=0 or a two-column `pos.breaks.tsv`), it returns a vector of
839
- ### characters (the sequence identifiers), otherwise it returns a data.frame
840
- ### with a name column and two columns of coordinates.
841
- ){
1091
+ (rp,
1092
+ peak,
1093
+ lower.tail = TRUE,
1094
+ significance = 0.05,
1095
+ seq.names = FALSE
1096
+ ){
842
1097
  # Determine the threshold
843
1098
  thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
844
1099
 
@@ -850,43 +1105,54 @@ enve.recplot2.extractWindows <- function
850
1105
  sel <- seqdepth.in > thr
851
1106
  }
852
1107
 
853
- # seq.names=FALSE
1108
+ # seq.names = FALSE
854
1109
  if(!seq.names) return(sel)
855
- # seq.names=TRUE and pos.names defined
856
- if(length(rp$pos.names) != 0) return(rp$pos.names[sel])
857
- # seq.names=TRUE and pos.names undefined
858
- return(enve.recplot2.coordinates(rp,sel))
859
- ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
860
- ### it returns a vector of characters if the object has `pos.names` defined,
861
- ### or a data.frame with four columns otherwise:
862
- ### name.from, name.to, pos.from, and pos.to
863
- ### (see `enve.recplot2.coordinates`).
1110
+ # seq.names = TRUE
1111
+ return(enve.recplot2.coordinates(rp, sel))
864
1112
  }
865
1113
 
1114
+ #' Enveomics: Recruitment Plot (2) Compare Identities
1115
+ #'
1116
+ #' Compare the distribution of identities between two
1117
+ #' \code{\link{enve.RecPlot2}} objects.
1118
+ #'
1119
+ #' @param x
1120
+ #' First \code{\link{enve.RecPlot2}} object.
1121
+ #' @param y
1122
+ #' Second \code{\link{enve.RecPlot2}} object.
1123
+ #' @param method
1124
+ #' Distance method to use. This should be (an unambiguous abbreviation of)
1125
+ #' one of:
1126
+ #' \itemize{
1127
+ #' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
1128
+ #' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
1129
+ #' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
1130
+ #' doi:10.1214/aoms/1177729694}), or}
1131
+ #' \item{"euclidean"}
1132
+ #' }
1133
+ #' @param smooth.par
1134
+ #' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
1135
+ #' Use \code{NULL} to automatically determine this value using leave-one-out
1136
+ #' cross-validation (see \code{smooth.spline} parameter \code{spar}).
1137
+ #' @param pseudocounts
1138
+ #' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
1139
+ #' 1 for add-one smoothing.
1140
+ #' @param max.deviation
1141
+ #' Maximum mean deviation between identity breaks tolerated (as percent
1142
+ #' identity). Difference in number of \code{id.breaks} is never tolerated.
1143
+ #'
1144
+ #' @author Luis M. Rodriguez-R [aut, cre]
1145
+ #'
1146
+ #' @export
1147
+
866
1148
  enve.recplot2.compareIdentities <- function
867
- ### Compare the distribution of identities between two enve.RecPlot2 objects.
868
- (x,
869
- ### First enve.RecPlot2 object.
870
- y,
871
- ### Second enve.RecPlot2 object.
872
- method="hellinger",
873
- ### Distance method to use. This should be (an unambiguous abbreviation of)
874
- ### one of:
875
- ### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
876
- ### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
877
- ### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
878
- ### doi:10.1214/aoms/1177729694), or "euclidean".
879
- smooth.par=NULL,
880
- ### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
881
- ### Use NULL to automatically determine this value using leave-one-out
882
- ### cross-validation (see `smooth.spline` parameter `spar`).
883
- pseudocounts=0,
884
- ### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
885
- ### 1 for add-one smoothing.
886
- max.deviation=0.75
887
- ### Maximum mean deviation between identity breaks tolerated (as percent
888
- ### identity). Difference in number of id.breaks is never tolerated.
889
- ){
1149
+ (x,
1150
+ y,
1151
+ method="hellinger",
1152
+ smooth.par=NULL,
1153
+ pseudocounts=0,
1154
+ max.deviation=0.75
1155
+ ){
890
1156
  METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
891
1157
  i.meth <- pmatch(method, METHODS)
892
1158
  if (is.na(i.meth)) stop("Invalid distance ", method)
@@ -899,7 +1165,7 @@ enve.recplot2.compareIdentities <- function
899
1165
  dev <- mean(abs(x$id.breaks - y$id.breaks))
900
1166
  if(dev > max.deviation)
901
1167
  stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
902
- dev)
1168
+ dev)
903
1169
  x.cnt <- x$id.counts
904
1170
  y.cnt <- y$id.counts
905
1171
  if(is.null(smooth.par) || smooth.par > 0){
@@ -928,63 +1194,92 @@ enve.recplot2.compareIdentities <- function
928
1194
  return(d)
929
1195
  }
930
1196
 
1197
+ #' Enveomics: Recruitment Plot (2) Coordinates
1198
+ #'
1199
+ #' Returns the sequence name and coordinates of the requested position bins.
1200
+ #'
1201
+ #' @param x
1202
+ #' \code{\link{enve.RecPlot2}} object.
1203
+ #' @param bins
1204
+ #' Vector of selected bins to return. It can be a vector of logical values
1205
+ #' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
1206
+ #' missing, returns the coordinates of all windows.
1207
+ #'
1208
+ #' @return
1209
+ #' Returns a data.frame with five columns: \code{name.from} (character),
1210
+ #' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
1211
+ #' (numeric), and \code{seq.name} (character).
1212
+ #' The first two correspond to sequence and position of the start point of the
1213
+ #' bin. The next two correspond to the sequence and position of the end point of
1214
+ #' the bin. The last one indicates the name of the sequence (if defined).
1215
+ #'
1216
+ #' @author Luis M. Rodriguez-R [aut, cre]
1217
+ #'
1218
+ #' @export
1219
+
931
1220
  enve.recplot2.coordinates <- function
932
- ### Returns the sequence name and coordinates of the requested position bins.
933
- (x,
934
- ### `enve.RecPlot2` object.
935
- bins
936
- ### Vector of selected bins to return. It can be a vector of logical values
937
- ### with the same length as `x$pos.breaks`-1 or a vector of integers. If
938
- ### missing, returns the coordinates of all windows.
939
- ){
1221
+ (x,
1222
+ bins
1223
+ ){
940
1224
  if(!inherits(x, "enve.RecPlot2"))
941
1225
  stop("'x' must inherit from class `enve.RecPlot2`")
942
1226
  if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
943
1227
  if(!is.vector(bins)) stop("'bins' must be a vector")
944
1228
  if(inherits(bins, "logical")) bins <- which(bins)
945
1229
 
946
- y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
947
-
1230
+ y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
1231
+
948
1232
  for(i in 1:length(bins)){
949
1233
  j <- bins[i]
950
1234
  # Concatenated coordinates
951
1235
  cc <- x$pos.breaks[c(j, j+1)]
952
1236
  # Find the corresponding `seq.breaks`
953
1237
  sb.from <- which(
954
- cc[1] >=x$seq.breaks[-length(x$seq.breaks)] &
955
- cc[1] < x$seq.breaks[-1])
1238
+ cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
1239
+ cc[1] < x$seq.breaks[-1])
956
1240
  sb.to <- which(
957
- cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
958
- cc[2] <=x$seq.breaks[-1])
1241
+ cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
1242
+ cc[2] <= x$seq.breaks[-1])
959
1243
  # Translate coordinates
960
1244
  if(length(sb.from)==1 & length(sb.to)==1){
961
1245
  y[i, 'name.from'] <- x$seq.names[sb.from]
962
- y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
1246
+ y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
963
1247
  y[i, 'name.to'] <- x$seq.names[sb.to]
964
- y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
1248
+ y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
1249
+ y[i, 'seq.name'] <- x$pos.names[i]
965
1250
  }
966
1251
  }
967
1252
 
968
1253
  return(y)
969
- ### Returns a data.frame with four columns: name.from (character), pos.from
970
- ### (numeric) name.to (character), and pos.to (numeric). The first two
971
- ### correspond to sequence and position of the start point of the bin, the
972
- ### last two correspond to the sequence and position of the end point of the
973
- ### bin.
974
1254
  }
975
1255
 
1256
+ #' Enveomics: Recruitment Plot (2) Sequencing Depth
1257
+ #'
1258
+ #' Calculate the sequencing depth of the given window(s).
1259
+ #'
1260
+ #' @param x
1261
+ #' \code{\link{enve.RecPlot2}} object.
1262
+ #' @param sel
1263
+ #' Window(s) for which the sequencing depth is to be calculated. If not
1264
+ #' passed, it returns the sequencing depth of all windows.
1265
+ #' @param low.identity
1266
+ #' A logical indicating if the sequencing depth is to be estimated only
1267
+ #' with low-identity matches. By default, only high-identity matches are
1268
+ #' used.
1269
+ #'
1270
+ #' @return
1271
+ #' Returns a numeric vector of sequencing depths (in bp/bp).
1272
+ #'
1273
+ #' @author Luis M. Rodriguez-R [aut, cre]
1274
+ #'
1275
+ #' @export
1276
+
976
1277
  enve.recplot2.seqdepth <- function
977
- ### Calculate the sequencing depth of the given window(s)
978
- (x,
979
- ### `enve.RecPlot2` object.
980
- sel,
981
- ### Window(s) for which the sequencing depth is to be calculated. If not
982
- ### passed, it returns the sequencing depth of all windows
983
- low.identity=FALSE
984
- ### A logical indicating if the sequencing depth is to be estimated only
985
- ### with low-identity matches. By default, only high-identity matches are
986
- ### used.
987
- ){
1278
+
1279
+ (x,
1280
+ sel,
1281
+ low.identity=FALSE
1282
+ ){
988
1283
  if(!inherits(x, "enve.RecPlot2"))
989
1284
  stop("'x' must inherit from class `enve.RecPlot2`")
990
1285
  if(low.identity){
@@ -997,21 +1292,30 @@ enve.recplot2.seqdepth <- function
997
1292
  seqdepth.in <- pos.cnts.in/pos.binsize
998
1293
  if(missing(sel)) return(seqdepth.in)
999
1294
  return(seqdepth.in[sel])
1000
- ### Returns a numeric vector of sequencing depths (in bp/bp).
1001
1295
  }
1002
1296
 
1297
+ #' Enveomics: Recruitment Plot (2) ANI Estimate
1298
+ #'
1299
+ #' Estimate the Average Nucleotide Identity from reads (ANIr) from a
1300
+ #' recruitment plot.
1301
+ #'
1302
+ #' @param x
1303
+ #' \code{\link{enve.RecPlot2}} object.
1304
+ #' @param range
1305
+ #' Range of identities to be considered. By default, the full range
1306
+ #' is used (note that the upper boundary is \code{Inf} and not 100 because
1307
+ #' recruitment plots can also be built with bit-scores). To use only
1308
+ #' intra-population matches (with identities), use c(95,100). To use only
1309
+ #' inter-population values, use c(0,95).
1310
+ #'
1311
+ #' @author Luis M. Rodriguez-R [aut, cre]
1312
+ #'
1313
+ #' @export
1314
+
1003
1315
  enve.recplot2.ANIr <- function
1004
- ### Estimate the Average Nucleotide Identity from reads (ANIr) from a
1005
- ### recruitment plot
1006
- (x,
1007
- ### `enve.RecPlot2` object.
1008
- range=c(0,Inf)
1009
- ### Range of identities to be considered. By default, the full range
1010
- ### is used (note that the upper boundary is `Inf` and not 100 because
1011
- ### recruitment plots can also be built with bit-scores). To use only
1012
- ### intra-population matches (with identities), use c(95,100). To use only
1013
- ### inter-population values, use c(0,95).
1014
- ){
1316
+ (x,
1317
+ range=c(0,Inf)
1318
+ ){
1015
1319
  if(!inherits(x, "enve.RecPlot2"))
1016
1320
  stop("'x' must inherit from class `enve.RecPlot2`")
1017
1321
  id.b <- x$id.breaks
@@ -1023,35 +1327,82 @@ enve.recplot2.ANIr <- function
1023
1327
  }
1024
1328
 
1025
1329
  #==============> Define internal functions
1330
+
1331
+ #' Enveomics: Recruitment Plot (2) Internal Ancillary Function
1332
+ #'
1333
+ #' Internal ancillary function (see \code{\link{enve.recplot2}}).
1334
+ #'
1335
+ #' @param x \code{\link{enve.RecPlot2}} object
1336
+ #' @param pos.breaks Position breaks
1337
+ #' @param id.breaks Identity breaks
1338
+ #' @param rec.idcol Identity column to use
1339
+ #'
1340
+ #' @author Luis M. Rodriguez-R [aut, cre]
1341
+ #' @author Kenji Gerhardt [aut]
1342
+ #'
1343
+ #' @export
1344
+
1026
1345
  enve.recplot2.__counts <- function
1027
- ### Internal ancilliary function (see `enve.recplot2`).
1028
- (x, pos.breaks, id.breaks, rec.idcol){
1029
- rec <- x$rec
1030
- verbose <- x$verbose
1031
- counts <- matrix(0, nrow=length(pos.breaks)-1, ncol=length(id.breaks)-1);
1032
- for(i in 1:nrow(rec)){
1033
- if(verbose & i%%100==0) cat(" [",signif(i*100/nrow(rec),3),"% ] \r");
1034
- y.bin <- which(
1035
- rec[i,rec.idcol]>=id.breaks[-length(id.breaks)] &
1036
- rec[i,rec.idcol]<=id.breaks[-1])[1] ;
1037
- for(pos in rec[i,1]:rec[i,2]){
1038
- x.bin <- which(
1039
- pos>=pos.breaks[-length(pos.breaks)] & pos<=pos.breaks[-1])[1] ;
1040
- counts[x.bin, y.bin] <- counts[x.bin, y.bin]+1 ;
1041
- }
1042
- }
1043
- return(counts);
1346
+ (x, pos.breaks, id.breaks, rec.idcol) {
1347
+ rec2 <- x$rec
1348
+ verbose <- x$verbose
1349
+
1350
+ # get counts of how many occurrences of each genome pos.bin there are per read
1351
+ x.bins <- mapply(
1352
+ function(start, end) {
1353
+ list(rle(findInterval(start:end, pos.breaks, left.open = T)))
1354
+ }, rec2[, 1], rec2[, 2])
1355
+
1356
+ # find the single y bin for each row, replicates it at the correct places to
1357
+ # the number of distinct bins found in its row
1358
+ y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
1359
+ times = unlist(lapply(x.bins, function(a) length(a$lengths))))
1360
+
1361
+ # x.bins_counts is the number of occurrences of each bin a row contains,
1362
+ # per row, then unlisted
1363
+ x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
1364
+
1365
+ # these are the pos. in. genome bins that each count in x.bins_counts falls into
1366
+ x.bins <- unlist(lapply(x.bins, function(a) a$values))
1367
+
1368
+ # much more efficient counts implementation in R using lists instead of a matrix:
1369
+ counts <- lapply(
1370
+ 1:(length(pos.breaks) - 1),
1371
+ function(col_len) rep(0, length(id.breaks) - 1))
1372
+
1373
+ # accesses the correct list in counts by x.bin, then
1374
+ # accesses the position in that row by y.bins and adds the new count
1375
+ for(i in 1:length(x.bins)){
1376
+ counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
1377
+ }
1378
+
1379
+ counts <- do.call(rbind, counts)
1380
+ return(counts)
1044
1381
  }
1045
1382
 
1383
+ #' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
1384
+ #'
1385
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
1386
+ #'
1387
+ #' @param x \code{\link{enve.RecPlot2}} object
1388
+ #' @param comp Components
1389
+ #' @param do_crit Function estimating the criterion
1390
+ #' @param best Best solution thus far
1391
+ #' @param verbose If verbose
1392
+ #' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
1393
+ #'
1394
+ #' @author Luis M. Rodriguez-R [aut, cre]
1395
+ #'
1396
+ #' @export
1397
+
1046
1398
  enve.recplot2.findPeaks.__emauto_one <- function
1047
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.emauto).
1048
- (x, comp, do_crit, best, verbose, ...){
1399
+ (x, comp, do_crit, best, verbose, ...){
1049
1400
  peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
1050
1401
  if(length(peaks)==0) return(best)
1051
1402
  k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
1052
1403
  crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
1053
1404
  if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
1054
- ifelse(crit > best[['crit']], '*', ''), '\n')
1405
+ ifelse(crit > best[['crit']], '*', ''), '\n')
1055
1406
  if(crit > best[['crit']]){
1056
1407
  best[['crit']] <- crit
1057
1408
  best[['peaks']] <- peaks
@@ -1059,148 +1410,217 @@ enve.recplot2.findPeaks.__emauto_one <- function
1059
1410
  best[['pstore']][[comp]] <- peaks
1060
1411
  return(best)
1061
1412
  }
1413
+
1414
+ #' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
1415
+ #'
1416
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1417
+ #'
1418
+ #' @param x Vector of log-transformed sequencing depths
1419
+ #' @param theta Parameters list
1420
+ #'
1421
+ #' @author Luis M. Rodriguez-R [aut, cre]
1422
+ #'
1423
+ #' @export
1424
+
1062
1425
  enve.recplot2.findPeaks.__em_e <- function
1063
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`).
1064
- (x, theta){
1426
+ (x, theta){
1065
1427
  components <- length(theta[['mu']])
1066
1428
  product <- do.call(cbind,
1067
- lapply(1:components,
1068
- function(i) dnorm(x, theta[['mu']][i],
1069
- theta[['sd']][i])*theta[['alpha']][i]))
1429
+ lapply(1:components,
1430
+ function(i) dnorm(x, theta[['mu']][i],
1431
+ theta[['sd']][i])*theta[['alpha']][i]))
1070
1432
  sum.of.components <- rowSums(product)
1071
1433
  posterior <- product / sum.of.components
1072
-
1434
+
1073
1435
  return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1074
1436
  }
1075
1437
 
1438
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
1439
+ #'
1440
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1441
+ #'
1442
+ #' @param x Vector of log-transformed sequencing depths
1443
+ #' @param posterior Posterior probability
1444
+ #'
1445
+ #' @author Luis M. Rodriguez-R [aut, cre]
1446
+ #'
1447
+ #' @export
1448
+
1076
1449
  enve.recplot2.findPeaks.__em_m <- function
1077
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`
1078
- (x, posterior){
1450
+ (x, posterior){
1079
1451
  components <- ncol(posterior)
1080
1452
  n <- colSums(posterior)
1081
1453
  mu <- colSums(posterior * x) / n
1082
1454
  sd <- sqrt( colSums(
1083
- posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1455
+ posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1084
1456
  alpha <- n/length(x)
1085
1457
  return(list(mu=mu, sd=sd, alpha=alpha))
1086
1458
  }
1087
1459
 
1460
+ #' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
1461
+ #'
1462
+ #' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
1463
+ #'
1464
+ #' @param x \code{\link{enve.RecPlot2.Peak}} object
1465
+ #' @param mids Midpoints
1466
+ #' @param counts Counts
1467
+ #'
1468
+ #' @author Luis M. Rodriguez-R [aut, cre]
1469
+ #'
1470
+ #' @export
1471
+
1088
1472
  enve.recplot2.__peakHist <- function
1089
- ### Internal ancilliary function (see `enve.RecPlot2.Peak`).
1090
- (x, mids, counts=TRUE){
1091
- d.o <- x$param.hat
1092
- if(length(x$log)==0) x$log <- FALSE
1093
- if(x$log){
1094
- d.o$x <- log(mids)
1095
- }else{
1096
- d.o$x <- mids
1097
- }
1098
- prob <- do.call(paste('d', x$dist, sep=''), d.o)
1099
- if(!counts) return(prob)
1100
- if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
1101
- return(prob*x$n.hat/sum(prob))
1473
+ (x, mids, counts=TRUE){
1474
+ d.o <- x$param.hat
1475
+ if(length(x$log)==0) x$log <- FALSE
1476
+ if(x$log){
1477
+ d.o$x <- log(mids)
1478
+ }else{
1479
+ d.o$x <- mids
1480
+ }
1481
+ prob <- do.call(paste('d', x$dist, sep=''), d.o)
1482
+ if(!counts) return(prob)
1483
+ if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
1484
+ return(prob*x$n.hat/sum(prob))
1102
1485
  }
1103
1486
 
1487
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
1488
+ #'
1489
+ #' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1490
+ #'
1491
+ #' @param lsd1 Vector of log-transformed sequencing depths
1492
+ #' @param min.points Minimum number of points
1493
+ #' @param quant.est Quantile estimate
1494
+ #' @param mlv.opts List of options for \code{mlv}
1495
+ #' @param fitdist.opts List of options for \code{fitdist}
1496
+ #' @param with.skewness If skewed-normal should be used
1497
+ #' @param optim.rounds Maximum number of optimization rounds
1498
+ #' @param optim.epsilon Minimum difference considered negligible
1499
+ #' @param n.total Global number of windows
1500
+ #' @param merge.logdist Attempted \code{merge.logdist} parameter
1501
+ #' @param verbose If verbose
1502
+ #' @param log If log-transformed depths
1503
+ #'
1504
+ #' @author Luis M. Rodriguez-R [aut, cre]
1505
+ #'
1506
+ #' @export
1507
+
1104
1508
  enve.recplot2.findPeaks.__mow_one <- function
1105
- ### Internall ancilliary function (see `enve.recplot2.findPeaks.mower`).
1106
- (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
1107
- optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
1108
- ){
1109
- dist <- ifelse(with.skewness, 'sn', 'norm');
1110
-
1111
- # Find peak
1112
- o <- mlv.opts; o$x = lsd1;
1113
- mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1114
- if(verbose) cat('Anchoring at mode =',mode1,'\n')
1115
- param.hat <- fitdist.opts$start; last.hat <- param.hat;
1116
- lim <- NA;
1117
- if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
1118
-
1119
- # Refine peak parameters
1120
- for(round in 1:optim.rounds){
1121
- param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
1122
- lim.o <- param.hat
1123
- lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
1124
- lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
1125
- if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
1126
- 'and params =',as.numeric(param.hat),' \r')
1127
- if(length(lsd1.pop) < min.points) break;
1128
- o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
1129
- last.last.hat <- last.hat
1130
- last.hat <- param.hat
1131
- param.hat <- as.list(do.call(fitdist, o)$estimate);
1132
- if(any(is.na(param.hat))){
1133
- if(round>1) param.hat <- last.hat;
1134
- break;
1135
- }
1136
- if(round > 1){
1137
- epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1138
- if(epsilon1 < optim.epsilon) break;
1139
- if(round > 2){
1140
- epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1141
- if(epsilon2 < optim.epsilon) break;
1142
- }
1509
+ (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
1510
+ optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
1511
+ ){
1512
+ dist <- ifelse(with.skewness, 'sn', 'norm');
1513
+
1514
+ # Find peak
1515
+ o <- mlv.opts; o$x = lsd1;
1516
+ mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1517
+ if(verbose) cat('Anchoring at mode =',mode1,'\n')
1518
+ param.hat <- fitdist.opts$start; last.hat <- param.hat;
1519
+ lim <- NA;
1520
+ if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
1521
+
1522
+ # Refine peak parameters
1523
+ for(round in 1:optim.rounds){
1524
+ param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
1525
+ lim.o <- param.hat
1526
+ lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
1527
+ lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
1528
+ if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
1529
+ 'and params =',as.numeric(param.hat),' \r')
1530
+ if(length(lsd1.pop) < min.points) break;
1531
+ o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
1532
+ last.last.hat <- last.hat
1533
+ last.hat <- param.hat
1534
+ param.hat <- as.list(do.call(fitdist, o)$estimate);
1535
+ if(any(is.na(param.hat))){
1536
+ if(round>1) param.hat <- last.hat;
1537
+ break;
1538
+ }
1539
+ if(round > 1){
1540
+ epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1541
+ if(epsilon1 < optim.epsilon) break;
1542
+ if(round > 2){
1543
+ epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1544
+ if(epsilon2 < optim.epsilon) break;
1143
1545
  }
1144
- }
1145
- if(verbose) cat('\n')
1146
- if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
1147
-
1148
- # Mow distribution
1149
- lsd2 <- c();
1150
- lsd.pop <- c();
1151
- n.hat <- length(lsd1.pop)/diff(quant.est)
1152
- peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
1153
- param.hat=param.hat, n.hat=n.hat, n.total=n.total,
1154
- merge.logdist=merge.logdist, log=log)
1155
- peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
1156
- peak.cnt <- enve.recplot2.__peakHist(peak,
1157
- (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
1158
- for(i in 2:length(peak.breaks)){
1159
- values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
1160
- n.exp <- peak.cnt[i-1]
1161
- if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
1162
- if(length(values)==0) next
1163
- in.peak <- runif(length(values)) <= n.exp/length(values)
1164
- lsd2 <- c(lsd2, values[!in.peak])
1165
- lsd.pop <- c(lsd.pop, values[in.peak])
1166
- }
1167
- if(length(lsd.pop) < min.points) return(NULL)
1168
-
1169
- # Return peak
1170
- attr(peak, 'values') <- lsd.pop
1171
- attr(peak, 'values.res') <- lsd2
1172
- attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
1173
- plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
1174
- plot=FALSE)$counts)+1)/2
1175
- mu <- tail(param.hat, n=1)
1176
- attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
1177
- if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
1178
- 'with expected n =',n.hat,'\n')
1179
- return(peak)
1546
+ }
1547
+ }
1548
+ if(verbose) cat('\n')
1549
+ if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
1550
+
1551
+ # Mow distribution
1552
+ lsd2 <- c();
1553
+ lsd.pop <- c();
1554
+ n.hat <- length(lsd1.pop)/diff(quant.est)
1555
+ peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
1556
+ param.hat=param.hat, n.hat=n.hat, n.total=n.total,
1557
+ merge.logdist=merge.logdist, log=log)
1558
+ peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
1559
+ peak.cnt <- enve.recplot2.__peakHist(peak,
1560
+ (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
1561
+ for(i in 2:length(peak.breaks)){
1562
+ values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
1563
+ n.exp <- peak.cnt[i-1]
1564
+ if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
1565
+ if(length(values)==0) next
1566
+ in.peak <- runif(length(values)) <= n.exp/length(values)
1567
+ lsd2 <- c(lsd2, values[!in.peak])
1568
+ lsd.pop <- c(lsd.pop, values[in.peak])
1569
+ }
1570
+ if(length(lsd.pop) < min.points) return(NULL)
1571
+
1572
+ # Return peak
1573
+ attr(peak, 'values') <- lsd.pop
1574
+ attr(peak, 'values.res') <- lsd2
1575
+ attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
1576
+ plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
1577
+ plot=FALSE)$counts)+1)/2
1578
+ mu <- tail(param.hat, n=1)
1579
+ attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
1580
+ if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
1581
+ 'with expected n =',n.hat,'\n')
1582
+ return(peak)
1180
1583
  }
1181
1584
 
1585
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
1586
+ #'
1587
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1588
+ #'
1589
+ #' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
1590
+ #'
1591
+ #' @author Luis M. Rodriguez-R [aut, cre]
1592
+ #'
1593
+ #' @export
1594
+
1182
1595
  enve.recplot2.findPeaks.__mower <- function
1183
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.mower`).
1184
- (peaks.opts){
1185
- peaks <- list()
1186
- while(length(peaks.opts$lsd1) > peaks.opts$min.points){
1187
- peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
1188
- if(is.null(peak)) break
1189
- peaks[[ length(peaks)+1 ]] <- peak
1190
- peaks.opts$lsd1 <- peak$values.res
1191
- }
1192
- return(peaks)
1596
+ (peaks.opts){
1597
+ peaks <- list()
1598
+ while(length(peaks.opts$lsd1) > peaks.opts$min.points){
1599
+ peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
1600
+ if(is.null(peak)) break
1601
+ peaks[[ length(peaks)+1 ]] <- peak
1602
+ peaks.opts$lsd1 <- peak$values.res
1603
+ }
1604
+ return(peaks)
1193
1605
  }
1194
1606
 
1607
+ #' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
1608
+ #'
1609
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
1610
+ #'
1611
+ #' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
1612
+ #' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
1613
+ #'
1614
+ #' @author Luis M. Rodriguez-R [aut, cre]
1615
+ #'
1616
+ #' @export
1195
1617
 
1196
1618
  enve.recplot2.__whichClosestPeak <- function
1197
- ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
1198
- (peak, peaks){
1199
- dist <- as.numeric(lapply(peaks,
1200
- function(x)
1201
- abs(log(x$param.hat[[ length(x$param.hat) ]] /
1202
- peak$param.hat[[ length(peak$param.hat) ]] ))))
1203
- dist[ dist==0 ] <- Inf
1204
- return(which.min(dist))
1619
+ (peak, peaks){
1620
+ dist <- as.numeric(lapply(peaks,
1621
+ function(x)
1622
+ abs(log(x$param.hat[[ length(x$param.hat) ]] /
1623
+ peak$param.hat[[ length(peak$param.hat) ]] ))))
1624
+ dist[ dist==0 ] <- Inf
1625
+ return(which.min(dist))
1205
1626
  }
1206
-