miga-base 0.4.3.0 → 0.5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -1,497 +1,647 @@
1
1
  #==============> Define S4 classes
2
- setClass("enve.RecPlot2",
3
- ### Enve-omics representation of Recruitment plots. This object can
4
- ### be produced by `enve.recplot2` and supports S4 method plot.
5
- representation(
6
- counts='matrix', ##<< Counts as a two-dimensional histogram.
7
- pos.counts.in='numeric', ##<< Counts of in-group hits per position bin.
8
- pos.counts.out='numeric', ##<< Counts of out-group hits per position bin.
9
- id.counts='numeric', ##<< Counts per ID bin.
10
- id.breaks='numeric', ##<< Breaks of identity bins.
11
- pos.breaks='numeric', ##<< Breaks of position bins.
12
- pos.names='character', ##<< Names of the position bins.
13
- seq.breaks='numeric', ##<< Breaks of input sequences.
14
- peaks='list', ##<< Peaks identified in the recplot.
15
- ### Limits of the subject sequences after concatenation.
16
- seq.names='character', ##<< Names of the subject sequences.
17
- id.metric='character', ##<< Metric used as 'identity'.
18
- id.ingroup='logical', ##<< Identity bins considered in-group.
19
- call='call') ##<< Call producing this object.
20
- ,package='enveomics.R'
21
- );
22
- setClass("enve.RecPlot2.Peak",
23
- ### Enve-omics representation of a peak in the sequencing depth histogram
24
- ### of a Recruitment plot (see `enve.recplot2.findPeaks`).
25
- representation(
26
- dist='character',
27
- ### Distribution of the peak. Currently supported: 'norm' (normal) and 'sn'
28
- ### (skew-normal).
29
- values='numeric',
30
- ### Sequencing depth values predicted to conform the peak.
31
- values.res='numeric',
32
- ### Sequencing depth values not explained by this or previously identified
33
- ### peaks.
34
- mode='numeric',
35
- ### Seed-value of mode anchoring the peak.
36
- param.hat='list',
37
- ### Parameters of the distribution. A list of two values if dist='norm' (sd
38
- ### and mean), or three values if dist='sn' (omega=scale, alpha=shape, and
39
- ### xi=location). Note that the "dispersion" parameter is always first and
40
- ### the "location" parameter is always last.
41
- n.hat='numeric',
42
- ### Number of bins estimated to be explained by this peak. This should
43
- ### ideally be equal to the length of `values`, but it's not and integer.
44
- n.total='numeric',
45
- ### Total number of bins from which the peak was extracted. I.e., total
46
- ### number of position bins with non-zero sequencing depth in the recruitment
47
- ### plot (regardless of peak count).
48
- err.res='numeric',
49
- ### Error left after adding the peak (mower) or log-likelihood (em or emauto).
50
- merge.logdist='numeric',
51
- ### Attempted `merge.logdist` parameter.
52
- seq.depth='numeric',
53
- ### Best estimate available for the sequencing depth of the peak (centrality).
54
- log='logical'
55
- ### Indicates if the estimation was performed in natural logarithm space
56
- ));
2
+
3
+ #' Enveomics: Recruitment Plot (2) - S4 Class
4
+ #'
5
+ #' Enve-omics representation of Recruitment plots. This object can
6
+ #' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
7
+ #'
8
+ #' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
9
+ #' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
10
+ #' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
11
+ #' @slot id.counts \code{(numeric)} Counts per ID bin.
12
+ #' @slot id.breaks \code{(numeric)} Breaks of identity bins.
13
+ #' @slot pos.breaks \code{(numeric)} Breaks of position bins.
14
+ #' @slot pos.names \code{(character)} Names of the position bins.
15
+ #' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
16
+ #' @slot peaks \code{(list)} Peaks identified in the recplot.
17
+ #' Limits of the subject sequences after concatenation.
18
+ #' @slot seq.names \code{(character}) Names of the subject sequences.
19
+ #' @slot id.metric \code{(character}) Metric used as 'identity'.
20
+ #' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
21
+ #' @slot call \code{(call)} Call producing this object.
22
+ #'
23
+ #' @author Luis M. Rodriguez-R [aut, cre]
24
+ #'
25
+ #' @exportClass
26
+
27
+ enve.RecPlot2 <- setClass("enve.RecPlot2",
28
+ representation(
29
+ # slots = list(
30
+ counts='matrix',
31
+ pos.counts.in='numeric',
32
+ pos.counts.out='numeric',
33
+ id.counts='numeric',
34
+ id.breaks='numeric',
35
+ pos.breaks='numeric',
36
+ pos.names='character',
37
+ seq.breaks='numeric',
38
+ peaks='list',
39
+ seq.names='character',
40
+ id.metric='character',
41
+ id.ingroup='logical',
42
+ call='call')
43
+ ,package='enveomics.R'
44
+ );
45
+
46
+ #' Enveomics: Recruitment Plot (2) Peak - S4 Class
47
+ #'
48
+ #' Enve-omics representation of a peak in the sequencing depth histogram
49
+ #' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
50
+ #'
51
+ #' @slot dist \code{(character)}
52
+ #' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
53
+ #' (skew-normal).
54
+ #' @slot values \code{(numeric)}
55
+ #' Sequencing depth values predicted to conform the peak.
56
+ #' @slot values.res \code{(numeric)}
57
+ #' Sequencing depth values not explained by this or previously identified
58
+ #' peaks.
59
+ #' @slot mode \code{(numeric)}
60
+ #' Seed-value of mode anchoring the peak.
61
+ #' @slot param.hat \code{(list)}
62
+ #' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
63
+ #' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
64
+ #' xi=location). Note that the "dispersion" parameter is always first and
65
+ #' the "location" parameter is always last.
66
+ #' @slot n.hat \code{(numeric)}
67
+ #' Number of bins estimated to be explained by this peak. This should
68
+ #' ideally be equal to the length of \code{values}, but it's not an integer.
69
+ #' @slot n.total \code{(numeric)}
70
+ #' Total number of bins from which the peak was extracted. I.e., total
71
+ #' number of position bins with non-zero sequencing depth in the recruitment
72
+ #' plot (regardless of peak count).
73
+ #' @slot err.res \code{(numeric)}
74
+ #' Error left after adding the peak (mower) or log-likelihood (em or emauto).
75
+ #' @slot merge.logdist \code{(numeric)}
76
+ #' Attempted \code{merge.logdist} parameter.
77
+ #' @slot seq.depth \code{(numeric)}
78
+ #' Best estimate available for the sequencing depth of the peak (centrality).
79
+ #' @slot log \code{(logical)}
80
+ #' Indicates if the estimation was performed in natural logarithm space.
81
+ #'
82
+ #' @author Luis M. Rodriguez-R [aut, cre]
83
+ #'
84
+ #' @exportClass
85
+
86
+ enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
87
+ representation(
88
+ # slots = list(
89
+ dist='character',
90
+ values='numeric',
91
+ values.res='numeric',
92
+ mode='numeric',
93
+ param.hat='list',
94
+ n.hat='numeric',
95
+ n.total='numeric',
96
+ err.res='numeric',
97
+ merge.logdist='numeric',
98
+ seq.depth='numeric',
99
+ log='logical'
100
+ ));
101
+
102
+ #' Attribute accessor
103
+ #'
104
+ #'
105
+ #' @param x Object
106
+ #' @param name Attribute name
57
107
  setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
108
+
109
+ #' Attribute accessor
110
+ #'
111
+ #'
112
+ #' @param x Object
113
+ #' @param name Attribute name
58
114
  setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
59
115
 
60
116
  #==============> Define S4 methods
117
+
118
+ #' Enveomics: Recruitment Plot (2)
119
+ #'
120
+ #' Plots an \code{\link{enve.RecPlot2}} object.
121
+ #'
122
+ #' @param x
123
+ #' \code{\link{enve.RecPlot2}} object to plot.
124
+ #' @param layout
125
+ #' Matrix indicating the position of the different panels in the layout,
126
+ #' where:
127
+ #' \itemize{
128
+ #' \item 0: Empty space
129
+ #' \item 1: Counts matrix
130
+ #' \item 2: position histogram (sequencing depth)
131
+ #' \item 3: identity histogram
132
+ #' \item 4: Populations histogram (histogram of sequencing depths)
133
+ #' \item 5: Color scale for the counts matrix (vertical)
134
+ #' \item 6: Color scale of the counts matrix (horizontal)
135
+ #' }
136
+ #' Only panels indicated here will be plotted. To plot only one panel
137
+ #' simply set this to the number of the panel you want to plot.
138
+ #' @param panel.fun
139
+ #' List of functions to be executed after drawing each panel. Use the
140
+ #' indices in \code{layout} (as characters) as keys. Functions for indices
141
+ #' missing in \code{layout} are ignored. For example, to add a vertical line
142
+ #' at the 3Mbp mark in both the position histogram and the counts matrix:
143
+ #' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
144
+ #' Note that the X-axis in both panels is in Mbp by default. To change
145
+ #' this behavior, set \code{pos.units} accordingly.
146
+ #' @param widths
147
+ #' Relative widths of the columns of \code{layout}.
148
+ #' @param heights
149
+ #' Relative heights of the rows of \code{layout}.
150
+ #' @param palette
151
+ #' Colors to be used to represent the counts matrix, sorted from no hits
152
+ #' to the maximum sequencing depth.
153
+ #' @param underlay.group
154
+ #' If TRUE, it indicates the in-group and out-group areas couloured based
155
+ #' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
156
+ #' @param peaks.col
157
+ #' If not \code{NA}, it attempts to represent peaks in the population histogram
158
+ #' in the specified color. Set to \code{NA} to avoid peak-finding.
159
+ #' @param use.peaks
160
+ #' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
161
+ #' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
162
+ #' @param id.lim
163
+ #' Limits of identities to represent.
164
+ #' @param pos.lim
165
+ #' Limits of positions to represent (in bp, regardless of \code{pos.units}).
166
+ #' @param pos.units
167
+ #' Units in which the positions should be represented (powers of 1,000
168
+ #' base pairs).
169
+ #' @param mar
170
+ #' Margins of the panels as a list, with the character representation of
171
+ #' the number of the panel as index (see \code{layout}).
172
+ #' @param pos.splines
173
+ #' Smoothing parameter for the splines in the position histogram. Zero
174
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
175
+ #' cross-validation.
176
+ #' @param id.splines
177
+ #' Smoothing parameter for the splines in the identity histogram. Zero
178
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
179
+ #' cross-validation.
180
+ #' @param in.lwd
181
+ #' Line width for the sequencing depth of in-group matches.
182
+ #' @param out.lwd
183
+ #' Line width for the sequencing depth of out-group matches.
184
+ #' @param id.lwd
185
+ #' Line width for the identity histogram.
186
+ #' @param in.col
187
+ #' Color associated to in-group matches.
188
+ #' @param out.col
189
+ #' Color associated to out-group matches.
190
+ #' @param id.col
191
+ #' Color for the identity histogram.
192
+ #' @param breaks.col
193
+ #' Color of the vertical lines indicating sequence breaks.
194
+ #' @param peaks.opts
195
+ #' Options passed to \code{\link{enve.recplot2.findPeaks}},
196
+ #' if \code{peaks.col} is not \code{NA}.
197
+ #' @param ...
198
+ #' Any other graphic parameters (currently ignored).
199
+ #'
200
+ #' @return
201
+ #' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
202
+ #' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
203
+ #' \code{layout} doesn't include 4, returns \code{NA}.
204
+ #'
205
+ #' @author Luis M. Rodriguez-R [aut, cre]
206
+ #'
207
+ #' @method plot enve.RecPlot2
208
+ #' @export
209
+
61
210
  plot.enve.RecPlot2 <- function
62
- ### Plots an `enve.RecPlot2` object.
63
- (x,
64
- ### `enve.RecPlot2` object to plot.
65
- layout=matrix(c(5,5,2,1,4,3), nrow=2),
66
- ### Matrix indicating the position of the different panels in the layout,
67
- ### where:
68
- ### 0: Empty space,
69
- ### 1: Counts matrix,
70
- ### 2: position histogram (sequencing depth),
71
- ### 3: identity histogram,
72
- ### 4: Populations histogram (histogram of sequencing depths),
73
- ### 5: Color scale for the counts matrix (vertical),
74
- ### 6: Color scale of the counts matrix (horizontal)
75
- ### Only panels indicated here will be plotted. To plot only one panel
76
- ### simply set this to the number of the panel you want to plot.
77
- panel.fun=list(),
78
- ### List of functions to be executed after drawing each panel. Use the
79
- ### indices in `layout` (as characters) as keys. Functions for indices
80
- ### missing in `layout` are ignored. For example, to add a vertical line
81
- ### at the 3Mbp mark in both the position histogram and the counts matrix:
82
- ### `list('1'=function() abline(v=3), '2'=function() abline(v=3))`.
83
- ### Note that the X-axis in both panels is in Mbp by default. To change
84
- ### this behavior, set `pos.units` accordingly.
85
- widths=c(1,7,2),
86
- ### Relative widths of the columns of `layout`.
87
- heights=c(1,2),
88
- ### Relative heights of the rows of `layout`.
89
- palette=grey((100:0)/100),
90
- ### Colors to be used to represent the counts matrix, sorted from no hits
91
- ### to the maximum sequencing depth.
92
- underlay.group=TRUE,
93
- ### If TRUE, it indicates the in-group and out-group areas couloured based
94
- ### on `in.col` and `out.col`. Requires support for semi-transparency.
95
- peaks.col='darkred',
96
- ### If not NA, it attempts to represent peaks in the population histogram
97
- ### in the specified color. Set to NA to avoid peak-finding.
98
- use.peaks,
99
- ### A list of `enve.RecPlot2.Peak` objects, as returned by
100
- ### `enve.recplot2.findPeaks`. If passed, `peaks.opts` is ignored.
101
- id.lim=range(x$id.breaks),
102
- ### Limits of identities to represent.
103
- pos.lim=range(x$pos.breaks),
104
- ### Limits of positions to represent (in bp, regardless of `pos.units`).
105
- pos.units=c('Mbp','Kbp','bp'),
106
- ### Units in which the positions should be represented (powers of 1,000
107
- ### base pairs).
108
- mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
109
- '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
110
- '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
111
- '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
112
- ### Margins of the panels as a list, with the character representation of
113
- ### the number of the panel as index (see `layout`).
114
- pos.splines=0,
115
- ### Smoothing parameter for the splines in the position histogram. Zero
116
- ### (0) for no splines. Use NULL to automatically detect by leave-one-out
117
- ### cross-validation.
118
- id.splines=1/2,
119
- ### Smoothing parameter for the splines in the identity histogram. Zero
120
- ### (0) for no splines. Use NULL to automatically detect by leave-one-out
121
- ### cross-validation.
122
- in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
123
- ### Line width for the sequencing depth of in-group matches.
124
- out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
125
- ### Line width for the sequencing depth of out-group matches.
126
- id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
127
- ### Line width for the identity histogram.
128
- in.col='darkblue',
129
- ### Color associated to in-group matches.
130
- out.col='lightblue',
131
- ### Color associated to out-group matches.
132
- id.col='black',
133
- ### Color for the identity histogram.
134
- breaks.col='#AAAAAA40',
135
- ### Color of the vertical lines indicating sequence breaks.
136
- peaks.opts=list(),
137
- ### Options passed to `enve.recplot2.findPeaks`, if `peaks.col` is not NA.
138
- ...
139
- ### Any other graphic parameters (currently ignored).
140
- ){
141
- pos.units <- match.arg(pos.units);
142
- pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
143
- pos.lim <- pos.lim/pos.factor;
144
- lmat <- layout;
145
- for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
146
-
147
- layout(lmat, widths=widths, heights=heights);
148
- ori.mar <- par('mar');
149
-
150
- # Essential vars
151
- counts <- x$counts
152
-
153
- id.ingroup <- x$id.ingroup
154
- id.counts <- x$id.counts
155
- id.breaks <- x$id.breaks
156
- id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
157
- id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
158
-
159
- pos.counts.in <- x$pos.counts.in
160
- pos.counts.out <- x$pos.counts.out
161
- pos.breaks <- x$pos.breaks/pos.factor
162
- pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
163
- pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
164
-
165
- seqdepth.in <- pos.counts.in/pos.binsize
166
- seqdepth.out <- pos.counts.out/pos.binsize
167
- seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
168
- seqdepth.out[seqdepth.out>0]))*c(1/2,2)
169
-
170
- if(underlay.group){
171
- in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
172
- list(maxColorValue=256, alpha=62)));
173
- out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
174
- list(maxColorValue=256, alpha=52)));
175
- }
176
-
177
- # [1] Counts matrix
178
- if(any(layout==1)){
179
- par(mar=mar[['1']]);
180
- plot(1, t='n', bty='l',
181
- xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
182
- xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
211
+ (x,
212
+ layout=matrix(c(5,5,2,1,4,3), nrow=2),
213
+ panel.fun=list(),
214
+ widths=c(1,7,2),
215
+ heights=c(1,2),
216
+ palette=grey((100:0)/100),
217
+ underlay.group=TRUE,
218
+ peaks.col='darkred',
219
+ use.peaks,
220
+ id.lim=range(x$id.breaks),
221
+ pos.lim=range(x$pos.breaks),
222
+ pos.units=c('Mbp','Kbp','bp'),
223
+ mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
224
+ '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
225
+ '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
226
+ '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
227
+ pos.splines=0,
228
+ id.splines=1/2,
229
+ in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
230
+ out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
231
+ id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
232
+ in.col='darkblue',
233
+ out.col='lightblue',
234
+ id.col='black',
235
+ breaks.col='#AAAAAA40',
236
+ peaks.opts=list(),
237
+ ...
238
+ ){
239
+ pos.units <- match.arg(pos.units);
240
+ pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
241
+ pos.lim <- pos.lim/pos.factor;
242
+ lmat <- layout;
243
+ for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
244
+
245
+ layout(lmat, widths=widths, heights=heights);
246
+ ori.mar <- par('mar');
247
+
248
+ # Essential vars
249
+ counts <- x$counts
250
+
251
+ id.ingroup <- x$id.ingroup
252
+ id.counts <- x$id.counts
253
+ id.breaks <- x$id.breaks
254
+ id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
255
+ id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
256
+
257
+ pos.counts.in <- x$pos.counts.in
258
+ pos.counts.out <- x$pos.counts.out
259
+ pos.breaks <- x$pos.breaks/pos.factor
260
+ pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
261
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
262
+
263
+ seqdepth.in <- pos.counts.in/pos.binsize
264
+ seqdepth.out <- pos.counts.out/pos.binsize
265
+ seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
266
+ seqdepth.out[seqdepth.out>0]))*c(1/2,2)
267
+
268
+ if(underlay.group){
269
+ in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
270
+ list(maxColorValue=256, alpha=62)));
271
+ out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
272
+ list(maxColorValue=256, alpha=52)));
273
+ }
274
+
275
+ # [1] Counts matrix
276
+ if(any(layout==1)){
277
+ par(mar=mar[['1']]);
278
+ plot(1, t='n', bty='l',
279
+ xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
280
+ xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
281
+ if(underlay.group){
282
+ rect(pos.lim[1], id.lim[1], pos.lim[2],
283
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
284
+ rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
285
+ id.lim[2], col=in.bg, border=NA);
286
+ }
287
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col);
288
+ image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
289
+ bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
290
+ length.out=1+length(palette)), add=TRUE);
291
+ if(exists('1',panel.fun)) panel.fun[['1']]();
292
+ }
293
+
294
+ # [2] Position histogram
295
+ if(any(layout==2)){
296
+ par(mar=mar[['2']]);
297
+ if(any(layout==1)){
298
+ xlab=''
299
+ xaxt='n'
300
+ }else{
301
+ xlab=paste('Position in genome (',pos.units,')',sep='')
302
+ xaxt='s'
303
+ }
304
+ plot(1,t='n', bty='l', log='y',
305
+ xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
306
+ ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
307
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col)
308
+ pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
309
+ pos.f <- rep(seqdepth.in,each=2)
310
+ lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
311
+ lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
312
+ if(is.null(pos.splines) || pos.splines > 0){
313
+ pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
314
+ spar=pos.splines)
315
+ lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
316
+ }
317
+ if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
318
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
319
+ seqdepth.lim[1]*3/2, col=out.col, border=NA);
320
+ if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
321
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
322
+ seqdepth.lim[1]*3/2, col=in.col, border=NA);
323
+ if(exists('2',panel.fun)) panel.fun[['2']]();
324
+ }
325
+
326
+ # [3] Identity histogram
327
+ if(any(layout==3)){
328
+ par(mar=mar[['3']]);
329
+ if(any(layout==1)){
330
+ ylab=''
331
+ yaxt='n'
332
+ }else{
333
+ ylab=x$id.metric
334
+ yaxt='s'
335
+ }
336
+ if(sum(id.counts>0) >= 4){
337
+ id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
338
+ plot(1,t='n', bty='l', log='x',
339
+ xlim=id.counts.range, xlab='bps per bin', xaxs='i',
340
+ ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
183
341
  if(underlay.group){
184
- rect(pos.lim[1], id.lim[1], pos.lim[2],
185
- min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
186
- rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
187
- id.lim[2], col=in.bg, border=NA);
188
- }
189
- abline(v=x$seq.breaks/pos.factor, col=breaks.col);
190
- image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
191
- bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
192
- length.out=1+length(palette)), add=TRUE);
193
- if(exists('1',panel.fun)) panel.fun[['1']]();
194
- }
195
-
196
- # [2] Position histogram
197
- if(any(layout==2)){
198
- par(mar=mar[['2']]);
199
- if(any(layout==1)){
200
- xlab=''
201
- xaxt='n'
202
- }else{
203
- xlab=paste('Position in genome (',pos.units,')',sep='')
204
- xaxt='s'
205
- }
206
- plot(1,t='n', bty='l', log='y',
207
- xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
208
- ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
209
- abline(v=x$seq.breaks/pos.factor, col=breaks.col)
210
- pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
211
- pos.f <- rep(seqdepth.in,each=2)
212
- lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
213
- lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
214
- if(is.null(pos.splines) || pos.splines > 0){
215
- pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
216
- spar=pos.splines)
217
- lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
342
+ rect(id.counts.range[1], id.lim[1], id.counts.range[2],
343
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
344
+ rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
345
+ id.counts.range[2], id.lim[2], col=in.bg, border=NA);
218
346
  }
219
- if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
220
- seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
221
- seqdepth.lim[1]*3/2, col=out.col, border=NA);
222
- if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
223
- seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
224
- seqdepth.lim[1]*3/2, col=in.col, border=NA);
225
- if(exists('2',panel.fun)) panel.fun[['2']]();
226
- }
227
-
228
- # [3] Identity histogram
229
- if(any(layout==3)){
230
- par(mar=mar[['3']]);
231
- if(any(layout==1)){
232
- ylab=''
233
- yaxt='n'
234
- }else{
235
- ylab=x$id.metric
236
- yaxt='s'
237
- }
238
- if(sum(id.counts>0) >= 4){
239
- id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
240
- plot(1,t='n', bty='l', log='x',
241
- xlim=id.counts.range, xlab='bps per bin', xaxs='i',
242
- ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
243
- if(underlay.group){
244
- rect(id.counts.range[1], id.lim[1], id.counts.range[2],
245
- min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
246
- rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
247
- id.counts.range[2], id.lim[2], col=in.bg, border=NA);
248
- }
249
- id.f <- rep(id.counts,each=2)
250
- id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
251
- lines(id.f, id.x, lwd=id.lwd, col=id.col);
252
- if(is.null(id.splines) || id.splines > 0){
253
- id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
254
- spar=id.splines)
255
- lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
256
- }
257
- }else{
258
- plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
259
- text(1,1,labels='Insufficient data', srt=90)
347
+ id.f <- rep(id.counts,each=2)
348
+ id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
349
+ lines(id.f, id.x, lwd=id.lwd, col=id.col);
350
+ if(is.null(id.splines) || id.splines > 0){
351
+ id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
352
+ spar=id.splines)
353
+ lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
260
354
  }
261
- if(exists('3',panel.fun)) panel.fun[['3']]();
262
- }
263
-
264
- # [4] Populations histogram
265
- peaks <- NA;
266
- if(any(layout==4)){
267
- par(mar=mar[['4']]);
268
- if(any(layout==2)){
269
- ylab=''
270
- yaxt='n'
355
+ }else{
356
+ plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
357
+ text(1,1,labels='Insufficient data', srt=90)
358
+ }
359
+ if(exists('3',panel.fun)) panel.fun[['3']]();
360
+ }
361
+
362
+ # [4] Populations histogram
363
+ peaks <- NA;
364
+ if(any(layout==4)){
365
+ par(mar=mar[['4']]);
366
+ if(any(layout==2)){
367
+ ylab=''
368
+ yaxt='n'
369
+ }else{
370
+ ylab='Sequencing depth (X)'
371
+ yaxt='s'
372
+ }
373
+ h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
374
+ length.out=200);
375
+ h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
376
+ h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
377
+ plot(1, t='n', log='y',
378
+ xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
379
+ xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
380
+ yaxt=yaxt)
381
+ y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
382
+ y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
383
+ lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
384
+ y.tmp.out, col=out.col)
385
+ polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
386
+ y.tmp.in, border=NA, col=in.col)
387
+ if(!is.na(peaks.col)){
388
+ o <- peaks.opts; o$x = x;
389
+ if(missing(use.peaks)){
390
+ peaks <- do.call(enve.recplot2.findPeaks, o)
271
391
  }else{
272
- ylab='Sequencing depth (X)'
273
- yaxt='s'
392
+ peaks <- use.peaks
274
393
  }
275
- h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
276
- length.out=200);
277
- h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
278
- h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
279
- plot(1, t='n', log='y',
280
- xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
281
- xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
282
- yaxt=yaxt)
283
- y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
284
- y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
285
- lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
286
- y.tmp.out, col=out.col)
287
- polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
288
- y.tmp.in, border=NA, col=in.col)
289
- if(!is.na(peaks.col)){
290
- o <- peaks.opts; o$x = x;
291
- if(missing(use.peaks)){
292
- peaks <- do.call(enve.recplot2.findPeaks, o)
293
- }else{
294
- peaks <- use.peaks
295
- }
296
- h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
297
- if(!is.null(peaks) & length(peaks)>0){
298
- pf <- h.mids*0;
299
- for(i in 1:length(peaks)){
300
- cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
301
- lines(cnt, h.mids, col='red');
302
- pf <- pf+cnt;
303
- axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
304
- }
305
- lines(pf, h.mids, col='red',lwd=1.5);
306
- dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
307
- frx <- signif(100*as.numeric(
308
- lapply(peaks,
309
- function(x) ifelse(length(x$values)==0, x$n.hat,
310
- length(x$values))/x$n.total)), 2)
311
- if(peaks[[1]]$err.res < 0){
312
- err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
313
- }else{
314
- err <- paste(', err:',
315
- signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
316
- }
317
- legend('topright', bty='n', cex=1/2,
318
- legend=paste(letters[1:length(peaks)],'. ',
319
- dpt,'X (', frx, '%', err, ')', sep=''))
320
- }
394
+ h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
395
+ if(!is.null(peaks) & length(peaks)>0){
396
+ pf <- h.mids*0;
397
+ for(i in 1:length(peaks)){
398
+ cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
399
+ lines(cnt, h.mids, col='red');
400
+ pf <- pf+cnt;
401
+ axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
402
+ }
403
+ lines(pf, h.mids, col='red',lwd=1.5);
404
+ dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
405
+ frx <- signif(100*as.numeric(
406
+ lapply(peaks,
407
+ function(x) ifelse(length(x$values)==0, x$n.hat,
408
+ length(x$values))/x$n.total)), 2)
409
+ if(peaks[[1]]$err.res < 0){
410
+ err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
411
+ }else{
412
+ err <- paste(', err:',
413
+ signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
414
+ }
415
+ legend('topright', bty='n', cex=1/2,
416
+ legend=paste(letters[1:length(peaks)],'. ',
417
+ dpt,'X (', frx, '%', err, ')', sep=''))
321
418
  }
322
- if(exists('4',panel.fun)) panel.fun[['4']]();
323
- }
324
-
325
- # [5] Color scale of the counts matrix (vertical)
326
- count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
327
- length.out=1+length(palette))
328
- if(any(layout==5)){
329
- par(mar=mar[['5']]);
330
- plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
331
- ylim=range(count.bins), yaxs='i', ylab='')
332
- rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
333
- border=NA)
334
- if(exists('5',panel.fun)) panel.fun[['5']]();
335
- }
336
-
337
- # [6] Color scale of the coutnts matrix (horizontal)
338
- if(any(layout==6)){
339
- par(mar=mar[['6']]);
340
- plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
341
- xlim=range(count.bins), xaxs='i',xlab='');
342
- rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
343
- border=NA);
344
- if(exists('6',panel.fun)) panel.fun[['6']]();
345
- }
346
-
347
- par(mar=ori.mar);
348
- return(peaks);
349
- ### Returns a list of `enve.RecPlot2.Peak` objects (see
350
- ### `enve.recplot2.findPeaks`). If `peaks.col`=NA or `layout` doesn't include
351
- ### 4, returns NA.
419
+ }
420
+ if(exists('4',panel.fun)) panel.fun[['4']]();
421
+ }
422
+
423
+ # [5] Color scale of the counts matrix (vertical)
424
+ count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
425
+ length.out=1+length(palette))
426
+ if(any(layout==5)){
427
+ par(mar=mar[['5']]);
428
+ plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
429
+ ylim=range(count.bins), yaxs='i', ylab='')
430
+ rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
431
+ border=NA)
432
+ if(exists('5',panel.fun)) panel.fun[['5']]();
433
+ }
434
+
435
+ # [6] Color scale of the coutnts matrix (horizontal)
436
+ if(any(layout==6)){
437
+ par(mar=mar[['6']]);
438
+ plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
439
+ xlim=range(count.bins), xaxs='i',xlab='');
440
+ rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
441
+ border=NA);
442
+ if(exists('6',panel.fun)) panel.fun[['6']]();
443
+ }
444
+
445
+ par(mar=ori.mar);
446
+ return(peaks);
352
447
  }
353
448
 
354
449
  #==============> Define core functions
450
+
451
+ #' Enveomics: Recruitment Plot (2)
452
+ #'
453
+ #' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
454
+ #' been previously executed.
455
+ #'
456
+ #' @param prefix
457
+ #' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
458
+ #' least the files .rec and .lim must exist with this prefix.
459
+ #' @param plot
460
+ #' Should the object be plotted?
461
+ #' @param pos.breaks
462
+ #' Breaks in the positions histogram. It can also be a vector of break
463
+ #' points, and values outside the range are ignored. If zero (0), it
464
+ #' uses the sequence breaks as defined in the .lim file, which means
465
+ #' one bin per contig (or gene, if the mapping is agains genes). Ignored
466
+ #' if `pos.breaks.tsv` is passed.
467
+ #' @param pos.breaks.tsv
468
+ #' Path to a list of (absolute) coordinates to use as position breaks.
469
+ #' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
470
+ #' must contain at least one column: coordinates of the break positions of
471
+ #' each position bin. If it has a second column, this is used as the name
472
+ #' of the position bin that ends at the given coordinate (the first row is
473
+ #' ignored). Any additional columns are currently ignored. If \code{NA},
474
+ #' position bins are determined by \code{pos.breaks}.
475
+ #' @param id.breaks
476
+ #' Breaks in the identity histogram. It can also be a vector of break
477
+ #' points, and values outside the range are ignored.
478
+ #' @param id.free.range
479
+ #' Indicates that the range should be freely set from the observed
480
+ #' values. Otherwise, 70-100\% is included in the identity histogram
481
+ #' (default).
482
+ #' @param id.metric
483
+ #' Metric of identity to be used (Y-axis). Corrected identity is only
484
+ #' supported if the original BLAST file included sequence lengths.
485
+ #' @param id.summary
486
+ #' Function summarizing the identity bins. Other recommended options
487
+ #' include: \code{median} to estimate the median instead of total bins, and
488
+ #' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
489
+ #' @param id.cutoff
490
+ #' Cutoff of identity metric above which the hits are considered
491
+ #' \code{in-group}. The 95\% identity corresponds to the expectation of
492
+ #' ANI<95\% within species.
493
+ #' @param threads
494
+ #' Number of threads to use.
495
+ #' @param verbose
496
+ #' Indicates if the function should report the advance.
497
+ #' @param ...
498
+ #' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
499
+ #'
500
+ #' @return Returns an object of class \code{\link{enve.RecPlot2}}.
501
+ #'
502
+ #' @author Luis M. Rodriguez-R [aut, cre]
503
+ #' @author Kenji Gerhardt [aut]
504
+ #'
505
+ #' @export
506
+
355
507
  enve.recplot2 <- function(
356
- ### Produces recruitment plots provided that BlastTab.catsbj.pl has
357
- ### been previously executed.
358
- prefix,
359
- ### Path to the prefix of the BlastTab.catsbj.pl output files. At
360
- ### least the files .rec and .lim must exist with this prefix.
361
- plot=TRUE,
362
- ### Should the object be plotted?
363
- pos.breaks=1e3,
364
- ### Breaks in the positions histogram. It can also be a vector of break
365
- ### points, and values outside the range are ignored. If zero (0), it
366
- ### uses the sequence breaks as defined in the .lim file, which means
367
- ### one bin per contig (or gene, if the mapping is agains genes). Ignored
368
- ### if `pos.breaks.tsv` is passed.
369
- pos.breaks.tsv=NA,
370
- ### Path to a list of (absolute) coordinates to use as position breaks.
371
- ### This tab-delimited file can be produced by `GFF.catsbj.pl`, and it
372
- ### must contain at least one column: coordinates of the break positions of
373
- ### each position bin. If it has a second column, this is used as the name
374
- ### of the position bin that ends at the given coordinate (the first row is
375
- ### ignored). Any additional columns are currently ignored. If NA,
376
- ### position bins are determined by `pos.breaks`.
377
- id.breaks=300,
378
- ### Breaks in the identity histogram. It can also be a vector of break
379
- ### points, and values outside the range are ignored.
380
- id.free.range=FALSE,
381
- ### Indicates that the range should be freely set from the observed
382
- ### values. Otherwise, 70-100% is included in the identity histogram
383
- ### (default).
384
- id.metric=c('identity', 'corrected identity', 'bit score'),
385
- ### Metric of identity to be used (Y-axis). Corrected identity is only
386
- ### supported if the original BLAST file included sequence lengths.
387
- id.summary=sum,
388
- ### Function summarizing the identity bins. Other recommended options
389
- ### include: `median` to estimate the median instead of total bins, and
390
- ### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
391
- id.cutoff=95,
392
- ### Cutoff of identity metric above which the hits are considered
393
- ### 'in-group'. The 95% identity corresponds to the expectation of
394
- ### ANI<95% within species.
395
- threads=2,
396
- ### Number of threads to use.
397
- verbose=TRUE,
398
- ### Indicates if the function should report the advance.
399
- ...
400
- ### Any additional parameters supported by `plot.enve.RecPlot2`.
401
- ){
508
+ prefix,
509
+ plot = TRUE,
510
+ pos.breaks = 1e3,
511
+ pos.breaks.tsv = NA,
512
+ id.breaks = 60,
513
+ id.free.range = FALSE,
514
+ id.metric = c('identity', 'corrected identity', 'bit score'),
515
+ id.summary = sum,
516
+ id.cutoff = 95,
517
+ threads = 2,
518
+ verbose = TRUE,
519
+ ...
520
+ ){
402
521
  # Settings
403
522
  id.metric <- match.arg(id.metric);
404
523
 
405
524
  #Read files
406
- if(verbose) cat("Reading files.\n")
407
- rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
408
- quote="");
409
- lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
410
- quote="", as.is=TRUE);
525
+ if (verbose) cat("Reading files.\n")
526
+ rec <- read.table(paste(prefix, ".rec", sep = ""),
527
+ sep = "\t", comment.char = "", quote = "");
528
+ lim <- read.table(paste(prefix, ".lim", sep = ""),
529
+ sep = "\t", comment.char = "", quote = "", as.is = TRUE);
411
530
 
412
531
  # Build matrix
413
- if(verbose) cat("Building counts matrix.\n")
414
- if(id.metric=="corrected identity" & ncol(rec)<6){
532
+ if (verbose) cat("Building counts matrix.\n")
533
+ if (id.metric == "corrected identity" & ncol(rec) < 6) {
415
534
  stop("Requesting corr. identity, but .rec file doesn't have 6th column")
416
535
  }
417
- rec.idcol <- ifelse(id.metric=="identity", 3,
418
- ifelse(id.metric=="corrected identity", 6, 4))
536
+ rec.idcol <- ifelse(id.metric == "identity", 3,
537
+ ifelse(id.metric == "corrected identity", 6, 4))
419
538
  pos.names <- as.character(NULL)
420
- if(!is.na(pos.breaks.tsv)){
421
- tmp <- read.table(pos.breaks.tsv, sep='\t', header=FALSE, as.is=TRUE)
422
- pos.breaks <- as.numeric(tmp[,1])
423
- if(ncol(tmp)>1) pos.names <- as.character(tmp[-1,2])
424
- }else if(length(pos.breaks)==1){
425
- if(pos.breaks>0){
426
- pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
427
- }else{
428
- pos.breaks <- c(lim[1,2], lim[,3])
429
- pos.names <- lim[,1]
539
+ if (!is.na(pos.breaks.tsv)){
540
+ tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
541
+ pos.breaks <- as.numeric(tmp[, 1])
542
+ if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
543
+ } else if (length(pos.breaks) == 1) {
544
+ if (pos.breaks > 0){
545
+ pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
546
+ } else {
547
+ pos.breaks <- c(lim[1, 2], lim[, 3])
548
+ pos.names <- lim[, 1]
430
549
  }
431
550
  }
432
- if(length(id.breaks)==1){
433
- id.range.v <- rec[,rec.idcol]
434
- if(!id.free.range) id.range.v <- c(id.range.v,70,100)
551
+ if (length(id.breaks) == 1) {
552
+ id.range.v <- rec[, rec.idcol]
553
+ if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
435
554
  id.range.v <- range(id.range.v)
436
- id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
555
+ id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
437
556
  }
438
557
 
439
558
  # Run in parallel
440
- if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
441
- cl <- makeCluster(threads)
442
- rec.l <- list()
443
- thl <- ceiling(nrow(rec)/threads)
444
- for(i in 0:(threads-1)){
445
- rec.l[[i+1]] <- list(
446
- rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
447
- verbose=ifelse(i==0, verbose, FALSE))
559
+ # If they already set threads to 1 manually, there's no point in launching
560
+ # clusters, it's just slower. Ditto for small files.
561
+ if (nrow(rec) < 75000 | threads == 1) {
562
+ # Coerces rec into a form that __counts is happy about
563
+ rec.l <- list()
564
+ rec.l[[1]] <- list(rec = rec, verbose = FALSE)
565
+
566
+ # No need to make a temporary variable, there's only one return for sure
567
+ # and it's not a list because it isn't coming back from an apply
568
+ counts <- enve.recplot2.__counts(
569
+ rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
570
+ rec.idcol = rec.idcol)
571
+ } else {
572
+ cl <- makeCluster(threads)
573
+ rec.l <- list()
574
+ thl <- ceiling(nrow(rec)/threads)
575
+ for (i in 0:(threads - 1)) {
576
+ rec.l[[i + 1]] <- list(
577
+ rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
578
+ verbose = ifelse(i == 0, verbose, FALSE))
579
+ }
580
+ counts.l <- clusterApply(
581
+ cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
582
+ id.breaks = id.breaks, rec.idcol = rec.idcol)
583
+ stopCluster(cl) # No spooky ghost clusters
584
+
585
+ counts <- counts.l[[1]]
586
+ for (i in 2:threads) counts <- counts + counts.l[[i]]
448
587
  }
449
- counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
450
- pos.breaks=pos.breaks, id.breaks=id.breaks,
451
- rec.idcol=rec.idcol)
452
- counts <- counts.l[[1]]
453
- if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
454
- stopCluster(cl)
455
588
 
456
589
  # Estimate 1D histograms
457
- if(verbose) cat("Building histograms.\n")
458
- id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
590
+ if (verbose) cat("Building histograms.\n")
591
+ id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
459
592
  id.ingroup <- (id.mids > id.cutoff);
460
593
  id.counts <- apply(counts, 2, id.summary);
461
- pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
462
- pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
594
+ pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
595
+ pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
463
596
 
464
597
  # Plot and return
465
598
  recplot <- new('enve.RecPlot2',
466
- counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
467
- pos.counts.out=pos.counts.out,
468
- id.breaks=id.breaks, pos.breaks=pos.breaks, pos.names=pos.names,
469
- seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
470
- id.ingroup=id.ingroup,id.metric=id.metric,
471
- call=match.call());
472
- if(plot){
473
- if(verbose) cat("Plotting.\n")
599
+ counts = counts, id.counts = id.counts,
600
+ pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
601
+ id.breaks = id.breaks, pos.breaks = pos.breaks,
602
+ pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
603
+ seq.names = lim[, 1], id.ingroup = id.ingroup,
604
+ id.metric = id.metric, call = match.call());
605
+ if (plot) {
606
+ if (verbose) cat("Plotting.\n")
474
607
  peaks <- plot(recplot, ...);
475
608
  attr(recplot, "peaks") <- peaks
476
609
  }
477
610
  return(recplot);
478
- ### Returns an object of class `enve.RecPlot2`.
479
611
  }
480
612
 
613
+ #' Enveomics: Recruitment Plot (2) Peak Finder
614
+ #'
615
+ #' Identifies peaks in the population histogram potentially indicating
616
+ #' sub-population mixtures.
617
+ #'
618
+ #' @param x
619
+ #' An \code{\link{enve.RecPlot2}} object.
620
+ #' @param method
621
+ #' Peak-finder method. This should be one of:
622
+ #' \itemize{
623
+ #' \item \strong{emauto}
624
+ #' (Expectation-Maximization with auto-selection of components)
625
+ #' \item \strong{em}
626
+ #' (Expectation-Maximization)
627
+ #' \item \strong{mower}
628
+ #' (Custom distribution-mowing method)
629
+ #' }
630
+ #' @param ...
631
+ #' Any additional parameters supported by
632
+ #' \code{\link{enve.recplot2.findPeaks}}.
633
+ #'
634
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
635
+ #'
636
+ #' @author Luis M. Rodriguez-R [aut, cre]
637
+ #'
638
+ #' export
639
+
481
640
  enve.recplot2.findPeaks <- function(
482
- ### Identifies peaks in the population histogram potentially indicating
483
- ### sub-population mixtures
484
- x,
485
- ### An `enve.RecPlot2` object.
486
- method="emauto",
487
- ### Peak-finder method. This should be one of:
488
- ### "emauto" (Expectation-Maximization with auto-selection of components),
489
- ### "em" (Expectation-Maximization),
490
- ### "mower" (Custom distribution-mowing method).
491
- ...
492
- ### Any additional parameters supported by
493
- ### `enve.recplot2.findPeaks.<method>`.
494
- ){
641
+ x,
642
+ method="emauto",
643
+ ...
644
+ ){
495
645
  if(method == "emauto"){
496
646
  peaks <- enve.recplot2.findPeaks.emauto(x, ...)
497
647
  }else if(method == "em"){
@@ -502,29 +652,44 @@ enve.recplot2.findPeaks <- function(
502
652
  stop("Invalid peak-finder method ", method)
503
653
  }
504
654
  return(peaks)
505
- ### Returns a list of `enve.RecPlot2.Peak` objects.
506
655
  }
507
656
 
657
+ #' Enveomics: Recruitment Plot (2) Emauto Peak Finder
658
+ #'
659
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
660
+ #' Model Expectation Maximization (GMM-EM) method with number of components
661
+ #' automatically detected.
662
+ #'
663
+ #' @param x
664
+ #' An \code{\link{enve.RecPlot2}} object.
665
+ #' @param components
666
+ #' A vector of number of components to evaluate.
667
+ #' @param criterion
668
+ #' Criterion to use for components selection. Must be one of:
669
+ #' \code{aic} (Akaike Information Criterion),
670
+ #' \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).
671
+ #' @param merge.tol
672
+ #' When attempting to merge peaks with very similar sequencing depth, use
673
+ #' this number of significant digits (in log-scale).
674
+ #' @param verbose
675
+ #' Display (mostly debugging) information.
676
+ #' @param ...
677
+ #' Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.
678
+ #'
679
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
680
+ #'
681
+ #' @author Luis M. Rodriguez-R [aut, cre]
682
+ #'
683
+ #' @export
684
+
508
685
  enve.recplot2.findPeaks.emauto <- function(
509
- ### Identifies peaks in the population histogram using a Gaussian Mixture
510
- ### Model Expectation Maximization (GMM-EM) method with number of components
511
- ### automatically detected.
512
- x,
513
- ### An `enve.RecPlot2` object.
514
- components=seq(1,10),
515
- ### A vector of number of components to evaluate.
516
- criterion='aic',
517
- ### Criterion to use for components selection. Must be one of:
518
- ### 'aic' (Akaike Information Criterion),
519
- ### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
520
- merge.tol=2L,
521
- ### When attempting to merge peaks with very similar sequencing depth, use
522
- ### this number of significant digits (in log-scale).
523
- verbose=FALSE,
524
- ### Display (mostly debugging) information.
525
- ...
526
- ### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
527
- ){
686
+ x,
687
+ components=seq(1,10),
688
+ criterion='aic',
689
+ merge.tol=2L,
690
+ verbose=FALSE,
691
+ ...
692
+ ){
528
693
  best <- list(crit=0, pstore=list())
529
694
  if(criterion == 'aic'){
530
695
  do_crit <- function(ll, k, n) 2*k - 2*ll
@@ -536,55 +701,72 @@ enve.recplot2.findPeaks.emauto <- function(
536
701
  for(comp in components){
537
702
  if(verbose) cat('Testing:',comp,'\n')
538
703
  best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
539
- verbose, ...)
704
+ verbose, ...)
540
705
  }
541
706
  if(length(best[['peaks']])==0) return(list())
542
707
 
543
708
  seqdepths.r <- signif(log(sapply(best[['peaks']],
544
- function(x) x$seq.depth)), merge.tol)
709
+ function(x) x$seq.depth)), merge.tol)
545
710
  distinct <- length(unique(seqdepths.r))
546
711
  if(distinct < length(best[['peaks']])){
547
712
  if(verbose) cat('Attempting merge to', distinct, 'components\n')
548
713
  init <- apply(sapply(best[['peaks']],
549
- function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
714
+ function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
550
715
  init <- init[!duplicated(seqdepths.r),]
551
716
  init <- list(mu=init[,'mean'], sd=init[,'sd'],
552
- alpha=init[,'alpha']/sum(init[,'alpha']))
717
+ alpha=init[,'alpha']/sum(init[,'alpha']))
553
718
  best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
554
- verbose, ...)
719
+ verbose, ...)
555
720
  }
556
721
  return(best[['peaks']])
557
- ### Returns a list of `enve.RecPlot2.Peak` objects.
558
722
  }
559
723
 
724
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder
725
+ #'
726
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
727
+ #' Model Expectation Maximization (GMM-EM) method.
728
+ #'
729
+ #' @param x
730
+ #' An \code{\link{enve.RecPlot2}} object.
731
+ #' @param max.iter
732
+ #' Maximum number of EM iterations.
733
+ #' @param ll.diff.res
734
+ #' Maximum Log-Likelihood difference to be considered as convergent.
735
+ #' @param components
736
+ #' Number of distributions assumed in the mixture.
737
+ #' @param rm.top
738
+ #' Top-values to remove before finding peaks, as a quantile probability.
739
+ #' This step is useful to remove highly conserved regions, but can be
740
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
741
+ #' \strong{after} removing zero-coverage windows.
742
+ #' @param verbose
743
+ #' Display (mostly debugging) information.
744
+ #' @param init
745
+ #' Initialization parameters. By default, these are derived from k-means
746
+ #' clustering. A named list with vectors for \code{mu}, \code{sd}, and
747
+ #' \code{alpha}, each of length \code{components}.
748
+ #' @param log
749
+ #' Logical value indicating if the estimations should be performed in
750
+ #' natural logarithm units. Do not change unless you know what you're
751
+ #' doing.
752
+ #'
753
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
754
+ #'
755
+ #' @author Luis M. Rodriguez-R [aut, cre]
756
+ #'
757
+ #' @export
758
+
560
759
  enve.recplot2.findPeaks.em <- function(
561
- ### Identifies peaks in the population histogram using a Gaussian Mixture
562
- ### Model Expectation Maximization (GMM-EM) method.
563
- x,
564
- ### An `enve.RecPlot2` object.
565
- max.iter=1000,
566
- ### Maximum number of EM iterations.
567
- ll.diff.res=1e-8,
568
- ### Maximum Log-Likelihood difference to be considered as convergent.
569
- components=2,
570
- ### Number of distributions assumed in the mixture.
571
- rm.top=0.05,
572
- ### Top-values to remove before finding peaks, as a quantile probability.
573
- ### This step is useful to remove highly conserved regions, but can be
574
- ### turned off by setting rm.top=0. The quantile is determined *after*
575
- ### removing zero-coverage windows.
576
- verbose=FALSE,
577
- ### Display (mostly debugging) information.
578
- init,
579
- ### Initialization parameters. By default, these are derived from k-means
580
- ### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
581
- ### of length `components`.
582
- log=TRUE
583
- ### Logical value indicating if the estimations should be performed in
584
- ### natural logarithm units. Do not change unless you know what you're
585
- ### doing.
586
- ){
587
-
760
+ x,
761
+ max.iter=1000,
762
+ ll.diff.res=1e-8,
763
+ components=2,
764
+ rm.top=0.05,
765
+ verbose=FALSE,
766
+ init,
767
+ log=TRUE
768
+ ){
769
+
588
770
  # Essential vars
589
771
  pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
590
772
  lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
@@ -603,7 +785,7 @@ enve.recplot2.findPeaks.em <- function(
603
785
  m.step <- init
604
786
  ll <- c()
605
787
  cur.ll <- -Inf
606
-
788
+
607
789
  for(i in 1:max.iter){
608
790
  # 2/3. EM
609
791
  e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
@@ -621,224 +803,297 @@ enve.recplot2.findPeaks.em <- function(
621
803
  for(i in 1:components){
622
804
  n.hat <- m.step[['alpha']][i]*length(lsd1)
623
805
  peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
624
- values.res=0, mode=m.step[['mu']][i],
625
- param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
626
- n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
627
- merge.logdist=as.numeric(), log=log,
628
- seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
806
+ values.res=0, mode=m.step[['mu']][i],
807
+ param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
808
+ n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
809
+ merge.logdist=as.numeric(), log=log,
810
+ seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
629
811
  }
630
812
  return(peaks)
631
- ### Returns a list of `enve.RecPlot2.Peak` objects.
632
813
  }
633
814
 
815
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder
816
+ #'
817
+ #' Identifies peaks in the population histogram potentially indicating
818
+ #' sub-population mixtures, using a custom distribution-mowing method.
819
+ #'
820
+ #' @param x
821
+ #' An \code{\link{enve.RecPlot2}} object.
822
+ #' @param min.points
823
+ #' Minimum number of points in the quantile-estimation-range
824
+ #' \code{(quant.est)} to estimate a peak.
825
+ #' @param quant.est
826
+ #' Range of quantiles to be used in the estimation of a peak's
827
+ #' parameters.
828
+ #' @param mlv.opts
829
+ #' Ignored. For backwards compatibility.
830
+ #' @param fitdist.opts.sn
831
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
832
+ #' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
833
+ #' ammended with \code{xi=estimated} mode for each peak.
834
+ #' @param fitdist.opts.norm
835
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
836
+ #' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
837
+ #' ammended with \code{mean=estimated} mode for each peak.
838
+ #' @param rm.top
839
+ #' Top-values to remove before finding peaks, as a quantile probability.
840
+ #' This step is useful to remove highly conserved regions, but can be
841
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
842
+ #' \strong{after} removing zero-coverage windows.
843
+ #' @param with.skewness
844
+ #' Allow skewness correction of the peaks. Typically, the
845
+ #' sequencing-depth distribution for a single peak is left-skewed, due
846
+ #' partly (but not exclusively) to fragmentation and mapping sensitivity.
847
+ #' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
848
+ #' alternative solution for the first problem (fragmentation) called
849
+ #' "tail distribution".
850
+ #' @param optim.rounds
851
+ #' Maximum rounds of peak optimization.
852
+ #' @param optim.epsilon
853
+ #' Trace change at which optimization stops (unless \code{optim.rounds} is
854
+ #' reached first). The trace change is estimated as the sum of square
855
+ #' differences between parameters in one round and those from two rounds
856
+ #' earlier (to avoid infinite loops from approximation).
857
+ #' @param merge.logdist
858
+ #' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
859
+ #' to attempt merging. The default of ~0.22 corresponds to a maximum
860
+ #' difference of 25\%.
861
+ #' @param verbose
862
+ #' Display (mostly debugging) information.
863
+ #' @param log
864
+ #' Logical value indicating if the estimations should be performed in
865
+ #' natural logarithm units. Do not change unless you know what you're
866
+ #' doing.
867
+ #'
868
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
869
+ #'
870
+ #' @author Luis M. Rodriguez-R [aut, cre]
871
+ #'
872
+ #' @export
873
+
634
874
  enve.recplot2.findPeaks.mower <- function(
635
- ### Identifies peaks in the population histogram potentially indicating
636
- ### sub-population mixtures, using a custom distribution-mowing method.
637
- x,
638
- ### An `enve.RecPlot2` object.
639
- min.points=10,
640
- ### Minimum number of points in the quantile-estimation-range
641
- ### (`quant.est`) to estimate a peak.
642
- quant.est=c(0.002, 0.998),
643
- ### Range of quantiles to be used in the estimation of a peak's
644
- ### parameters.
645
- mlv.opts=list(method='parzen'),
646
- ### Ignored. For backwards compatibility.
647
- fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
648
- start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
649
- ### Options passed to `fitdist` to estimate the standard deviation if
650
- ### with.skewness=TRUE. Note that the `start` parameter will be ammended
651
- ### with xi=estimated mode for each peak.
652
- fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
653
- start=list(sd=1), lower=c(0, -Inf)),
654
- ### Options passed to `fitdist` to estimate the standard deviation if
655
- ### with.skewness=FALSE. Note that the `start` parameter will be ammended
656
- ### with mean=estimated mode for each peak.
657
- rm.top=0.05,
658
- ### Top-values to remove before finding peaks, as a quantile probability.
659
- ### This step is useful to remove highly conserved regions, but can be
660
- ### turned off by setting rm.top=0. The quantile is determined *after*
661
- ### removing zero-coverage windows.
662
- with.skewness=TRUE,
663
- ### Allow skewness correction of the peaks. Typically, the
664
- ### sequencing-depth distribution for a single peak is left-skewed, due
665
- ### partly (but not exclusively) to fragmentation and mapping sensitivity.
666
- ### See Lindner et al 2013, Bioinformatics 29(10):1260-7 for an
667
- ### alternative solution for the first problem (fragmentation) called
668
- ### "tail distribution".
669
- optim.rounds=200,
670
- ### Maximum rounds of peak optimization.
671
- optim.epsilon=1e-4,
672
- ### Trace change at which optimization stops (unless `optim.rounds` is
673
- ### reached first). The trace change is estimated as the sum of square
674
- ### differences between parameters in one round and those from two rounds
675
- ### earlier (to avoid infinite loops from approximation).
676
- merge.logdist=log(1.75),
677
- ### Maximum value of |log-ratio| between centrality parameters in peaks to
678
- ### attempt merging. The default of ~0.22 corresponds to a maximum
679
- ### difference of 25%.
680
- verbose=FALSE,
681
- ### Display (mostly debugging) information.
682
- log=TRUE
683
- ### Logical value indicating if the estimations should be performed in
684
- ### natural logarithm units. Do not change unless you know what you're
685
- ### doing.
686
- ){
687
-
688
- # Essential vars
689
- pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
690
- seqdepth.in <- x$pos.counts.in/pos.binsize;
691
- lsd1 <- seqdepth.in[seqdepth.in>0];
692
- lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
693
- if(log) lsd1 <- log(lsd1)
694
- if(with.skewness){
695
- fitdist.opts <- fitdist.opts.sn
696
- }else{
697
- fitdist.opts <- fitdist.opts.norm
698
- }
699
- peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
700
- mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
701
- optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
702
- n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
703
-
704
- # Find seed peaks
705
- if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
706
- peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
707
-
708
- # Merge overlapping peaks
709
- if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
710
- merged <- (length(peaks)>1)
711
- while(merged){
712
- merged <- FALSE
713
- ignore <- c()
714
- peaks2 <- list();
715
- for(i in 1:length(peaks)){
716
- if(i %in% ignore) next
717
- p <- peaks[[ i ]]
718
- j <- enve.recplot2.__whichClosestPeak(p, peaks)
719
- p2 <- peaks[[ j ]]
720
- dst.a <- p$param.hat[[ length(p$param.hat) ]]
721
- dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
722
- if( abs(log(dst.a/dst.b)) < merge.logdist ){
723
- if(verbose) cat('==> Attempting a merge at',
724
- p$param.hat[[ length(p$param.hat) ]],'&',
725
- p2$param.hat[[ length(p2$param.hat) ]],'X\n');
726
- peaks.opts$lsd1 <- c(p$values, p2$values)
727
- p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
728
- if(length(p.new)==1){
729
- peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
730
- ignore <- c(ignore, j)
731
- merged <- TRUE
732
- }
733
- }
734
- if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
875
+ x,
876
+ min.points=10,
877
+ quant.est=c(0.002, 0.998),
878
+ mlv.opts=list(method='parzen'),
879
+ fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
880
+ start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
881
+ fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
882
+ start=list(sd=1), lower=c(0, -Inf)),
883
+ rm.top=0.05,
884
+ with.skewness=TRUE,
885
+ optim.rounds=200,
886
+ optim.epsilon=1e-4,
887
+ merge.logdist=log(1.75),
888
+ verbose=FALSE,
889
+ log=TRUE
890
+ ){
891
+
892
+ # Essential vars
893
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
894
+ seqdepth.in <- x$pos.counts.in/pos.binsize;
895
+ lsd1 <- seqdepth.in[seqdepth.in>0];
896
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
897
+ if(log) lsd1 <- log(lsd1)
898
+ if(with.skewness){
899
+ fitdist.opts <- fitdist.opts.sn
900
+ }else{
901
+ fitdist.opts <- fitdist.opts.norm
902
+ }
903
+ peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
904
+ mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
905
+ optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
906
+ n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
907
+
908
+ # Find seed peaks
909
+ if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
910
+ peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
911
+
912
+ # Merge overlapping peaks
913
+ if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
914
+ merged <- (length(peaks)>1)
915
+ while(merged){
916
+ merged <- FALSE
917
+ ignore <- c()
918
+ peaks2 <- list();
919
+ for(i in 1:length(peaks)){
920
+ if(i %in% ignore) next
921
+ p <- peaks[[ i ]]
922
+ j <- enve.recplot2.__whichClosestPeak(p, peaks)
923
+ p2 <- peaks[[ j ]]
924
+ dst.a <- p$param.hat[[ length(p$param.hat) ]]
925
+ dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
926
+ if( abs(log(dst.a/dst.b)) < merge.logdist ){
927
+ if(verbose) cat('==> Attempting a merge at',
928
+ p$param.hat[[ length(p$param.hat) ]],'&',
929
+ p2$param.hat[[ length(p2$param.hat) ]],'X\n');
930
+ peaks.opts$lsd1 <- c(p$values, p2$values)
931
+ p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
932
+ if(length(p.new)==1){
933
+ peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
934
+ ignore <- c(ignore, j)
935
+ merged <- TRUE
936
+ }
735
937
  }
736
- peaks <- peaks2
737
- if(length(peaks)==1) break
738
- }
739
-
740
- if(verbose) cat('Found',length(peaks),'peak(s)\n')
741
- return(peaks);
742
- ### Returns a list of `enve.RecPlot2.Peak` objects.
938
+ if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
939
+ }
940
+ peaks <- peaks2
941
+ if(length(peaks)==1) break
942
+ }
943
+
944
+ if(verbose) cat('Found',length(peaks),'peak(s)\n')
945
+ return(peaks);
743
946
  }
744
947
 
745
948
  #==============> Define utils
949
+
950
+ #' Enveomics: Recruitment Plot (2) Core Peak Finder
951
+ #'
952
+ #' Finds the peak in a list of peaks that is most likely to represent the
953
+ #' "core genome" of a population.
954
+ #'
955
+ #' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
956
+ #'
957
+ #' @author Luis M. Rodriguez-R [aut, cre]
958
+ #'
959
+ #' @export
960
+
746
961
  enve.recplot2.corePeak <- function
747
- ### Finds the peak in a list of peaks that is most likely to represent the
748
- ### "core genome" of a population.
749
- (x
750
- ### `list` of `enve.RecPlot2.Peak` objects.
751
- ){
752
- # Find the peak with maximum depth (centrality)
753
- maxPeak <- x[[
754
- which.max(as.numeric(lapply(x,
755
- function(y) y$param.hat[[ length(y$param.hat) ]])))
756
- ]]
757
- # If a "larger" peak (a peak explaining more bins of the genome) is within
758
- # the default "merge.logdist" distance, take that one instead.
759
- corePeak <- maxPeak
760
- for(p in x){
761
- p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
762
- corePeak.len <- ifelse(
763
- length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
764
- sz.d <- log(p.len/corePeak.len)
765
- if(is.nan(sz.d) || sz.d < 0) next
766
- sq.d.a <- as.numeric(tail(p$param.hat, n=1))
767
- sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
768
- if(p$log) sq.d.a <- exp(sq.d.a)
769
- if(corePeak$log) sq.d.b <- exp(sq.d.b)
770
- if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
771
- }
772
- return(corePeak)
962
+ (x
963
+ ){
964
+ # Find the peak with maximum depth (centrality)
965
+ maxPeak <- x[[
966
+ which.max(as.numeric(lapply(x,
967
+ function(y) y$param.hat[[ length(y$param.hat) ]])))
968
+ ]]
969
+ # If a "larger" peak (a peak explaining more bins of the genome) is within
970
+ # the default "merge.logdist" distance, take that one instead.
971
+ corePeak <- maxPeak
972
+ for(p in x){
973
+ p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
974
+ corePeak.len <- ifelse(
975
+ length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
976
+ sz.d <- log(p.len/corePeak.len)
977
+ if(is.nan(sz.d) || sz.d < 0) next
978
+ sq.d.a <- as.numeric(tail(p$param.hat, n=1))
979
+ sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
980
+ if(p$log) sq.d.a <- exp(sq.d.a)
981
+ if(corePeak$log) sq.d.b <- exp(sq.d.b)
982
+ if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
983
+ }
984
+ return(corePeak)
773
985
  }
774
986
 
987
+ #' Enveomics: Recruitment Plot (2) Change Cutoff
988
+ #'
989
+ #' Change the intra-species cutoff of an existing recruitment plot.
990
+ #'
991
+ #' @param rp
992
+ #' \code{\link{enve.RecPlot2}} object.
993
+ #' @param new.cutoff
994
+ #' New cutoff to use.
995
+ #'
996
+ #' @author Luis M. Rodriguez-R [aut, cre]
997
+ #'
998
+ #' @export
999
+
775
1000
  enve.recplot2.changeCutoff <- function
776
- ### Change the intra-species cutoff of an existing recruitment plot.
777
- (rp,
778
- ### enve.RecPlot2 object.
779
- new.cutoff=98
780
- ### New cutoff to use.
781
- ){
782
- # Re-calculate vectors
783
- id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
784
- id.ingroup <- (id.mids > new.cutoff)
785
- pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
786
- pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
787
- # Update object
788
- attr(rp, "id.ingroup") <- id.ingroup
789
- attr(rp, "pos.counts.in") <- pos.counts.in
790
- attr(rp, "pos.counts.out") <- pos.counts.out
791
- attr(rp, "call") <- match.call()
792
- return(rp)
1001
+ (rp,
1002
+ new.cutoff=98
1003
+ ){
1004
+ # Re-calculate vectors
1005
+ id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
1006
+ id.ingroup <- (id.mids > new.cutoff)
1007
+ pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
1008
+ pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
1009
+ # Update object
1010
+ attr(rp, "id.ingroup") <- id.ingroup
1011
+ attr(rp, "pos.counts.in") <- pos.counts.in
1012
+ attr(rp, "pos.counts.out") <- pos.counts.out
1013
+ attr(rp, "call") <- match.call()
1014
+ return(rp)
793
1015
  }
794
1016
 
1017
+ #' Enveomics: Recruitment Plot (2) Window Depth Threshold
1018
+ #'
1019
+ #' Identifies the threshold below which windows should be identified as
1020
+ #' variable or absent.
1021
+ #'
1022
+ #' @param rp
1023
+ #' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
1024
+ #' @param peak
1025
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1026
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1027
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1028
+ #' @param lower.tail
1029
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1030
+ #' sequencing depth.
1031
+ #' @param significance
1032
+ #' Significance threshold (alpha) to select windows.
1033
+ #'
1034
+ #' @return
1035
+ #' Returns a float. The units are depth if the peaks were estimated in
1036
+ #' linear scale, or log-depth otherwise (\code{peak$log}).
1037
+ #'
1038
+ #' @author Luis M. Rodriguez-R [aut, cre]
1039
+ #'
1040
+ #' @export
1041
+
795
1042
  enve.recplot2.windowDepthThreshold <- function
796
- ### Identifies the threshold below which windows should be identified as
797
- ### variable or absent.
798
- (rp,
799
- ### Recruitment plot, an `enve.RecPlot2` object.
800
- peak,
801
- ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
802
- ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
803
- ### used (see `enve.recplot2.corePeak`).
804
- lower.tail=TRUE,
805
- ### If FALSE, it returns windows significantly above the peak in
806
- ### sequencing depth.
807
- significance=0.05
808
- ### Significance threshold (alpha) to select windows.
809
- ){
810
- if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
811
- par <- peak$param.hat
812
- par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
813
- thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
814
- if(peak$log) thr <- exp(thr)
815
-
816
- return(thr)
817
- ### Returns a float. The units are depth if the peaks were estimated in
818
- ### linear scale, or log-depth otherwise (`peak$log`).
1043
+ (rp,
1044
+ peak,
1045
+ lower.tail=TRUE,
1046
+ significance=0.05
1047
+ ){
1048
+ if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
1049
+ par <- peak$param.hat
1050
+ par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
1051
+ thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
1052
+ if(peak$log) thr <- exp(thr)
1053
+
1054
+ return(thr)
819
1055
  }
820
1056
 
1057
+ #' Enveomics: Recruitment Plot (2) Extract Windows
1058
+ #'
1059
+ #' Extract windows significantly below (or above) the peak in sequencing
1060
+ #' depth.
1061
+ #'
1062
+ #' @param rp
1063
+ #' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
1064
+ #' @param peak
1065
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1066
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1067
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1068
+ #' @param lower.tail
1069
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1070
+ #' sequencing depth.
1071
+ #' @param significance
1072
+ #' Significance threshold (alpha) to select windows.
1073
+ #' @param seq.names
1074
+ #' Returns subject sequence names instead of a vector of Booleans. If
1075
+ #' the recruitment plot was generated with named position bins (e.g, using
1076
+ #' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
1077
+ #' vector of characters (the sequence identifiers), otherwise it returns a
1078
+ #' data.frame with a name column and two columns of coordinates.
1079
+ #'
1080
+ #' @return
1081
+ #' Returns a vector of logicals if \code{seq.names = FALSE}.
1082
+ #' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
1083
+ #' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
1084
+ #' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
1085
+ #'
1086
+ #' @author Luis M. Rodriguez-R [aut, cre]
1087
+ #'
1088
+ #' @export
1089
+
821
1090
  enve.recplot2.extractWindows <- function
822
- ### Extract windows significantly below (or above) the peak in sequencing
823
- ### depth.
824
- (rp,
825
- ### Recruitment plot, a `enve.RecPlot2` object.
826
- peak,
827
- ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
828
- ### list of `enve.RecPlot2.Peak` objects, in which case the core peak is
829
- ### used (see `enve.recplot2.corePeak`).
830
- lower.tail=TRUE,
831
- ### If FALSE, it returns windows significantly above the peak in
832
- ### sequencing depth.
833
- significance=0.05,
834
- ### Significance threshold (alpha) to select windows.
835
- seq.names=FALSE
836
- ### Returns subject sequence names instead of a vector of Booleans. If
837
- ### the recruitment plot was generated with named position bins (e.g, using
838
- ### `pos.breaks`=0 or a two-column `pos.breaks.tsv`), it returns a vector of
839
- ### characters (the sequence identifiers), otherwise it returns a data.frame
840
- ### with a name column and two columns of coordinates.
841
- ){
1091
+ (rp,
1092
+ peak,
1093
+ lower.tail = TRUE,
1094
+ significance = 0.05,
1095
+ seq.names = FALSE
1096
+ ){
842
1097
  # Determine the threshold
843
1098
  thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
844
1099
 
@@ -850,43 +1105,54 @@ enve.recplot2.extractWindows <- function
850
1105
  sel <- seqdepth.in > thr
851
1106
  }
852
1107
 
853
- # seq.names=FALSE
1108
+ # seq.names = FALSE
854
1109
  if(!seq.names) return(sel)
855
- # seq.names=TRUE and pos.names defined
856
- if(length(rp$pos.names) != 0) return(rp$pos.names[sel])
857
- # seq.names=TRUE and pos.names undefined
858
- return(enve.recplot2.coordinates(rp,sel))
859
- ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
860
- ### it returns a vector of characters if the object has `pos.names` defined,
861
- ### or a data.frame with four columns otherwise:
862
- ### name.from, name.to, pos.from, and pos.to
863
- ### (see `enve.recplot2.coordinates`).
1110
+ # seq.names = TRUE
1111
+ return(enve.recplot2.coordinates(rp, sel))
864
1112
  }
865
1113
 
1114
+ #' Enveomics: Recruitment Plot (2) Compare Identities
1115
+ #'
1116
+ #' Compare the distribution of identities between two
1117
+ #' \code{\link{enve.RecPlot2}} objects.
1118
+ #'
1119
+ #' @param x
1120
+ #' First \code{\link{enve.RecPlot2}} object.
1121
+ #' @param y
1122
+ #' Second \code{\link{enve.RecPlot2}} object.
1123
+ #' @param method
1124
+ #' Distance method to use. This should be (an unambiguous abbreviation of)
1125
+ #' one of:
1126
+ #' \itemize{
1127
+ #' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
1128
+ #' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
1129
+ #' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
1130
+ #' doi:10.1214/aoms/1177729694}), or}
1131
+ #' \item{"euclidean"}
1132
+ #' }
1133
+ #' @param smooth.par
1134
+ #' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
1135
+ #' Use \code{NULL} to automatically determine this value using leave-one-out
1136
+ #' cross-validation (see \code{smooth.spline} parameter \code{spar}).
1137
+ #' @param pseudocounts
1138
+ #' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
1139
+ #' 1 for add-one smoothing.
1140
+ #' @param max.deviation
1141
+ #' Maximum mean deviation between identity breaks tolerated (as percent
1142
+ #' identity). Difference in number of \code{id.breaks} is never tolerated.
1143
+ #'
1144
+ #' @author Luis M. Rodriguez-R [aut, cre]
1145
+ #'
1146
+ #' @export
1147
+
866
1148
  enve.recplot2.compareIdentities <- function
867
- ### Compare the distribution of identities between two enve.RecPlot2 objects.
868
- (x,
869
- ### First enve.RecPlot2 object.
870
- y,
871
- ### Second enve.RecPlot2 object.
872
- method="hellinger",
873
- ### Distance method to use. This should be (an unambiguous abbreviation of)
874
- ### one of:
875
- ### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
876
- ### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
877
- ### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
878
- ### doi:10.1214/aoms/1177729694), or "euclidean".
879
- smooth.par=NULL,
880
- ### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
881
- ### Use NULL to automatically determine this value using leave-one-out
882
- ### cross-validation (see `smooth.spline` parameter `spar`).
883
- pseudocounts=0,
884
- ### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
885
- ### 1 for add-one smoothing.
886
- max.deviation=0.75
887
- ### Maximum mean deviation between identity breaks tolerated (as percent
888
- ### identity). Difference in number of id.breaks is never tolerated.
889
- ){
1149
+ (x,
1150
+ y,
1151
+ method="hellinger",
1152
+ smooth.par=NULL,
1153
+ pseudocounts=0,
1154
+ max.deviation=0.75
1155
+ ){
890
1156
  METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
891
1157
  i.meth <- pmatch(method, METHODS)
892
1158
  if (is.na(i.meth)) stop("Invalid distance ", method)
@@ -899,7 +1165,7 @@ enve.recplot2.compareIdentities <- function
899
1165
  dev <- mean(abs(x$id.breaks - y$id.breaks))
900
1166
  if(dev > max.deviation)
901
1167
  stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
902
- dev)
1168
+ dev)
903
1169
  x.cnt <- x$id.counts
904
1170
  y.cnt <- y$id.counts
905
1171
  if(is.null(smooth.par) || smooth.par > 0){
@@ -928,63 +1194,92 @@ enve.recplot2.compareIdentities <- function
928
1194
  return(d)
929
1195
  }
930
1196
 
1197
+ #' Enveomics: Recruitment Plot (2) Coordinates
1198
+ #'
1199
+ #' Returns the sequence name and coordinates of the requested position bins.
1200
+ #'
1201
+ #' @param x
1202
+ #' \code{\link{enve.RecPlot2}} object.
1203
+ #' @param bins
1204
+ #' Vector of selected bins to return. It can be a vector of logical values
1205
+ #' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
1206
+ #' missing, returns the coordinates of all windows.
1207
+ #'
1208
+ #' @return
1209
+ #' Returns a data.frame with five columns: \code{name.from} (character),
1210
+ #' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
1211
+ #' (numeric), and \code{seq.name} (character).
1212
+ #' The first two correspond to sequence and position of the start point of the
1213
+ #' bin. The next two correspond to the sequence and position of the end point of
1214
+ #' the bin. The last one indicates the name of the sequence (if defined).
1215
+ #'
1216
+ #' @author Luis M. Rodriguez-R [aut, cre]
1217
+ #'
1218
+ #' @export
1219
+
931
1220
  enve.recplot2.coordinates <- function
932
- ### Returns the sequence name and coordinates of the requested position bins.
933
- (x,
934
- ### `enve.RecPlot2` object.
935
- bins
936
- ### Vector of selected bins to return. It can be a vector of logical values
937
- ### with the same length as `x$pos.breaks`-1 or a vector of integers. If
938
- ### missing, returns the coordinates of all windows.
939
- ){
1221
+ (x,
1222
+ bins
1223
+ ){
940
1224
  if(!inherits(x, "enve.RecPlot2"))
941
1225
  stop("'x' must inherit from class `enve.RecPlot2`")
942
1226
  if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
943
1227
  if(!is.vector(bins)) stop("'bins' must be a vector")
944
1228
  if(inherits(bins, "logical")) bins <- which(bins)
945
1229
 
946
- y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
947
-
1230
+ y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
1231
+
948
1232
  for(i in 1:length(bins)){
949
1233
  j <- bins[i]
950
1234
  # Concatenated coordinates
951
1235
  cc <- x$pos.breaks[c(j, j+1)]
952
1236
  # Find the corresponding `seq.breaks`
953
1237
  sb.from <- which(
954
- cc[1] >=x$seq.breaks[-length(x$seq.breaks)] &
955
- cc[1] < x$seq.breaks[-1])
1238
+ cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
1239
+ cc[1] < x$seq.breaks[-1])
956
1240
  sb.to <- which(
957
- cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
958
- cc[2] <=x$seq.breaks[-1])
1241
+ cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
1242
+ cc[2] <= x$seq.breaks[-1])
959
1243
  # Translate coordinates
960
1244
  if(length(sb.from)==1 & length(sb.to)==1){
961
1245
  y[i, 'name.from'] <- x$seq.names[sb.from]
962
- y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
1246
+ y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
963
1247
  y[i, 'name.to'] <- x$seq.names[sb.to]
964
- y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
1248
+ y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
1249
+ y[i, 'seq.name'] <- x$pos.names[i]
965
1250
  }
966
1251
  }
967
1252
 
968
1253
  return(y)
969
- ### Returns a data.frame with four columns: name.from (character), pos.from
970
- ### (numeric) name.to (character), and pos.to (numeric). The first two
971
- ### correspond to sequence and position of the start point of the bin, the
972
- ### last two correspond to the sequence and position of the end point of the
973
- ### bin.
974
1254
  }
975
1255
 
1256
+ #' Enveomics: Recruitment Plot (2) Sequencing Depth
1257
+ #'
1258
+ #' Calculate the sequencing depth of the given window(s).
1259
+ #'
1260
+ #' @param x
1261
+ #' \code{\link{enve.RecPlot2}} object.
1262
+ #' @param sel
1263
+ #' Window(s) for which the sequencing depth is to be calculated. If not
1264
+ #' passed, it returns the sequencing depth of all windows.
1265
+ #' @param low.identity
1266
+ #' A logical indicating if the sequencing depth is to be estimated only
1267
+ #' with low-identity matches. By default, only high-identity matches are
1268
+ #' used.
1269
+ #'
1270
+ #' @return
1271
+ #' Returns a numeric vector of sequencing depths (in bp/bp).
1272
+ #'
1273
+ #' @author Luis M. Rodriguez-R [aut, cre]
1274
+ #'
1275
+ #' @export
1276
+
976
1277
  enve.recplot2.seqdepth <- function
977
- ### Calculate the sequencing depth of the given window(s)
978
- (x,
979
- ### `enve.RecPlot2` object.
980
- sel,
981
- ### Window(s) for which the sequencing depth is to be calculated. If not
982
- ### passed, it returns the sequencing depth of all windows
983
- low.identity=FALSE
984
- ### A logical indicating if the sequencing depth is to be estimated only
985
- ### with low-identity matches. By default, only high-identity matches are
986
- ### used.
987
- ){
1278
+
1279
+ (x,
1280
+ sel,
1281
+ low.identity=FALSE
1282
+ ){
988
1283
  if(!inherits(x, "enve.RecPlot2"))
989
1284
  stop("'x' must inherit from class `enve.RecPlot2`")
990
1285
  if(low.identity){
@@ -997,21 +1292,30 @@ enve.recplot2.seqdepth <- function
997
1292
  seqdepth.in <- pos.cnts.in/pos.binsize
998
1293
  if(missing(sel)) return(seqdepth.in)
999
1294
  return(seqdepth.in[sel])
1000
- ### Returns a numeric vector of sequencing depths (in bp/bp).
1001
1295
  }
1002
1296
 
1297
+ #' Enveomics: Recruitment Plot (2) ANI Estimate
1298
+ #'
1299
+ #' Estimate the Average Nucleotide Identity from reads (ANIr) from a
1300
+ #' recruitment plot.
1301
+ #'
1302
+ #' @param x
1303
+ #' \code{\link{enve.RecPlot2}} object.
1304
+ #' @param range
1305
+ #' Range of identities to be considered. By default, the full range
1306
+ #' is used (note that the upper boundary is \code{Inf} and not 100 because
1307
+ #' recruitment plots can also be built with bit-scores). To use only
1308
+ #' intra-population matches (with identities), use c(95,100). To use only
1309
+ #' inter-population values, use c(0,95).
1310
+ #'
1311
+ #' @author Luis M. Rodriguez-R [aut, cre]
1312
+ #'
1313
+ #' @export
1314
+
1003
1315
  enve.recplot2.ANIr <- function
1004
- ### Estimate the Average Nucleotide Identity from reads (ANIr) from a
1005
- ### recruitment plot
1006
- (x,
1007
- ### `enve.RecPlot2` object.
1008
- range=c(0,Inf)
1009
- ### Range of identities to be considered. By default, the full range
1010
- ### is used (note that the upper boundary is `Inf` and not 100 because
1011
- ### recruitment plots can also be built with bit-scores). To use only
1012
- ### intra-population matches (with identities), use c(95,100). To use only
1013
- ### inter-population values, use c(0,95).
1014
- ){
1316
+ (x,
1317
+ range=c(0,Inf)
1318
+ ){
1015
1319
  if(!inherits(x, "enve.RecPlot2"))
1016
1320
  stop("'x' must inherit from class `enve.RecPlot2`")
1017
1321
  id.b <- x$id.breaks
@@ -1023,35 +1327,82 @@ enve.recplot2.ANIr <- function
1023
1327
  }
1024
1328
 
1025
1329
  #==============> Define internal functions
1330
+
1331
+ #' Enveomics: Recruitment Plot (2) Internal Ancillary Function
1332
+ #'
1333
+ #' Internal ancillary function (see \code{\link{enve.recplot2}}).
1334
+ #'
1335
+ #' @param x \code{\link{enve.RecPlot2}} object
1336
+ #' @param pos.breaks Position breaks
1337
+ #' @param id.breaks Identity breaks
1338
+ #' @param rec.idcol Identity column to use
1339
+ #'
1340
+ #' @author Luis M. Rodriguez-R [aut, cre]
1341
+ #' @author Kenji Gerhardt [aut]
1342
+ #'
1343
+ #' @export
1344
+
1026
1345
  enve.recplot2.__counts <- function
1027
- ### Internal ancilliary function (see `enve.recplot2`).
1028
- (x, pos.breaks, id.breaks, rec.idcol){
1029
- rec <- x$rec
1030
- verbose <- x$verbose
1031
- counts <- matrix(0, nrow=length(pos.breaks)-1, ncol=length(id.breaks)-1);
1032
- for(i in 1:nrow(rec)){
1033
- if(verbose & i%%100==0) cat(" [",signif(i*100/nrow(rec),3),"% ] \r");
1034
- y.bin <- which(
1035
- rec[i,rec.idcol]>=id.breaks[-length(id.breaks)] &
1036
- rec[i,rec.idcol]<=id.breaks[-1])[1] ;
1037
- for(pos in rec[i,1]:rec[i,2]){
1038
- x.bin <- which(
1039
- pos>=pos.breaks[-length(pos.breaks)] & pos<=pos.breaks[-1])[1] ;
1040
- counts[x.bin, y.bin] <- counts[x.bin, y.bin]+1 ;
1041
- }
1042
- }
1043
- return(counts);
1346
+ (x, pos.breaks, id.breaks, rec.idcol) {
1347
+ rec2 <- x$rec
1348
+ verbose <- x$verbose
1349
+
1350
+ # get counts of how many occurrences of each genome pos.bin there are per read
1351
+ x.bins <- mapply(
1352
+ function(start, end) {
1353
+ list(rle(findInterval(start:end, pos.breaks, left.open = T)))
1354
+ }, rec2[, 1], rec2[, 2])
1355
+
1356
+ # find the single y bin for each row, replicates it at the correct places to
1357
+ # the number of distinct bins found in its row
1358
+ y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
1359
+ times = unlist(lapply(x.bins, function(a) length(a$lengths))))
1360
+
1361
+ # x.bins_counts is the number of occurrences of each bin a row contains,
1362
+ # per row, then unlisted
1363
+ x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
1364
+
1365
+ # these are the pos. in. genome bins that each count in x.bins_counts falls into
1366
+ x.bins <- unlist(lapply(x.bins, function(a) a$values))
1367
+
1368
+ # much more efficient counts implementation in R using lists instead of a matrix:
1369
+ counts <- lapply(
1370
+ 1:(length(pos.breaks) - 1),
1371
+ function(col_len) rep(0, length(id.breaks) - 1))
1372
+
1373
+ # accesses the correct list in counts by x.bin, then
1374
+ # accesses the position in that row by y.bins and adds the new count
1375
+ for(i in 1:length(x.bins)){
1376
+ counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
1377
+ }
1378
+
1379
+ counts <- do.call(rbind, counts)
1380
+ return(counts)
1044
1381
  }
1045
1382
 
1383
+ #' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
1384
+ #'
1385
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
1386
+ #'
1387
+ #' @param x \code{\link{enve.RecPlot2}} object
1388
+ #' @param comp Components
1389
+ #' @param do_crit Function estimating the criterion
1390
+ #' @param best Best solution thus far
1391
+ #' @param verbose If verbose
1392
+ #' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
1393
+ #'
1394
+ #' @author Luis M. Rodriguez-R [aut, cre]
1395
+ #'
1396
+ #' @export
1397
+
1046
1398
  enve.recplot2.findPeaks.__emauto_one <- function
1047
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.emauto).
1048
- (x, comp, do_crit, best, verbose, ...){
1399
+ (x, comp, do_crit, best, verbose, ...){
1049
1400
  peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
1050
1401
  if(length(peaks)==0) return(best)
1051
1402
  k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
1052
1403
  crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
1053
1404
  if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
1054
- ifelse(crit > best[['crit']], '*', ''), '\n')
1405
+ ifelse(crit > best[['crit']], '*', ''), '\n')
1055
1406
  if(crit > best[['crit']]){
1056
1407
  best[['crit']] <- crit
1057
1408
  best[['peaks']] <- peaks
@@ -1059,148 +1410,217 @@ enve.recplot2.findPeaks.__emauto_one <- function
1059
1410
  best[['pstore']][[comp]] <- peaks
1060
1411
  return(best)
1061
1412
  }
1413
+
1414
+ #' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
1415
+ #'
1416
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1417
+ #'
1418
+ #' @param x Vector of log-transformed sequencing depths
1419
+ #' @param theta Parameters list
1420
+ #'
1421
+ #' @author Luis M. Rodriguez-R [aut, cre]
1422
+ #'
1423
+ #' @export
1424
+
1062
1425
  enve.recplot2.findPeaks.__em_e <- function
1063
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`).
1064
- (x, theta){
1426
+ (x, theta){
1065
1427
  components <- length(theta[['mu']])
1066
1428
  product <- do.call(cbind,
1067
- lapply(1:components,
1068
- function(i) dnorm(x, theta[['mu']][i],
1069
- theta[['sd']][i])*theta[['alpha']][i]))
1429
+ lapply(1:components,
1430
+ function(i) dnorm(x, theta[['mu']][i],
1431
+ theta[['sd']][i])*theta[['alpha']][i]))
1070
1432
  sum.of.components <- rowSums(product)
1071
1433
  posterior <- product / sum.of.components
1072
-
1434
+
1073
1435
  return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1074
1436
  }
1075
1437
 
1438
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
1439
+ #'
1440
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1441
+ #'
1442
+ #' @param x Vector of log-transformed sequencing depths
1443
+ #' @param posterior Posterior probability
1444
+ #'
1445
+ #' @author Luis M. Rodriguez-R [aut, cre]
1446
+ #'
1447
+ #' @export
1448
+
1076
1449
  enve.recplot2.findPeaks.__em_m <- function
1077
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`
1078
- (x, posterior){
1450
+ (x, posterior){
1079
1451
  components <- ncol(posterior)
1080
1452
  n <- colSums(posterior)
1081
1453
  mu <- colSums(posterior * x) / n
1082
1454
  sd <- sqrt( colSums(
1083
- posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1455
+ posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1084
1456
  alpha <- n/length(x)
1085
1457
  return(list(mu=mu, sd=sd, alpha=alpha))
1086
1458
  }
1087
1459
 
1460
+ #' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
1461
+ #'
1462
+ #' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
1463
+ #'
1464
+ #' @param x \code{\link{enve.RecPlot2.Peak}} object
1465
+ #' @param mids Midpoints
1466
+ #' @param counts Counts
1467
+ #'
1468
+ #' @author Luis M. Rodriguez-R [aut, cre]
1469
+ #'
1470
+ #' @export
1471
+
1088
1472
  enve.recplot2.__peakHist <- function
1089
- ### Internal ancilliary function (see `enve.RecPlot2.Peak`).
1090
- (x, mids, counts=TRUE){
1091
- d.o <- x$param.hat
1092
- if(length(x$log)==0) x$log <- FALSE
1093
- if(x$log){
1094
- d.o$x <- log(mids)
1095
- }else{
1096
- d.o$x <- mids
1097
- }
1098
- prob <- do.call(paste('d', x$dist, sep=''), d.o)
1099
- if(!counts) return(prob)
1100
- if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
1101
- return(prob*x$n.hat/sum(prob))
1473
+ (x, mids, counts=TRUE){
1474
+ d.o <- x$param.hat
1475
+ if(length(x$log)==0) x$log <- FALSE
1476
+ if(x$log){
1477
+ d.o$x <- log(mids)
1478
+ }else{
1479
+ d.o$x <- mids
1480
+ }
1481
+ prob <- do.call(paste('d', x$dist, sep=''), d.o)
1482
+ if(!counts) return(prob)
1483
+ if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
1484
+ return(prob*x$n.hat/sum(prob))
1102
1485
  }
1103
1486
 
1487
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
1488
+ #'
1489
+ #' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1490
+ #'
1491
+ #' @param lsd1 Vector of log-transformed sequencing depths
1492
+ #' @param min.points Minimum number of points
1493
+ #' @param quant.est Quantile estimate
1494
+ #' @param mlv.opts List of options for \code{mlv}
1495
+ #' @param fitdist.opts List of options for \code{fitdist}
1496
+ #' @param with.skewness If skewed-normal should be used
1497
+ #' @param optim.rounds Maximum number of optimization rounds
1498
+ #' @param optim.epsilon Minimum difference considered negligible
1499
+ #' @param n.total Global number of windows
1500
+ #' @param merge.logdist Attempted \code{merge.logdist} parameter
1501
+ #' @param verbose If verbose
1502
+ #' @param log If log-transformed depths
1503
+ #'
1504
+ #' @author Luis M. Rodriguez-R [aut, cre]
1505
+ #'
1506
+ #' @export
1507
+
1104
1508
  enve.recplot2.findPeaks.__mow_one <- function
1105
- ### Internall ancilliary function (see `enve.recplot2.findPeaks.mower`).
1106
- (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
1107
- optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
1108
- ){
1109
- dist <- ifelse(with.skewness, 'sn', 'norm');
1110
-
1111
- # Find peak
1112
- o <- mlv.opts; o$x = lsd1;
1113
- mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1114
- if(verbose) cat('Anchoring at mode =',mode1,'\n')
1115
- param.hat <- fitdist.opts$start; last.hat <- param.hat;
1116
- lim <- NA;
1117
- if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
1118
-
1119
- # Refine peak parameters
1120
- for(round in 1:optim.rounds){
1121
- param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
1122
- lim.o <- param.hat
1123
- lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
1124
- lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
1125
- if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
1126
- 'and params =',as.numeric(param.hat),' \r')
1127
- if(length(lsd1.pop) < min.points) break;
1128
- o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
1129
- last.last.hat <- last.hat
1130
- last.hat <- param.hat
1131
- param.hat <- as.list(do.call(fitdist, o)$estimate);
1132
- if(any(is.na(param.hat))){
1133
- if(round>1) param.hat <- last.hat;
1134
- break;
1135
- }
1136
- if(round > 1){
1137
- epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1138
- if(epsilon1 < optim.epsilon) break;
1139
- if(round > 2){
1140
- epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1141
- if(epsilon2 < optim.epsilon) break;
1142
- }
1509
+ (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
1510
+ optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
1511
+ ){
1512
+ dist <- ifelse(with.skewness, 'sn', 'norm');
1513
+
1514
+ # Find peak
1515
+ o <- mlv.opts; o$x = lsd1;
1516
+ mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1517
+ if(verbose) cat('Anchoring at mode =',mode1,'\n')
1518
+ param.hat <- fitdist.opts$start; last.hat <- param.hat;
1519
+ lim <- NA;
1520
+ if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
1521
+
1522
+ # Refine peak parameters
1523
+ for(round in 1:optim.rounds){
1524
+ param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
1525
+ lim.o <- param.hat
1526
+ lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
1527
+ lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
1528
+ if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
1529
+ 'and params =',as.numeric(param.hat),' \r')
1530
+ if(length(lsd1.pop) < min.points) break;
1531
+ o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
1532
+ last.last.hat <- last.hat
1533
+ last.hat <- param.hat
1534
+ param.hat <- as.list(do.call(fitdist, o)$estimate);
1535
+ if(any(is.na(param.hat))){
1536
+ if(round>1) param.hat <- last.hat;
1537
+ break;
1538
+ }
1539
+ if(round > 1){
1540
+ epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1541
+ if(epsilon1 < optim.epsilon) break;
1542
+ if(round > 2){
1543
+ epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1544
+ if(epsilon2 < optim.epsilon) break;
1143
1545
  }
1144
- }
1145
- if(verbose) cat('\n')
1146
- if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
1147
-
1148
- # Mow distribution
1149
- lsd2 <- c();
1150
- lsd.pop <- c();
1151
- n.hat <- length(lsd1.pop)/diff(quant.est)
1152
- peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
1153
- param.hat=param.hat, n.hat=n.hat, n.total=n.total,
1154
- merge.logdist=merge.logdist, log=log)
1155
- peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
1156
- peak.cnt <- enve.recplot2.__peakHist(peak,
1157
- (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
1158
- for(i in 2:length(peak.breaks)){
1159
- values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
1160
- n.exp <- peak.cnt[i-1]
1161
- if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
1162
- if(length(values)==0) next
1163
- in.peak <- runif(length(values)) <= n.exp/length(values)
1164
- lsd2 <- c(lsd2, values[!in.peak])
1165
- lsd.pop <- c(lsd.pop, values[in.peak])
1166
- }
1167
- if(length(lsd.pop) < min.points) return(NULL)
1168
-
1169
- # Return peak
1170
- attr(peak, 'values') <- lsd.pop
1171
- attr(peak, 'values.res') <- lsd2
1172
- attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
1173
- plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
1174
- plot=FALSE)$counts)+1)/2
1175
- mu <- tail(param.hat, n=1)
1176
- attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
1177
- if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
1178
- 'with expected n =',n.hat,'\n')
1179
- return(peak)
1546
+ }
1547
+ }
1548
+ if(verbose) cat('\n')
1549
+ if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
1550
+
1551
+ # Mow distribution
1552
+ lsd2 <- c();
1553
+ lsd.pop <- c();
1554
+ n.hat <- length(lsd1.pop)/diff(quant.est)
1555
+ peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
1556
+ param.hat=param.hat, n.hat=n.hat, n.total=n.total,
1557
+ merge.logdist=merge.logdist, log=log)
1558
+ peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
1559
+ peak.cnt <- enve.recplot2.__peakHist(peak,
1560
+ (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
1561
+ for(i in 2:length(peak.breaks)){
1562
+ values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
1563
+ n.exp <- peak.cnt[i-1]
1564
+ if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
1565
+ if(length(values)==0) next
1566
+ in.peak <- runif(length(values)) <= n.exp/length(values)
1567
+ lsd2 <- c(lsd2, values[!in.peak])
1568
+ lsd.pop <- c(lsd.pop, values[in.peak])
1569
+ }
1570
+ if(length(lsd.pop) < min.points) return(NULL)
1571
+
1572
+ # Return peak
1573
+ attr(peak, 'values') <- lsd.pop
1574
+ attr(peak, 'values.res') <- lsd2
1575
+ attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
1576
+ plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
1577
+ plot=FALSE)$counts)+1)/2
1578
+ mu <- tail(param.hat, n=1)
1579
+ attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
1580
+ if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
1581
+ 'with expected n =',n.hat,'\n')
1582
+ return(peak)
1180
1583
  }
1181
1584
 
1585
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
1586
+ #'
1587
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1588
+ #'
1589
+ #' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
1590
+ #'
1591
+ #' @author Luis M. Rodriguez-R [aut, cre]
1592
+ #'
1593
+ #' @export
1594
+
1182
1595
  enve.recplot2.findPeaks.__mower <- function
1183
- ### Internal ancilliary function (see `enve.recplot2.findPeaks.mower`).
1184
- (peaks.opts){
1185
- peaks <- list()
1186
- while(length(peaks.opts$lsd1) > peaks.opts$min.points){
1187
- peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
1188
- if(is.null(peak)) break
1189
- peaks[[ length(peaks)+1 ]] <- peak
1190
- peaks.opts$lsd1 <- peak$values.res
1191
- }
1192
- return(peaks)
1596
+ (peaks.opts){
1597
+ peaks <- list()
1598
+ while(length(peaks.opts$lsd1) > peaks.opts$min.points){
1599
+ peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
1600
+ if(is.null(peak)) break
1601
+ peaks[[ length(peaks)+1 ]] <- peak
1602
+ peaks.opts$lsd1 <- peak$values.res
1603
+ }
1604
+ return(peaks)
1193
1605
  }
1194
1606
 
1607
+ #' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
1608
+ #'
1609
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
1610
+ #'
1611
+ #' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
1612
+ #' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
1613
+ #'
1614
+ #' @author Luis M. Rodriguez-R [aut, cre]
1615
+ #'
1616
+ #' @export
1195
1617
 
1196
1618
  enve.recplot2.__whichClosestPeak <- function
1197
- ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
1198
- (peak, peaks){
1199
- dist <- as.numeric(lapply(peaks,
1200
- function(x)
1201
- abs(log(x$param.hat[[ length(x$param.hat) ]] /
1202
- peak$param.hat[[ length(peak$param.hat) ]] ))))
1203
- dist[ dist==0 ] <- Inf
1204
- return(which.min(dist))
1619
+ (peak, peaks){
1620
+ dist <- as.numeric(lapply(peaks,
1621
+ function(x)
1622
+ abs(log(x$param.hat[[ length(x$param.hat) ]] /
1623
+ peak$param.hat[[ length(peak$param.hat) ]] ))))
1624
+ dist[ dist==0 ] <- Inf
1625
+ return(which.min(dist))
1205
1626
  }
1206
-