miga-base 0.3.0.0 → 0.3.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (260) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +21 -4
  3. data/actions/init.rb +258 -0
  4. data/actions/run_local.rb +1 -2
  5. data/actions/test_taxonomy.rb +4 -1
  6. data/bin/miga +8 -1
  7. data/lib/miga/dataset.rb +4 -4
  8. data/lib/miga/dataset_result.rb +7 -4
  9. data/lib/miga/version.rb +2 -2
  10. data/scripts/_distances_noref_nomulti.bash +3 -1
  11. data/scripts/clade_finding.bash +1 -1
  12. data/scripts/init.bash +1 -1
  13. data/scripts/miga.bash +1 -1
  14. data/scripts/mytaxa.bash +78 -72
  15. data/scripts/mytaxa_scan.bash +67 -62
  16. data/scripts/ogs.bash +1 -1
  17. data/scripts/trimmed_fasta.bash +4 -3
  18. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  19. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  20. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  21. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  22. data/utils/enveomics/LICENSE.txt +73 -0
  23. data/utils/enveomics/Makefile +52 -0
  24. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  25. data/utils/enveomics/Manifest/Tasks/blasttab.json +703 -0
  26. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  27. data/utils/enveomics/Manifest/Tasks/fasta.json +571 -0
  28. data/utils/enveomics/Manifest/Tasks/fastq.json +208 -0
  29. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  30. data/utils/enveomics/Manifest/Tasks/ogs.json +339 -0
  31. data/utils/enveomics/Manifest/Tasks/other.json +746 -0
  32. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  33. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +454 -0
  34. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  35. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  36. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  37. data/utils/enveomics/Manifest/categories.json +132 -0
  38. data/utils/enveomics/Manifest/examples.json +154 -0
  39. data/utils/enveomics/Manifest/tasks.json +4 -0
  40. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  41. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
  42. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
  43. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
  44. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  45. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  46. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  47. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  48. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  49. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  50. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  51. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  52. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  53. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  54. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  55. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  56. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  57. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  58. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  59. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  60. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
  61. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  62. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  63. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  64. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  65. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  66. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  67. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  68. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  69. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  70. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  71. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  72. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  73. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  74. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  75. data/utils/enveomics/README.md +40 -0
  76. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  77. data/utils/enveomics/Scripts/Aln.cat.rb +162 -0
  78. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  79. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  80. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  81. data/utils/enveomics/Scripts/BlastTab.addlen.rb +61 -0
  82. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  83. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  84. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +106 -0
  85. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  86. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  87. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  88. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  89. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  90. data/utils/enveomics/Scripts/BlastTab.recplot2.R +40 -0
  91. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  92. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  93. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  94. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  95. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  96. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  97. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  98. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  99. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  100. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  101. data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
  102. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  103. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  104. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  105. data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
  106. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  107. data/utils/enveomics/Scripts/FastA.interpose.pl +87 -0
  108. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  109. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  110. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  111. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  112. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  113. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  114. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  115. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  116. data/utils/enveomics/Scripts/FastA.tag.rb +64 -0
  117. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  118. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  119. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  120. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  121. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  122. data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
  123. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  124. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  125. data/utils/enveomics/Scripts/HMM.essential.rb +254 -0
  126. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  127. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  128. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +306 -0
  129. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  130. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  131. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  132. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  133. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  134. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  135. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  136. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  137. data/utils/enveomics/Scripts/SRA.download.bash +50 -0
  138. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  139. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  140. data/utils/enveomics/Scripts/Table.barplot.R +30 -0
  141. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  142. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  143. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  144. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  145. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  146. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  147. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  148. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  149. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  150. data/utils/enveomics/Scripts/aai.rb +373 -0
  151. data/utils/enveomics/Scripts/ani.rb +362 -0
  152. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  153. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  154. data/utils/enveomics/Scripts/lib/data/essential.hmm.gz +0 -0
  155. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +26 -0
  156. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  157. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  158. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  159. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  160. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
  161. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  162. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  163. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  164. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  165. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  166. data/utils/enveomics/Scripts/ogs.rb +104 -0
  167. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  168. data/utils/enveomics/Scripts/rbm.rb +137 -0
  169. data/utils/enveomics/Tests/Makefile +10 -0
  170. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  171. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  172. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  173. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  174. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  175. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  176. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  177. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  178. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  179. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  180. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  181. data/utils/enveomics/Tests/alkB.nwk +1 -0
  182. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  183. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  184. data/utils/enveomics/Tests/hiv1.faa +59 -0
  185. data/utils/enveomics/Tests/hiv1.fna +134 -0
  186. data/utils/enveomics/Tests/hiv2.faa +70 -0
  187. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  188. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  189. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  190. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  191. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  192. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  193. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  194. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  195. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  196. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  197. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  198. data/utils/enveomics/build_enveomics_r.bash +44 -0
  199. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  200. data/utils/enveomics/enveomics.R/NAMESPACE +35 -0
  201. data/utils/enveomics/enveomics.R/R/autoprune.R +121 -0
  202. data/utils/enveomics/enveomics.R/R/barplot.R +165 -0
  203. data/utils/enveomics/enveomics.R/R/cliopts.R +119 -0
  204. data/utils/enveomics/enveomics.R/R/df2dist.R +117 -0
  205. data/utils/enveomics/enveomics.R/R/growthcurve.R +263 -0
  206. data/utils/enveomics/enveomics.R/R/recplot.R +320 -0
  207. data/utils/enveomics/enveomics.R/R/recplot2.R +745 -0
  208. data/utils/enveomics/enveomics.R/R/tribs.R +423 -0
  209. data/utils/enveomics/enveomics.R/R/utils.R +16 -0
  210. data/utils/enveomics/enveomics.R/README.md +52 -0
  211. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  212. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  213. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +30 -0
  214. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +43 -0
  215. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +19 -0
  216. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +37 -0
  217. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +24 -0
  218. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +24 -0
  219. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +33 -0
  220. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +64 -0
  221. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +37 -0
  222. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +19 -0
  223. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +18 -0
  224. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +26 -0
  225. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +25 -0
  226. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +26 -0
  227. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +49 -0
  228. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +28 -0
  229. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +97 -0
  230. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +40 -0
  231. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +40 -0
  232. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +24 -0
  233. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +40 -0
  234. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +18 -0
  235. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +22 -0
  236. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +20 -0
  237. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +18 -0
  238. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  239. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +27 -0
  240. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +53 -0
  241. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -0
  242. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +44 -0
  243. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +21 -0
  244. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +15 -0
  245. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  246. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  247. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +43 -0
  248. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +29 -0
  249. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +30 -0
  250. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +71 -0
  251. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +18 -0
  252. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +18 -0
  253. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +18 -0
  254. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +27 -0
  255. data/utils/enveomics/globals.mk +8 -0
  256. data/utils/enveomics/manifest.json +9 -0
  257. data/utils/index_metadata.rb +0 -0
  258. data/utils/plot-taxdist.R +0 -0
  259. data/utils/requirements.txt +19 -19
  260. metadata +242 -2
@@ -0,0 +1,320 @@
1
+ enve.recplot <- structure(function(
2
+ ### Produces recruitment plots provided that BlastTab.catsbj.pl has
3
+ ### been previously executed. Requires the gplots library.
4
+ prefix,
5
+ ### Path to the prefix of the BlastTab.catsbj.pl output files. At
6
+ ### least the files .rec and .lim must exist with this prefix.
7
+
8
+ # Id. hist.
9
+ id.min=NULL,
10
+ ### Minimum identity to be considered. By default, the minimum detected
11
+ ### identity. This value is a percentage.
12
+ id.max=NULL,
13
+ ### Maximum identity to be considered. By default, 100.
14
+ id.binsize=NULL,
15
+ ### Size of the identity bins (vertical histograms). By default, 0.1 for
16
+ ### identity metrics and 5 for bit score.
17
+ id.splines=0,
18
+ ### Smoothing parameter for the splines in the identity histogram. Zero (0) for no
19
+ ### splines. A generally good value is 1/2. If non-zero, requires the stats package.
20
+ id.metric='id',
21
+ ### Metric of identity to be used (Y-axis). It can be any unambiguous prefix
22
+ ### of "identity", "corrected identity", or "bit score".
23
+ id.summary='sum',
24
+ ### Method used to build the identity histogram (Horizontal axis of the right panel).
25
+ ### It can be any unambiguous prefix of "sum", "average", "median", "90% lower bound",
26
+ ### "90% upper bound", "95% lower bound", and "95% upper bound". The last four options
27
+ ### correspond to the upper and lower boundaries of the 90% and 95% empirical confidence
28
+ ### intervals.
29
+
30
+ # Pos. hist.
31
+ pos.min=1,
32
+ ### Minimum (leftmost) position in the reference (concatenated) genome (in bp).
33
+ pos.max=NULL,
34
+ ### Maximum (rightmost) position in the reference (concatenated) genome (in bp).
35
+ ### By default: Length of the genome.
36
+ pos.binsize=1e3,
37
+ ### Size of the position bins (horizontal histograms) in bp.
38
+ pos.splines=0,
39
+ ### Smoothing parameter for the splines in the position histogram. Zero (0) for no splines.
40
+ ### If non-zero, requires the stats package.
41
+
42
+ # Rec. plot
43
+ rec.col1='white',
44
+ ### Lightest color in the recruitment plot.
45
+ rec.col2='black',
46
+ ### Darkest color in the recruitment plot.
47
+
48
+ # General
49
+ main=NULL,
50
+ ### Title of the plot.
51
+ contig.col=grey(0.85),
52
+ ### Color of the Contig boundaries. Set to NA to ignore Contig boundaries.
53
+
54
+ # Return
55
+ ret.recplot=FALSE,
56
+ ### Indicates if the matrix of the recruitment plot is to be returned.
57
+ ret.hist=FALSE,
58
+ ### Indicates if the vectors of the identity and position histograms are to be returned.
59
+ ret.mode=FALSE,
60
+ ### Indicates if the mode of the identity is to be computed. It requires the modeest
61
+ ### package.
62
+
63
+ # General
64
+ id.cutoff=NULL,
65
+ ### Minimum identity to consider an alignment as "top". By default, it is 0.95 for the
66
+ ### identity metrics and 95% of the best scoring alignment for bit score.
67
+ verbose=TRUE,
68
+ ### Indicates if the function should report the advance.
69
+ ...
70
+ ### Any additional graphic parameters to be passed to plot for all panels except the
71
+ ### recruitment plot (lower-left).
72
+ ){
73
+
74
+ # Settings
75
+ METRICS <- c('identity', 'corrected identity', 'bit score');
76
+ SUMMARY <- c('sum', 'average', 'median', '');
77
+ if(is.null(prefix)) stop('Parameter prefix is mandatory.');
78
+ if(!requireNamespace("gplots", quietly=TRUE)) stop('Unavailable gplots library.');
79
+
80
+ # Read files
81
+ if(verbose) cat("Reading files.\n")
82
+ rec <- read.table(paste(prefix, '.rec', sep=''), sep="\t", comment.char='', quote='');
83
+ lim <- read.table(paste(prefix, '.lim', sep=''), sep="\t", comment.char='', quote='');
84
+
85
+ # Configure ID summary
86
+ id.summary <- pmatch(id.summary, SUMMARY);
87
+ if(is.na(id.summary)) stop('Invalid identity summary.');
88
+ if(id.summary == -1) stop('Ambiguous identity summary.');
89
+ if(id.summary==1){
90
+ id.summary.func <- function(x) colSums(x);
91
+ id.summary.name <- 'sum'
92
+ }else if(id.summary==2){
93
+ id.summary.func <- function(x) colMeans(x);
94
+ id.summary.name <- 'mean'
95
+ }else if(id.summary==3){
96
+ id.summary.func <- function(x) apply(x,2,median);
97
+ id.summary.name <- 'median'
98
+ }else if(id.summary==4){
99
+ id.summary.func <- function(x) apply(x,2,quantile,probs=0.05,names=FALSE);
100
+ id.summary.name <- '90% LB'
101
+ }else if(id.summary==5){
102
+ id.summary.func <- function(x) apply(x,2,quantile,probs=0.95,names=FALSE);
103
+ id.summary.name <- '90% UB'
104
+ }else if(id.summary==6){
105
+ id.summary.func <- function(x) apply(x,2,quantile,probs=0.025,names=FALSE);
106
+ id.summary.name <- '95% LB'
107
+ }else if(id.summary==7){
108
+ id.summary.func <- function(x) apply(x,2,quantile,probs=0.975,names=FALSE);
109
+ id.summary.name <- '95% UB'
110
+ }
111
+
112
+ # Configure metrics
113
+ id.metric <- pmatch(id.metric, METRICS);
114
+ if(is.na(id.metric)) stop('Invalid identity metric.');
115
+ if(id.metric == -1) stop('Ambiguous identity metric.');
116
+ if(id.metric==1){
117
+ id.reccol <- 3
118
+ id.shortname <- 'Id.'
119
+ id.fullname <- 'Identity'
120
+ id.units <- '%'
121
+ id.hallmarks <- seq(0, 100, by=5)
122
+ if(is.null(id.max)) id.max <- 100
123
+ if(is.null(id.cutoff)) id.cutoff <- 95
124
+ if(is.null(id.binsize)) id.binsize <- 0.1
125
+ }else if(id.metric==2){
126
+ if(ncol(rec)<6) stop("Requesting corrected identity, but .rec file doesn't have 6th column")
127
+ id.reccol <- 6
128
+ id.shortname <- 'cId.'
129
+ id.fullname <- 'Corrected identity'
130
+ id.units <- '%'
131
+ id.hallmarks <- seq(0, 100, by=5)
132
+ if(is.null(id.max)) id.max <- 100
133
+ if(is.null(id.cutoff)) id.cutoff <- 95
134
+ if(is.null(id.binsize)) id.binsize <- 0.1
135
+ }else if(id.metric==3){
136
+ id.reccol <- 4
137
+ id.shortname <- 'BSc.'
138
+ id.fullname <- 'Bit score'
139
+ id.units <- 'bits'
140
+ max.bs <- max(rec[, id.reccol])
141
+ id.hallmarks <- seq(0, max.bs*1.2, by=50)
142
+ if(is.null(id.max)) id.max <- max.bs
143
+ if(is.null(id.cutoff)) id.cutoff <- 0.95 * max.bs
144
+ if(is.null(id.binsize)) id.binsize <- 5
145
+ }
146
+ if(is.null(id.min)) id.min <- min(rec[, id.reccol]);
147
+ if(is.null(pos.max)) pos.max <- max(lim[, 3]);
148
+ id.lim <- c(id.min, id.max);
149
+ pos.lim <- c(pos.min, pos.max)/1e6;
150
+ id.breaks <- round((id.max-id.min)/id.binsize);
151
+ pos.breaks <- round((pos.max-pos.min)/pos.binsize);
152
+ if(is.null(main)) main <- paste('Recruitment plot of ', prefix, sep='');
153
+ pos.marks=seq(pos.min, pos.max, length.out=pos.breaks+1)/1e6;
154
+ id.marks=seq(id.min, id.max, length.out=id.breaks+1);
155
+ id.topclasses <- 0;
156
+ for(i in length(id.marks):1) if(id.marks[i]>id.cutoff) id.topclasses <- id.topclasses + 1;
157
+
158
+ # Set-up image
159
+ layout(matrix(c(3,4,1,2), nrow=2, byrow=TRUE), widths=c(2,1), heights=c(1,2));
160
+ out <- list();
161
+
162
+ # Recruitment plot
163
+ if(verbose) cat("Rec. plot.\n")
164
+ par(mar=c(5,4,0,0)+0.1);
165
+ rec.hist <- matrix(0, nrow=pos.breaks, ncol=id.breaks);
166
+ for(i in 1:nrow(rec)){
167
+ id.class <- ceiling((id.breaks)*((rec[i, id.reccol]-id.min)/(id.max-id.min)));
168
+ if(id.class<=id.breaks & id.class>0){
169
+ for(pos in rec[i, 1]:rec[i, 2]){
170
+ pos.class <- ceiling((pos.breaks)*((pos-pos.min)/(pos.max-pos.min)));
171
+ if(pos.class<=pos.breaks & pos.class>0) rec.hist[pos.class, id.class] <- rec.hist[pos.class, id.class]+1;
172
+ }
173
+ }
174
+ }
175
+ id.top <- c((1-id.topclasses):0) + id.breaks;
176
+ rec.col=gplots::colorpanel(256, rec.col1, rec.col2);
177
+ image(x=pos.marks, y=id.marks, z=log10(rec.hist),
178
+ breaks=seq(0, log10(max(rec.hist)), length.out=1+length(rec.col)), col=rec.col,
179
+ xlim=pos.lim, ylim=id.lim, xlab='Position in genome (Mbp)',
180
+ ylab=paste(id.fullname, ' (',id.units,')', sep=''), xaxs='i', yaxs='r');
181
+ if(!is.na(contig.col)) abline(v=c(lim$V2, lim$V3)/1e6, lty=1, col=contig.col);
182
+ abline(h=id.hallmarks, lty=2, col=grey(0.7));
183
+ abline(h=id.marks[id.top[1]], lty=3, col=grey(0.5))
184
+ legend('bottomleft', 'Rec. plot', bg=rgb(1,1,1,2/3));
185
+ out <- c(out, list(pos.marks=pos.marks, id.marks=id.marks));
186
+ if(ret.recplot) out <- c(out, list(recplot=rec.hist));
187
+
188
+ # Identity histogram
189
+ if(verbose) cat(id.shortname, " hist.\n", sep='')
190
+ par(mar=c(5,0,0,2)+0.1);
191
+ id.hist <- id.summary.func(rec.hist);
192
+ plot(1, t='n', xlim=c(1, max(id.hist)), ylim=id.lim, ylab='', yaxt='n', xlab=paste('Sequences (bp),', id.summary.name), log='x', ...);
193
+ id.x <- rep(id.marks, each=2)[2:(id.breaks*2+1)]
194
+ id.f <- rep(id.hist, each=2)[1:(id.breaks*2)]
195
+ if(sum(id.f)>0){
196
+ lines(id.f, id.x, lwd=ifelse(id.splines>0, 1/2, 2), type='o', pch='.');
197
+ if(id.splines>0){
198
+ id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]), spar=id.splines)
199
+ lines(exp(id.spline$y), id.spline$x, lwd=2)
200
+ }
201
+ }
202
+
203
+ abline(h=id.hallmarks, lty=2, col=grey(0.7));
204
+ abline(h=id.marks[id.top[1]], lty=3, col=grey(0.5))
205
+ legend('bottomright', paste(id.shortname, 'histogram'), bg=rgb(1,1,1,2/3));
206
+ out <- c(out, list(id.mean=mean(rec[, id.reccol])));
207
+ out <- c(out, list(id.median=median(rec[, id.reccol])));
208
+ if(ret.mode) out <- c(out, list(id.mode=modeest::mlv(rec[, id.reccol], method='mfv')$M));
209
+ if(ret.hist) out <- c(out, list(id.hist=id.hist));
210
+
211
+ # Position histogram
212
+ if(verbose) cat("Pos. hist.\n")
213
+ par(mar=c(0,4,4,0)+0.1);
214
+ h1<-rep(0,nrow(rec.hist)) ;
215
+ h2<-rep(0,nrow(rec.hist)) ;
216
+ pos.winsize <- (pos.max-pos.min+1)/pos.breaks;
217
+ if(sum(rec.hist[, id.top])>0) h1 <- rowSums(matrix(rec.hist[, id.top], nrow=nrow(rec.hist)))/pos.winsize;
218
+ if(sum(rec.hist[,-id.top])>0) h2 <- rowSums(matrix(rec.hist[,-id.top], nrow=nrow(rec.hist)))/pos.winsize;
219
+
220
+ ymin <- min(1, h1[h1>0], h2[h2>0]);
221
+ ymax <- max(10, h1, h2);
222
+ if(is.na(ymin) || ymin<=0) ymin <- 1e-10;
223
+ if(is.na(ymax) || ymax<=0) ymax <- 1;
224
+ plot(1, t='n', xlab='', xaxt='n', ylab='Sequencing depth (X)', log='y', xlim=pos.lim,
225
+ ylim=c(ymin, ymax), xaxs='i', main=main, ...);
226
+ if(!is.na(contig.col)) abline(v=c(lim[,2], lim[,3])/1e6, lty=1, col=contig.col);
227
+ abline(h=10^c(0:5), lty=2, col=grey(0.7));
228
+ if(sum(h2)>0){
229
+ h2.x <- rep(pos.marks, each=2)[2:(pos.breaks*2+1)]
230
+ h2.y <- rep(h2, each=2)[1:(pos.breaks*2)]
231
+ lines(h2.x, h2.y, lwd=ifelse(pos.splines>0, 1/2, 2), col=grey(0.5));
232
+ if(pos.splines>0){
233
+ h2.spline <- smooth.spline(h2.x[h2.y>0], log(h2.y[h2.y>0]), spar=pos.splines)
234
+ lines(h2.spline$x, exp(h2.spline$y), lwd=2, col=grey(0.5))
235
+ }
236
+ if(ret.hist) out <- c(out, list(pos.hist.low=h2.y));
237
+ }
238
+ if(sum(h1)>0){
239
+ h1.x <- rep(pos.marks, each=2)[2:(pos.breaks*2+1)]
240
+ h1.y <- rep(h1, each=2)[1:(pos.breaks*2)]
241
+ lines(h1.x, h1.y, lwd=ifelse(pos.splines>0, 1/2, 2), col=grey(0));
242
+ if(pos.splines>0){
243
+ h1.spline <- smooth.spline(h1.x[h1.y>0], log(h1.y[h1.y>0]), spar=pos.splines)
244
+ lines(h1.spline$x, exp(h1.spline$y), lwd=2, col=grey(0))
245
+ }
246
+ if(ret.hist) out <- c(out, list(pos.hist.top=h1.y));
247
+ }
248
+ legend('topleft', 'Pos. histogram', bg=rgb(1,1,1,2/3));
249
+ out <- c(out, list(id.max=id.max, id.cutoff=id.marks[id.top[1]]));
250
+ out <- c(out, list(seqdepth.mean.top=mean(h1)));
251
+ out <- c(out, list(seqdepth.mean.low=mean(h2)));
252
+ out <- c(out, list(seqdepth.mean=mean(h1+h2)));
253
+ out <- c(out, list(seqdepth.median.top=median(h1)));
254
+ out <- c(out, list(seqdepth.median.low=median(h2)));
255
+ out <- c(out, list(seqdepth.median=median(h1+h2)));
256
+ out <- c(out, list(id.metric=id.fullname));
257
+ out <- c(out, list(id.summary=id.summary.name));
258
+
259
+ # Legend
260
+ par(mar=c(0,0,4,2)+0.1);
261
+ plot(1, t='n', xlab='', xaxt='n', ylab='', yaxt='n', xlim=c(0,1), ylim=c(0,1), xaxs='r', yaxs='i', ...);
262
+ text(1/2, 5/6, labels=paste('Reads per ', signif((pos.max-pos.min)/pos.breaks, 2), ' bp (rec. plot)', sep=''), pos=3);
263
+ leg.col <- gplots::colorpanel(100, rec.col1, rec.col2);
264
+ leg.lab <- signif(10^seq(0, log10(max(rec.hist)), length.out=10), 2);
265
+ for(i in 1:10){
266
+ for(j in 1:10){
267
+ k <- (i-1)*10 + j;
268
+ polygon(c(k-1, k, k, k-1)/100, c(2/3, 2/3, 5/6, 5/6), border=leg.col[k], col=leg.col[k]);
269
+ }
270
+ text((i-0.5)/10, 2/3, labels=paste(leg.lab[i], ''), srt=90, pos=2, offset=0, cex=3/4);
271
+ }
272
+ legend('bottom',
273
+ legend=c('Contig boundary', 'Hallmark', paste(id.fullname, 'cutoff'),
274
+ paste('Pos. hist.: ',id.shortname,' > ',signif(id.marks[id.top[1]],2),id.units,sep=''),
275
+ paste('Pos. hist.: ',id.shortname,' < ',signif(id.marks[id.top[1]],2),id.units,sep='')), ncol=2,
276
+ col=grey(c(0.85, 0.7, 0.5, 0, 0.5)), lty=c(1,2,3,1,1), lwd=c(1,1,1,2,2), bty='n', inset=0.05, cex=5/6);
277
+ return(out);
278
+ ### A list with the following elements:
279
+ ###
280
+ ### pos.marks: Midpoints of the position histogram.
281
+ ###
282
+ ### id.matrix: Midpoints of the identity histogram.
283
+ ###
284
+ ### recplot (if ret.recplot=TRUE): Matrix containing the recruitment plot values.
285
+ ###
286
+ ### id.mean: Mean identity.
287
+ ###
288
+ ### id.median: Median identity.
289
+ ###
290
+ ### id.mode (if ret.mode=TRUE): Mode of the identity.
291
+ ###
292
+ ### id.hist (if ret.hist=TRUE): Values of the identity histogram.
293
+ ###
294
+ ### pos.hist.low (if ret.hist=TRUE): Values of the position histogram (depth) with "low"
295
+ ### identity (i.e., below id.cutoff).
296
+ ###
297
+ ### pos.hist.top (if ret.hist=TRUE): Values of the position histogram (depth) with "top"
298
+ ### identity (i.e., above id.cutoff).
299
+ ###
300
+ ### id.max: Value of id.max. This is returned because id.max=NULL may vary.
301
+ ###
302
+ ### id.cutoff: Value of id.cutoff. This is returned because id.cutoff=NULL may vary.
303
+ ###
304
+ ### seqdepth.mean.top: Average sequencing depth with identity above id.cutoff.
305
+ ###
306
+ ### seqdepth.mean.low: Average sequencing depth with identity below id.cutoff.
307
+ ###
308
+ ### seqdepth.mean.all: Average sequencing depth without identity filtering.
309
+ ###
310
+ ### seqdepth.median.top: Median sequencing depth with identity above id.cutoff.
311
+ ###
312
+ ### seqdepth.median.low: Median sequencing depth with identity below id.cutoff.
313
+ ###
314
+ ### seqdepth.median.all: Median sequencing depth without identity filtering.
315
+ ###
316
+ ### id.metric: Full name of the used identity metric.
317
+ ###
318
+ ### id.summary: Full name of the summary method used to build the identity plot.
319
+ });
320
+
@@ -0,0 +1,745 @@
1
+ #==============> Define S4 classes
2
+ setClass("enve.RecPlot2",
3
+ ### Enve-omics representation of Recruitment plots. This object can
4
+ ### be produced by `enve.recplot2` and supports S4 method plot.
5
+ representation(
6
+ counts='matrix', ##<< Counts as a two-dimensional histogram.
7
+ pos.counts.in='numeric', ##<< Counts of in-group hits per position bin.
8
+ pos.counts.out='numeric', ##<< Counts of out-group hits per position bin.
9
+ id.counts='numeric', ##<< Counts per ID bin.
10
+ id.breaks='numeric', ##<< Breaks of identity bins.
11
+ pos.breaks='numeric', ##<< Breaks of position bins.
12
+ seq.breaks='numeric',
13
+ peaks='list', ##<< Peaks identified in the recplot.
14
+ ### Limits of the subject sequences after concatenation.
15
+ seq.names='character', ##<< Names of the subject sequences.
16
+ id.metric='character', ##<< Metric used as 'identity'.
17
+ id.ingroup='logical', ##<< Identity bins considered in-group.
18
+ call='call') ##<< Call producing this object.
19
+ ,package='enveomics.R'
20
+ );
21
+ setClass("enve.RecPlot2.Peak",
22
+ ### Enve-omics representation of a peak in the sequencing depth histogram
23
+ ### of a Recruitment plot (see `enve.recplot2.findPeaks`).
24
+ representation(
25
+ dist='character',
26
+ ### Distribution of the peak. Currently supported: 'norm' (normal) and 'sn'
27
+ ### (skew-normal).
28
+ values='numeric',
29
+ ### Sequencing depth values predicted to conform the peak.
30
+ values.res='numeric',
31
+ ### Sequencing depth values not explained by this or previously identified
32
+ ### peaks.
33
+ mode='numeric',
34
+ ### Seed-value of mode anchoring the peak.
35
+ param.hat='list',
36
+ ### Parameters of the distribution. A list of two values if dist='norm' (sd
37
+ ### and mean), or three values if dist='sn' (omega=scale, alpha=shape, and
38
+ ### xi=location). Note that the "dispersion" parameter is always first and
39
+ ### the "location" parameter is always last.
40
+ n.hat='numeric',
41
+ ### Number of bins estimated to be explained by this peak. This should
42
+ ### ideally be equal to the length of `values`, but it's not and integer.
43
+ n.total='numeric',
44
+ ### Total number of bins from which the peak was extracted. I.e., total
45
+ ### number of position bins with non-zero sequencing depth in the recruitment
46
+ ### plot (regardless of peak count).
47
+ err.res='numeric',
48
+ ### Error left after adding the peak.
49
+ merge.logdist='numeric'
50
+ ### Attempted `merge.logdist` parameter.
51
+ ));
52
+ setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
53
+ setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
54
+
55
+ #==============> Define S4 methods
56
+ plot.enve.RecPlot2 <- function
57
+ ### Plots an `enve.RecPlot2` object.
58
+ (x,
59
+ ### `enve.RecPlot2` object to plot.
60
+ layout=matrix(c(5,5,2,1,4,3), nrow=2),
61
+ ### Matrix indicating the position of the different panels in the layout,
62
+ ### where:
63
+ ### 0: Empty space,
64
+ ### 1: Counts matrix,
65
+ ### 2: position histogram (sequencing depth),
66
+ ### 3: identity histogram,
67
+ ### 4: Populations histogram (histogram of sequencing depths),
68
+ ### 5: Color scale for the counts matrix (vertical),
69
+ ### 6: Color scale of the counts
70
+ ### matrix (horizontal). Only panels indicated here will be plotted. To
71
+ ### plot only one panel simply set this to the number of the panel you
72
+ ### want to plot.
73
+ widths=c(1,7,2),
74
+ ### Relative widths of the columns of `layout`.
75
+ heights=c(1,2),
76
+ ### Relative heights of the rows of `layout`.
77
+ palette=grey((100:0)/100),
78
+ ### Colors to be used to represent the counts matrix, sorted from no hits
79
+ ### to the maximum sequencing depth.
80
+ underlay.group=TRUE,
81
+ ### If TRUE, it indicates the in-group and out-group areas couloured based
82
+ ### on `in.col` and `out.col`. Requires support for semi-transparency.
83
+ peaks.col='darkred',
84
+ ### If not NA, it attempts to represent peaks in the population histogram
85
+ ### in the specified color. Set to NA to avoid peak-finding.
86
+ id.lim=range(x$id.breaks),
87
+ ### Limits of identities to represent.
88
+ pos.lim=range(x$pos.breaks),
89
+ ### Limits of positions to represent (in bp, regardless of `pos.units`).
90
+ pos.units=c('Mbp','Kbp','bp'),
91
+ ### Units in which the positions should be represented (powers of 1,000
92
+ ### base pairs).
93
+ mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
94
+ '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
95
+ '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
96
+ '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
97
+ ### Margins of the panels as a list, with the character representation of
98
+ ### the number of the panel as index (see `layout`).
99
+ pos.splines=0,
100
+ ### Smoothing parameter for the splines in the position histogram. Zero
101
+ ### (0) for no splines. If non-zero, requires the stats package.
102
+ id.splines=1/2,
103
+ ### Smoothing parameter for the splines in the identity histogram. Zero
104
+ ### (0) for no splines. If non-zero, requires the stats package.
105
+ in.lwd=ifelse(pos.splines>0, 1/2, 2),
106
+ ### Line width for the sequencing depth of in-group matches.
107
+ out.lwd=ifelse(pos.splines>0, 1/2, 2),
108
+ ### Line width for the sequencing depth of out-group matches.
109
+ id.lwd=ifelse(id.splines>0, 1/2, 2),
110
+ ### Line width for the identity histogram.
111
+ in.col='darkblue',
112
+ ### Color associated to in-group matches.
113
+ out.col='lightblue',
114
+ ### Color associated to out-group matches.
115
+ id.col='black',
116
+ ### Color for the identity histogram.
117
+ breaks.col='#AAAAAA40',
118
+ ### Color of the vertical lines indicating sequence breaks.
119
+ peaks.opts=list(),
120
+ ### Options passed to `enve.recplot2.findPeaks`, if `peaks.col` is not NA.
121
+ ...
122
+ ### Any other graphic parameters (currently ignored).
123
+ ){
124
+ pos.units <- match.arg(pos.units);
125
+ pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
126
+ pos.lim <- pos.lim/pos.factor;
127
+ lmat <- layout;
128
+ for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
129
+
130
+ layout(lmat, widths=widths, heights=heights);
131
+ ori.mar <- par('mar');
132
+
133
+ # Essential vars
134
+ counts <- x$counts
135
+
136
+ id.ingroup <- x$id.ingroup
137
+ id.counts <- x$id.counts
138
+ id.breaks <- x$id.breaks
139
+ id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
140
+ id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
141
+
142
+ pos.counts.in <- x$pos.counts.in
143
+ pos.counts.out <- x$pos.counts.out
144
+ pos.breaks <- x$pos.breaks/pos.factor
145
+ pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
146
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
147
+
148
+ seqdepth.in <- pos.counts.in/pos.binsize
149
+ seqdepth.out <- pos.counts.out/pos.binsize
150
+ seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
151
+ seqdepth.out[seqdepth.out>0]))*c(1/2,2)
152
+
153
+ if(underlay.group){
154
+ in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
155
+ list(maxColorValue=256, alpha=62)));
156
+ out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
157
+ list(maxColorValue=256, alpha=52)));
158
+ }
159
+
160
+ # Counts matrix
161
+ if(any(layout==1)){
162
+ par(mar=mar[['1']]);
163
+ plot(1, t='n', bty='l',
164
+ xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
165
+ xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
166
+ if(underlay.group){
167
+ rect(pos.lim[1], id.lim[1], pos.lim[2],
168
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
169
+ rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
170
+ id.lim[2], col=in.bg, border=NA);
171
+ }
172
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col);
173
+ image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
174
+ bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
175
+ length.out=1+length(palette)), add=TRUE);
176
+ }
177
+
178
+ # Position histogram
179
+ if(any(layout==2)){
180
+ par(mar=mar[['2']]);
181
+ if(any(layout==1)){
182
+ xlab=''
183
+ xaxt='n'
184
+ }else{
185
+ xlab=paste('Position in genome (',pos.units,')',sep='')
186
+ xaxt='s'
187
+ }
188
+ plot(1,t='n', bty='l', log='y',
189
+ xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
190
+ ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
191
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col)
192
+ pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
193
+ pos.f <- rep(seqdepth.in,each=2)
194
+ lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
195
+ lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
196
+ if(pos.splines > 0){
197
+ pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
198
+ spar=pos.splines)
199
+ lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
200
+ }
201
+ if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
202
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
203
+ seqdepth.lim[1]*3/2, col=out.col, border=NA);
204
+ if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
205
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
206
+ seqdepth.lim[1]*3/2, col=in.col, border=NA);
207
+ }
208
+
209
+ # Identity histogram
210
+ if(any(layout==3)){
211
+ par(mar=mar[['3']]);
212
+ if(any(layout==1)){
213
+ ylab=''
214
+ yaxt='n'
215
+ }else{
216
+ ylab=x$id.metric
217
+ yaxt='s'
218
+ }
219
+ if(sum(id.counts>0) >= 4){
220
+ id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
221
+ plot(1,t='n', bty='l', log='x',
222
+ xlim=id.counts.range, xlab='bps per bin', xaxs='i',
223
+ ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
224
+ if(underlay.group){
225
+ rect(id.counts.range[1], id.lim[1], id.counts.range[2],
226
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
227
+ rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
228
+ id.counts.range[2], id.lim[2], col=in.bg, border=NA);
229
+ }
230
+ id.f <- rep(id.counts,each=2)
231
+ id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
232
+ lines(id.f, id.x, lwd=id.lwd, col=id.col);
233
+ if(id.splines > 0){
234
+ id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
235
+ spar=id.splines)
236
+ lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
237
+ }
238
+ }else{
239
+ plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
240
+ text(1,1,labels='Insufficient data', srt=90)
241
+ }
242
+ }
243
+
244
+ # Populations histogram
245
+ peaks <- NA;
246
+ if(any(layout==4)){
247
+ par(mar=mar[['4']]);
248
+ if(any(layout==2)){
249
+ ylab=''
250
+ yaxt='n'
251
+ }else{
252
+ ylab='Sequencing depth (X)'
253
+ yaxt='s'
254
+ }
255
+ h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
256
+ length.out=200);
257
+ h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
258
+ h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
259
+ plot(1, t='n', log='y',
260
+ xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
261
+ xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
262
+ yaxt=yaxt)
263
+ y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
264
+ y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
265
+ lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
266
+ y.tmp.out, col=out.col)
267
+ polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
268
+ y.tmp.in, border=NA, col=in.col)
269
+ if(!is.na(peaks.col)){
270
+ o <- peaks.opts; o$x = x;
271
+ peaks <- do.call(enve.recplot2.findPeaks, o);
272
+ h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
273
+ if(!is.null(peaks) & length(peaks)>0){
274
+ pf <- h.mids*0;
275
+ for(i in 1:length(peaks)){
276
+ cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
277
+ lines(cnt, h.mids, col='red');
278
+ pf <- pf+cnt;
279
+ axis(4, at=peaks[[i]]$param.hat[[length(peaks[[i]]$param.hat)]],
280
+ letters[i], las=1, hadj=1/2)
281
+ }
282
+ lines(pf, h.mids, col='red',lwd=1.5);
283
+ legend('bottomright', legend=paste(
284
+ letters[1:length(peaks)],'. ',
285
+ signif(as.numeric(lapply(peaks,
286
+ function(x) tail(as.numeric(x$param.hat),n=1))),3),'X (',
287
+ signif(100*as.numeric(lapply(peaks,
288
+ function(x) (length(x$values)/x$n.total))), 3), '%, err: ',
289
+ signif(as.numeric(lapply(peaks, function(x) x$err.res)), 3), ')',
290
+ sep=''), bty='n');
291
+ }
292
+ }
293
+ }
294
+
295
+ # Color scale
296
+ count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
297
+ length.out=1+length(palette))
298
+ if(any(layout==5)){
299
+ par(mar=mar[['5']]);
300
+ plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
301
+ ylim=range(count.bins), yaxs='i', ylab='')
302
+ rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
303
+ border=NA)
304
+ }
305
+ if(any(layout==6)){
306
+ par(mar=mar[['6']]);
307
+ plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
308
+ xlim=range(count.bins), xaxs='i',xlab='');
309
+ rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
310
+ border=NA);
311
+ }
312
+
313
+ par(mar=ori.mar);
314
+ return(peaks);
315
+ ### Returns a list of `enve.RecPlot2.Peak` objects (see
316
+ ### `enve.recplot2.findPeaks`). If `peaks.col`=NA or `layout` doesn't include
317
+ ### 4, returns NA.
318
+ }
319
+
320
+ #==============> Define core functions
321
+ enve.recplot2 <- function(
322
+ ### Produces recruitment plots provided that BlastTab.catsbj.pl has
323
+ ### been previously executed.
324
+ prefix,
325
+ ### Path to the prefix of the BlastTab.catsbj.pl output files. At
326
+ ### least the files .rec and .lim must exist with this prefix.
327
+ plot=TRUE,
328
+ ### Should the object be plotted?
329
+ pos.breaks=1e3,
330
+ ### Breaks in the positions histogram. It can also be a vector of break
331
+ ### points, and values outside the range are ignored. If zero (0), it
332
+ ### uses the sequence breaks as defined in the .lim file, which means
333
+ ### one bin per contig (or gene, if the mapping is agains genes).
334
+ id.breaks=300,
335
+ ### Breaks in the identity histogram. It can also be a vector of break
336
+ ### points, and values outside the range are ignored.
337
+ id.metric=c('identity', 'corrected identity', 'bit score'),
338
+ ### Metric of identity to be used (Y-axis). Corrected identity is only
339
+ ### supported if the original BLAST file included sequence lengths.
340
+ id.summary=sum,
341
+ ### Function summarizing the identity bins. Other recommended options
342
+ ### include: `median` to estimate the median instead of total bins, and
343
+ ### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
344
+ id.cutoff=95,
345
+ ### Cutoff of identity metric above which the hits are considered
346
+ ### 'in-group'. The 95% identity corresponds to the expectation of
347
+ ### ANI<95% within species.
348
+ threads=2,
349
+ ### Number of threads to use.
350
+ verbose=TRUE,
351
+ ### Indicates if the function should report the advance.
352
+ ...
353
+ ### Any additional parameters supported by `plot.enve.RecPlot2`.
354
+ ){
355
+ # Settings
356
+ id.metric <- match.arg(id.metric);
357
+
358
+ #Read files
359
+ if(verbose) cat("Reading files.\n")
360
+ rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
361
+ quote="");
362
+ lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
363
+ quote="", as.is=TRUE);
364
+
365
+ # Build matrix
366
+ if(verbose) cat("Building counts matrix.\n")
367
+ if(id.metric=="corrected identity" & ncol(rec)<6){
368
+ stop("Requesting corr. identity, but .rec file doesn't have 6th column")
369
+ }
370
+ rec.idcol <- ifelse(id.metric=="identity", 3,
371
+ ifelse(id.metric=="corrected identity", 6, 4));
372
+ if(length(pos.breaks)==1){
373
+ if(pos.breaks>0){
374
+ pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
375
+ }else{
376
+ pos.breaks <- c(lim[,2], tail(lim[,3], n=1))
377
+ }
378
+ }
379
+ if(length(id.breaks)==1){
380
+ id.breaks <- seq(min(rec[,rec.idcol]), max(rec[,rec.idcol]),
381
+ length.out=id.breaks+1);
382
+ }
383
+
384
+ # Run in parallel
385
+ if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
386
+ cl <- makeCluster(threads)
387
+ rec.l <- list()
388
+ thl <- ceiling(nrow(rec)/threads)
389
+ for(i in 0:(threads-1)){
390
+ rec.l[[i+1]] <- list(rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
391
+ verbose=ifelse(i==0, verbose, FALSE))
392
+ }
393
+ counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
394
+ pos.breaks=pos.breaks, id.breaks=id.breaks,
395
+ rec.idcol=rec.idcol)
396
+ counts <- counts.l[[1]]
397
+ if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
398
+ stopCluster(cl)
399
+
400
+ # Estimate 1D histograms
401
+ if(verbose) cat("Building histograms.\n")
402
+ id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
403
+ id.ingroup <- (id.mids > id.cutoff);
404
+ id.counts <- apply(counts, 2, id.summary);
405
+ pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
406
+ pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
407
+
408
+ # Plot and return
409
+ recplot <- new('enve.RecPlot2',
410
+ counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
411
+ pos.counts.out=pos.counts.out,
412
+ id.breaks=id.breaks, pos.breaks=pos.breaks,
413
+ seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
414
+ id.ingroup=id.ingroup,id.metric=id.metric,
415
+ call=match.call());
416
+ if(plot){
417
+ if(verbose) cat("Plotting.\n")
418
+ peaks <- plot(recplot, ...);
419
+ attr(recplot, "peaks") <- peaks
420
+ }
421
+ return(recplot);
422
+ ### Returns an object of class `enve.RecPlot2`.
423
+ }
424
+
425
+ enve.recplot2.findPeaks <- function(
426
+ ### Identifies peaks in the population histogram potentially indicating
427
+ ### sub-population mixtures.
428
+ x,
429
+ ### An `enve.RecPlot2` object.
430
+ min.points=10,
431
+ ### Minimum number of points in the quantile-estimation-range
432
+ ### (`quant.est`) to estimate a peak.
433
+ quant.est=c(0.002, 0.998),
434
+ ### Range of quantiles to be used in the estimation of a peak's
435
+ ### parameters.
436
+ mlv.opts=list(method='parzen'),
437
+ ### Options passed to `mlv` to estimate the mode.
438
+ fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
439
+ start=list(omega=1, alpha=-1), lower=c(1e-6, -Inf, 0),
440
+ upper=c(Inf, 0, Inf)),
441
+ ### Options passed to `fitdist` to estimate the standard deviation if
442
+ ### with.skewness=TRUE. Note that the `start` parameter will be ammended
443
+ ### with xi=estimated mode for each peak.
444
+ fitdist.opts.norm=list(distr='norm', method='qme', probs=c(.4,.6),
445
+ start=list(sd=1), lower=c(1e-8, 0)),
446
+ ### Options passed to `fitdist` to estimate the standard deviation if
447
+ ### with.skewness=FALSE. Note that the `start` parameter will be ammended
448
+ ### with mean=estimated mode for each peak.
449
+ rm.top=0.05,
450
+ ### Top-values to remove before finding peaks, as a quantile probability.
451
+ ### This step is useful to remove highly conserved regions, but can be
452
+ ### turned off by setting rm.top=0. The quantile is determined *after*
453
+ ### removing zero-coverage windows.
454
+ with.skewness=TRUE,
455
+ ### Allow skewness correction of the peaks. Typically, the
456
+ ### sequencing-depth distribution for a single peak is left-skewed, due
457
+ ### partly (but not exclusively) to fragmentation and mapping sensitivity.
458
+ ### See Lindner et al 2013, Bioinformatics 29(10):1260-7 for an
459
+ ### alternative solution for the first problem (fragmentation) called
460
+ ### "tail distribution".
461
+ optim.rounds=200,
462
+ ### Maximum rounds of peak optimization.
463
+ optim.epsilon=1e-8,
464
+ ### Trace change at which optimization stops (unless `optim.rounds` is
465
+ ### reached first). The trace change is estimated as the sum of square
466
+ ### differences between parameters in one round and those from two rounds
467
+ ### earlier (to avoid infinite loops from approximation).
468
+ merge.logdist=log(1.75),
469
+ ### Maximum value of |log-ratio| between centrality parameters in peaks to
470
+ ### attempt merging. The default of ~0.22 corresponds to a maximum
471
+ ### difference of 25%.
472
+ verbose=FALSE
473
+ ### Display (mostly debugging) information.
474
+ ){
475
+
476
+ # Essential vars
477
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
478
+ seqdepth.in <- x$pos.counts.in/pos.binsize;
479
+ lsd1 <- seqdepth.in[seqdepth.in>0];
480
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
481
+ if(with.skewness){
482
+ fitdist.opts <- fitdist.opts.sn
483
+ }else{
484
+ fitdist.opts <- fitdist.opts.norm
485
+ }
486
+ peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
487
+ mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
488
+ optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
489
+ n.total=length(lsd1), merge.logdist=merge.logdist)
490
+
491
+ # Find seed peaks
492
+ if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
493
+ peaks <- enve.recplot2.__findPeaks(peaks.opts);
494
+
495
+ # Merge overlapping peaks
496
+ if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
497
+ merged <- (length(peaks)>1)
498
+ while(merged){
499
+ merged <- FALSE
500
+ ignore <- c()
501
+ peaks2 <- list();
502
+ for(i in 1:length(peaks)){
503
+ if(i %in% ignore) next
504
+ p <- peaks[[ i ]]
505
+ j <- enve.recplot2.__whichClosestPeak(p, peaks)
506
+ p2 <- peaks[[ j ]]
507
+ dst.a <- p$param.hat[[ length(p$param.hat) ]]
508
+ dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
509
+ if( abs(log(dst.a/dst.b)) < merge.logdist ){
510
+ if(verbose) cat('==> Attempting a merge at',
511
+ p$param.hat[[ length(p$param.hat) ]],'&',
512
+ p2$param.hat[[ length(p2$param.hat) ]],'X\n');
513
+ peaks.opts$lsd1 <- c(p$values, p2$values)
514
+ p.new <- enve.recplot2.__findPeaks(peaks.opts)
515
+ if(length(p.new)==1){
516
+ peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
517
+ ignore <- c(ignore, j)
518
+ merged <- TRUE
519
+ }
520
+ }
521
+ if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
522
+ }
523
+ peaks <- peaks2
524
+ if(length(peaks)==1) break
525
+ }
526
+
527
+ if(verbose) cat('Found',length(peaks),'peak(s)\n')
528
+ return(peaks);
529
+ ### Returns a list of `enve.RecPlot2.Peak` objects.
530
+ }
531
+
532
+ #==============> Define utils
533
+ enve.recplot2.corePeak <- function
534
+ ### Finds the peak in a list of peaks that is most likely to represent the
535
+ ### "core genome" of a population.
536
+ (x
537
+ ### `list` of `enve.RecPlot2.Peak` objects.
538
+ ){
539
+ # Find the peak with maximum depth (centrality)
540
+ maxPeak <- x[[
541
+ which.max(as.numeric(lapply(x,
542
+ function(y) y$param.hat[[ length(y$param.hat) ]])))
543
+ ]]
544
+ # If a "larger" peak (a peak explaining more bins of the genome) is within
545
+ # the "merge.logdist" distance, take that one instead.
546
+ corePeak <- maxPeak
547
+ for(p in x){
548
+ sz.d = log(length(p$values)/length(corePeak$values))
549
+ if(sz.d < 0)
550
+ next;
551
+ sq.d.a <- p$param.hat[[ length(p$param.hat) ]]
552
+ sq.d.b <- maxPeak$param.hat[[ length(maxPeak$param.hat) ]]
553
+ if(abs(log(sq.d.a/sq.d.b )) < maxPeak$merge.logdist+sz.d/5)
554
+ corePeak <- p
555
+ }
556
+ return(corePeak)
557
+ }
558
+
559
+ enve.recplot2.changeCutoff <- function
560
+ ### Change the intra-species cutoff of an existing recruitment plot.
561
+ (rp,
562
+ ### enve.RecPlot2 object.
563
+ new.cutoff=98
564
+ ### New cutoff to use.
565
+ ){
566
+ # Re-calculate vectors
567
+ id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
568
+ id.ingroup <- (id.mids > new.cutoff)
569
+ pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
570
+ pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
571
+ # Update object
572
+ attr(rp, "id.ingroup") <- id.ingroup
573
+ attr(rp, "pos.counts.in") <- pos.counts.in
574
+ attr(rp, "pos.counts.out") <- pos.counts.out
575
+ attr(rp, "call") <- match.call()
576
+ return(rp)
577
+ }
578
+
579
+ enve.recplot2.extractWindows <- function
580
+ ### Extract windows significantly below (or above) the peak in sequencing
581
+ ### depth.
582
+ (rp,
583
+ ### Recruitment plot, a enve.Recplot2 object.
584
+ peak,
585
+ ### Peak, a enve.RecPlot2.Peak object. If list, it is assumed to be a list
586
+ ### of enve.RecPlot2.Peak objects, in which case the core peak is used
587
+ ### (see enve.recplot2.corePeak).
588
+ lower.tail=TRUE,
589
+ ### If FALSE, it returns windows significantly above the peak in
590
+ ### sequencing depth.
591
+ significance=0.05,
592
+ ### Significance threshold (alpha) to select windows.
593
+ seq.names=FALSE
594
+ ### Returns subject sequence names instead of a vector of Booleans. It
595
+ ### assumes that the recruitment plot was generated with pos.breaks=0.
596
+ ){
597
+ # Determine the threshold
598
+ if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
599
+ par <- peak$param.hat
600
+ par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
601
+ thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
602
+
603
+ # Estimate sequencing depths per window
604
+ pos.cnts.in <- rp$pos.counts.in
605
+ pos.breaks <- rp$pos.breaks
606
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
607
+ seqdepth.in <- pos.cnts.in/pos.binsize
608
+
609
+ # Select windows past the threshold
610
+ if(lower.tail){
611
+ sel <- seqdepth.in < thr
612
+ }else{
613
+ sel <- seqdepth.in > thr
614
+ }
615
+ if(!seq.names) return(sel)
616
+ if(length(seqdepth.in) != length(rp$seq.names))
617
+ stop(paste("Requesting subject sequence names, but the recruitment plot",
618
+ "was not generated with pos.breaks=0."))
619
+ return(rp$seq.names[sel])
620
+ }
621
+
622
+ #==============> Define internal functions
623
+ enve.recplot2.__counts <- function
624
+ ### Internal ancilliary function (see `enve.recplot2`).
625
+ (x, pos.breaks, id.breaks, rec.idcol){
626
+ rec <- x$rec
627
+ verbose <- x$verbose
628
+ counts <- matrix(0, nrow=length(pos.breaks)-1, ncol=length(id.breaks)-1);
629
+ for(i in 1:nrow(rec)){
630
+ if(verbose & i%%100==0) cat(" [",signif(i*100/nrow(rec),3),"% ] \r");
631
+ y.bin <- which(
632
+ rec[i,rec.idcol]>=id.breaks[-length(id.breaks)] &
633
+ rec[i,rec.idcol]<=id.breaks[-1])[1] ;
634
+ for(pos in rec[i,1]:rec[i,2]){
635
+ x.bin <- which(
636
+ pos>=pos.breaks[-length(pos.breaks)] & pos<=pos.breaks[-1])[1] ;
637
+ counts[x.bin, y.bin] <- counts[x.bin, y.bin]+1 ;
638
+ }
639
+ }
640
+ return(counts);
641
+ }
642
+
643
+ enve.recplot2.__peakHist <- function
644
+ ### Internal ancilliary function (see `enve.RecPlot2.Peak`).
645
+ (x, mids, counts=TRUE){
646
+ d.o <- x$param.hat
647
+ d.o$x <- mids
648
+ prob <- do.call(paste('d', x$dist, sep=''), d.o)
649
+ if(!counts) return(prob)
650
+ if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
651
+ return(prob*x$n.hat/sum(prob))
652
+ }
653
+
654
+ enve.recplot2.__findPeak <- function
655
+ ### Internall ancilliary function (see `enve.recplot2.findPeaks`).
656
+ (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
657
+ optim.rounds, optim.epsilon, n.total, merge.logdist, verbose
658
+ ){
659
+ dist <- ifelse(with.skewness, 'sn', 'norm');
660
+
661
+ # Find peak
662
+ o <- mlv.opts; o$x = lsd1;
663
+ mode1 <- do.call(mlv, o)$M;
664
+ if(verbose) cat('Anchoring at mode =',mode1,'\n')
665
+ param.hat <- fitdist.opts$start; last.hat <- param.hat;
666
+ lim <- NA;
667
+ if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
668
+
669
+ # Refine peak parameters
670
+ for(round in 1:optim.rounds){
671
+ param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
672
+ lim.o <- param.hat
673
+ lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
674
+ lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
675
+ if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
676
+ 'and params =',as.numeric(param.hat),' \r')
677
+ if(length(lsd1.pop) < min.points) break;
678
+ o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
679
+ last.last.hat <- last.hat
680
+ last.hat <- param.hat
681
+ param.hat <- as.list(do.call(fitdist, o)$estimate);
682
+ if(any(is.na(param.hat))){
683
+ if(round>1) param.hat <- last.hat;
684
+ break;
685
+ }
686
+ epsilon <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
687
+ if(round>2) if(epsilon < optim.epsilon) break;
688
+ }
689
+ if(verbose) cat('\n')
690
+ if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
691
+
692
+ # Mow distribution
693
+ lsd2 <- c();
694
+ lsd.pop <- c();
695
+ n.hat <- length(lsd1.pop)/diff(quant.est)
696
+ peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
697
+ param.hat=param.hat, n.hat=n.hat, n.total=n.total,
698
+ merge.logdist=merge.logdist)
699
+ peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
700
+ peak.cnt <- enve.recplot2.__peakHist(peak,
701
+ (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
702
+ for(i in 2:length(peak.breaks)){
703
+ values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
704
+ n.exp <- peak.cnt[i-1]
705
+ if(n.exp==0) n.exp=0.1
706
+ if(length(values)==0) next
707
+ in.peak <- runif(length(values)) <= n.exp/length(values)
708
+ lsd2 <- c(lsd2, values[!in.peak])
709
+ lsd.pop <- c(lsd.pop, values[in.peak])
710
+ }
711
+ if(length(lsd.pop) < min.points) return(NULL)
712
+
713
+ # Return peak
714
+ attr(peak, 'values') <- lsd.pop
715
+ attr(peak, 'values.res') <- lsd2
716
+ attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
717
+ plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
718
+ plot=FALSE)$counts)+1)/2
719
+ if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
720
+ 'with expected n =',n.hat,'\n')
721
+ return(peak)
722
+ }
723
+
724
+ enve.recplot2.__findPeaks <- function
725
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
726
+ (peaks.opts){
727
+ peaks <- list()
728
+ while(length(peaks.opts$lsd1) > peaks.opts$min.points){
729
+ peak <- do.call(enve.recplot2.__findPeak, peaks.opts)
730
+ if(is.null(peak)) break
731
+ peaks[[ length(peaks)+1 ]] <- peak
732
+ peaks.opts$lsd1 <- peak$values.res
733
+ }
734
+ return(peaks)
735
+ }
736
+
737
+
738
+ enve.recplot2.__whichClosestPeak <- function
739
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
740
+ (peak, peaks){
741
+ dist <- as.numeric(lapply(peaks, function(x) abs(log(x$param.hat[[ length(x$param.hat) ]]/peak$param.hat[[ length(peak$param.hat) ]] ))))
742
+ dist[ dist==0 ] <- Inf
743
+ return(which.min(dist))
744
+ }
745
+