miga-base 1.2.17.0 → 1.2.17.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (299) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +1 -1
  3. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  4. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  5. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  6. data/utils/FastAAI/FastAAI +3659 -0
  7. data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
  8. data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
  9. data/utils/FastAAI/README.md +84 -0
  10. data/utils/enveomics/Docs/recplot2.md +244 -0
  11. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  12. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  13. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  14. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  15. data/utils/enveomics/LICENSE.txt +73 -0
  16. data/utils/enveomics/Makefile +52 -0
  17. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  18. data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
  19. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  20. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  21. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  22. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  23. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  24. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  25. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  26. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  27. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
  28. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  29. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  30. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  31. data/utils/enveomics/Manifest/categories.json +165 -0
  32. data/utils/enveomics/Manifest/examples.json +162 -0
  33. data/utils/enveomics/Manifest/tasks.json +4 -0
  34. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  35. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  36. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  37. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  38. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  39. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  40. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  41. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  42. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  43. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  44. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  45. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  46. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  47. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  48. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  49. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  50. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  51. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  52. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  53. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  54. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  55. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  56. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  57. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  58. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  59. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  60. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  61. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  62. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  63. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  64. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  65. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  66. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  67. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  68. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  69. data/utils/enveomics/README.md +42 -0
  70. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  71. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  72. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  73. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  74. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  75. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  76. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  77. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  78. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  79. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  80. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  81. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  82. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  83. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  84. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  85. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  86. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  87. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  88. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  89. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  90. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  91. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  92. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  93. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
  94. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  95. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  96. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  97. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  98. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  99. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  100. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  101. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  102. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  103. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  104. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  105. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  106. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  107. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  108. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  109. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  110. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  111. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  112. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  113. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  114. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  115. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  116. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  117. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  118. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  119. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  120. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  121. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  122. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  123. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  124. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  125. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  126. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  127. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  128. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  129. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  130. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  131. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  132. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  133. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  134. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  135. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  136. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  137. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  138. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  139. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  140. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  141. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  142. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  143. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  144. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  145. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  146. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  147. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  148. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  149. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  150. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  151. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  152. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  153. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  154. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  155. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  156. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  157. data/utils/enveomics/Scripts/aai.rb +421 -0
  158. data/utils/enveomics/Scripts/ani.rb +362 -0
  159. data/utils/enveomics/Scripts/anir.rb +137 -0
  160. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  161. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  162. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  163. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  164. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  165. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  166. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  167. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  168. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  169. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  170. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  171. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  172. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
  173. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  174. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  175. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  176. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  177. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  178. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  179. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  180. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
  181. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  182. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  183. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  184. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  185. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  186. data/utils/enveomics/Scripts/ogs.rb +104 -0
  187. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  188. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  189. data/utils/enveomics/Scripts/rbm.rb +108 -0
  190. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  191. data/utils/enveomics/Tests/Makefile +10 -0
  192. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  193. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  194. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  195. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  196. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  197. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  198. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  199. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  200. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  201. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  202. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  203. data/utils/enveomics/Tests/alkB.nwk +1 -0
  204. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  205. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  206. data/utils/enveomics/Tests/hiv1.faa +59 -0
  207. data/utils/enveomics/Tests/hiv1.fna +134 -0
  208. data/utils/enveomics/Tests/hiv2.faa +70 -0
  209. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  210. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  211. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  212. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  213. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  214. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  215. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  216. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  217. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  218. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  219. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  220. data/utils/enveomics/build_enveomics_r.bash +45 -0
  221. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  222. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  223. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  224. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  225. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  226. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  227. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  228. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  229. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  230. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  231. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  232. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  233. data/utils/enveomics/enveomics.R/README.md +81 -0
  234. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  235. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  236. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  237. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  238. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  239. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  240. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  241. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  242. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  243. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  244. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  245. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  246. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  247. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  248. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  249. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  250. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  251. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  252. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  253. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  254. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  255. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  256. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  257. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  258. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  259. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  260. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  261. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  262. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  263. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  264. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  265. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  266. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  267. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  268. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  269. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  270. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  271. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  272. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  273. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  274. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  275. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  276. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  277. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  278. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  279. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  280. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  282. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  283. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  284. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  285. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  286. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  287. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  288. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  289. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  290. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  291. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  292. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  293. data/utils/enveomics/globals.mk +8 -0
  294. data/utils/enveomics/manifest.json +9 -0
  295. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  296. data/utils/multitrim/README.md +67 -0
  297. data/utils/multitrim/multitrim.py +1555 -0
  298. data/utils/multitrim/multitrim.yml +13 -0
  299. metadata +301 -5
@@ -0,0 +1,1631 @@
1
+ #==============> Define S4 classes
2
+
3
+ #' Enveomics: Recruitment Plot (2) - S4 Class
4
+ #'
5
+ #' Enve-omics representation of Recruitment plots. This object can
6
+ #' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
7
+ #'
8
+ #' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
9
+ #' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
10
+ #' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
11
+ #' @slot id.counts \code{(numeric)} Counts per ID bin.
12
+ #' @slot id.breaks \code{(numeric)} Breaks of identity bins.
13
+ #' @slot pos.breaks \code{(numeric)} Breaks of position bins.
14
+ #' @slot pos.names \code{(character)} Names of the position bins.
15
+ #' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
16
+ #' @slot peaks \code{(list)} Peaks identified in the recplot.
17
+ #' Limits of the subject sequences after concatenation.
18
+ #' @slot seq.names \code{(character}) Names of the subject sequences.
19
+ #' @slot id.metric \code{(character}) Metric used as 'identity'.
20
+ #' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
21
+ #' @slot call \code{(call)} Call producing this object.
22
+ #'
23
+ #' @author Luis M. Rodriguez-R [aut, cre]
24
+ #'
25
+ #' @exportClass
26
+
27
+ enve.RecPlot2 <- setClass("enve.RecPlot2",
28
+ representation(
29
+ # slots = list(
30
+ counts='matrix',
31
+ pos.counts.in='numeric',
32
+ pos.counts.out='numeric',
33
+ id.counts='numeric',
34
+ id.breaks='numeric',
35
+ pos.breaks='numeric',
36
+ pos.names='character',
37
+ seq.breaks='numeric',
38
+ peaks='list',
39
+ seq.names='character',
40
+ id.metric='character',
41
+ id.ingroup='logical',
42
+ call='call')
43
+ ,package='enveomics.R'
44
+ );
45
+
46
+ #' Enveomics: Recruitment Plot (2) Peak - S4 Class
47
+ #'
48
+ #' Enve-omics representation of a peak in the sequencing depth histogram
49
+ #' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
50
+ #'
51
+ #' @slot dist \code{(character)}
52
+ #' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
53
+ #' (skew-normal).
54
+ #' @slot values \code{(numeric)}
55
+ #' Sequencing depth values predicted to conform the peak.
56
+ #' @slot values.res \code{(numeric)}
57
+ #' Sequencing depth values not explained by this or previously identified
58
+ #' peaks.
59
+ #' @slot mode \code{(numeric)}
60
+ #' Seed-value of mode anchoring the peak.
61
+ #' @slot param.hat \code{(list)}
62
+ #' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
63
+ #' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
64
+ #' xi=location). Note that the "dispersion" parameter is always first and
65
+ #' the "location" parameter is always last.
66
+ #' @slot n.hat \code{(numeric)}
67
+ #' Number of bins estimated to be explained by this peak. This should
68
+ #' ideally be equal to the length of \code{values}, but it's not an integer.
69
+ #' @slot n.total \code{(numeric)}
70
+ #' Total number of bins from which the peak was extracted. I.e., total
71
+ #' number of position bins with non-zero sequencing depth in the recruitment
72
+ #' plot (regardless of peak count).
73
+ #' @slot err.res \code{(numeric)}
74
+ #' Error left after adding the peak (mower) or log-likelihood (em or emauto).
75
+ #' @slot merge.logdist \code{(numeric)}
76
+ #' Attempted \code{merge.logdist} parameter.
77
+ #' @slot seq.depth \code{(numeric)}
78
+ #' Best estimate available for the sequencing depth of the peak (centrality).
79
+ #' @slot log \code{(logical)}
80
+ #' Indicates if the estimation was performed in natural logarithm space.
81
+ #'
82
+ #' @author Luis M. Rodriguez-R [aut, cre]
83
+ #'
84
+ #' @exportClass
85
+
86
+ enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
87
+ representation(
88
+ # slots = list(
89
+ dist='character',
90
+ values='numeric',
91
+ values.res='numeric',
92
+ mode='numeric',
93
+ param.hat='list',
94
+ n.hat='numeric',
95
+ n.total='numeric',
96
+ err.res='numeric',
97
+ merge.logdist='numeric',
98
+ seq.depth='numeric',
99
+ log='logical'
100
+ ));
101
+
102
+ #' Attribute accessor
103
+ #'
104
+ #'
105
+ #' @param x Object
106
+ #' @param name Attribute name
107
+ setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
108
+
109
+ #' Attribute accessor
110
+ #'
111
+ #'
112
+ #' @param x Object
113
+ #' @param name Attribute name
114
+ setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
115
+
116
+ #==============> Define S4 methods
117
+
118
+ #' Enveomics: Recruitment Plot (2)
119
+ #'
120
+ #' Plots an \code{\link{enve.RecPlot2}} object.
121
+ #'
122
+ #' @param x
123
+ #' \code{\link{enve.RecPlot2}} object to plot.
124
+ #' @param layout
125
+ #' Matrix indicating the position of the different panels in the layout,
126
+ #' where:
127
+ #' \itemize{
128
+ #' \item 0: Empty space
129
+ #' \item 1: Counts matrix
130
+ #' \item 2: position histogram (sequencing depth)
131
+ #' \item 3: identity histogram
132
+ #' \item 4: Populations histogram (histogram of sequencing depths)
133
+ #' \item 5: Color scale for the counts matrix (vertical)
134
+ #' \item 6: Color scale of the counts matrix (horizontal)
135
+ #' }
136
+ #' Only panels indicated here will be plotted. To plot only one panel
137
+ #' simply set this to the number of the panel you want to plot.
138
+ #' @param panel.fun
139
+ #' List of functions to be executed after drawing each panel. Use the
140
+ #' indices in \code{layout} (as characters) as keys. Functions for indices
141
+ #' missing in \code{layout} are ignored. For example, to add a vertical line
142
+ #' at the 3Mbp mark in both the position histogram and the counts matrix:
143
+ #' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
144
+ #' Note that the X-axis in both panels is in Mbp by default. To change
145
+ #' this behavior, set \code{pos.units} accordingly.
146
+ #' @param widths
147
+ #' Relative widths of the columns of \code{layout}.
148
+ #' @param heights
149
+ #' Relative heights of the rows of \code{layout}.
150
+ #' @param palette
151
+ #' Colors to be used to represent the counts matrix, sorted from no hits
152
+ #' to the maximum sequencing depth.
153
+ #' @param underlay.group
154
+ #' If TRUE, it indicates the in-group and out-group areas couloured based
155
+ #' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
156
+ #' @param peaks.col
157
+ #' If not \code{NA}, it attempts to represent peaks in the population histogram
158
+ #' in the specified color. Set to \code{NA} to avoid peak-finding.
159
+ #' @param use.peaks
160
+ #' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
161
+ #' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
162
+ #' @param id.lim
163
+ #' Limits of identities to represent.
164
+ #' @param pos.lim
165
+ #' Limits of positions to represent (in bp, regardless of \code{pos.units}).
166
+ #' @param pos.units
167
+ #' Units in which the positions should be represented (powers of 1,000
168
+ #' base pairs).
169
+ #' @param mar
170
+ #' Margins of the panels as a list, with the character representation of
171
+ #' the number of the panel as index (see \code{layout}).
172
+ #' @param pos.splines
173
+ #' Smoothing parameter for the splines in the position histogram. Zero
174
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
175
+ #' cross-validation.
176
+ #' @param id.splines
177
+ #' Smoothing parameter for the splines in the identity histogram. Zero
178
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
179
+ #' cross-validation.
180
+ #' @param in.lwd
181
+ #' Line width for the sequencing depth of in-group matches.
182
+ #' @param out.lwd
183
+ #' Line width for the sequencing depth of out-group matches.
184
+ #' @param id.lwd
185
+ #' Line width for the identity histogram.
186
+ #' @param in.col
187
+ #' Color associated to in-group matches.
188
+ #' @param out.col
189
+ #' Color associated to out-group matches.
190
+ #' @param id.col
191
+ #' Color for the identity histogram.
192
+ #' @param breaks.col
193
+ #' Color of the vertical lines indicating sequence breaks.
194
+ #' @param peaks.opts
195
+ #' Options passed to \code{\link{enve.recplot2.findPeaks}},
196
+ #' if \code{peaks.col} is not \code{NA}.
197
+ #' @param ...
198
+ #' Any other graphic parameters (currently ignored).
199
+ #'
200
+ #' @return
201
+ #' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
202
+ #' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
203
+ #' \code{layout} doesn't include 4, returns \code{NA}.
204
+ #'
205
+ #' @author Luis M. Rodriguez-R [aut, cre]
206
+ #'
207
+ #' @method plot enve.RecPlot2
208
+ #' @export
209
+
210
+ plot.enve.RecPlot2 <- function
211
+ (x,
212
+ layout=matrix(c(5,5,2,1,4,3), nrow=2),
213
+ panel.fun=list(),
214
+ widths=c(1,7,2),
215
+ heights=c(1,2),
216
+ palette=grey((100:0)/100),
217
+ underlay.group=TRUE,
218
+ peaks.col='darkred',
219
+ use.peaks,
220
+ id.lim=range(x$id.breaks),
221
+ pos.lim=range(x$pos.breaks),
222
+ pos.units=c('Mbp','Kbp','bp'),
223
+ mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
224
+ '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
225
+ '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
226
+ '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
227
+ pos.splines=0,
228
+ id.splines=1/2,
229
+ in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
230
+ out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
231
+ id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
232
+ in.col='darkblue',
233
+ out.col='lightblue',
234
+ id.col='black',
235
+ breaks.col='#AAAAAA40',
236
+ peaks.opts=list(),
237
+ ...
238
+ ){
239
+ pos.units <- match.arg(pos.units);
240
+ pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
241
+ pos.lim <- pos.lim/pos.factor;
242
+ lmat <- layout;
243
+ for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
244
+
245
+ layout(lmat, widths=widths, heights=heights);
246
+ ori.mar <- par('mar');
247
+
248
+ # Essential vars
249
+ counts <- x$counts
250
+
251
+ id.ingroup <- x$id.ingroup
252
+ id.counts <- x$id.counts
253
+ id.breaks <- x$id.breaks
254
+ id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
255
+ id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
256
+
257
+ pos.counts.in <- x$pos.counts.in
258
+ pos.counts.out <- x$pos.counts.out
259
+ pos.breaks <- x$pos.breaks/pos.factor
260
+ pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
261
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
262
+
263
+ seqdepth.in <- pos.counts.in/pos.binsize
264
+ seqdepth.out <- pos.counts.out/pos.binsize
265
+ seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
266
+ seqdepth.out[seqdepth.out>0]))*c(1/2,2)
267
+
268
+ if(underlay.group){
269
+ in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
270
+ list(maxColorValue=256, alpha=62)));
271
+ out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
272
+ list(maxColorValue=256, alpha=52)));
273
+ }
274
+
275
+ # [1] Counts matrix
276
+ if(any(layout==1)){
277
+ par(mar=mar[['1']]);
278
+ plot(1, t='n', bty='l',
279
+ xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
280
+ xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
281
+ if(underlay.group){
282
+ rect(pos.lim[1], id.lim[1], pos.lim[2],
283
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
284
+ rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
285
+ id.lim[2], col=in.bg, border=NA);
286
+ }
287
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col);
288
+ image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
289
+ bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
290
+ length.out=1+length(palette)), add=TRUE);
291
+ if(exists('1',panel.fun)) panel.fun[['1']]();
292
+ }
293
+
294
+ # [2] Position histogram
295
+ if(any(layout==2)){
296
+ par(mar=mar[['2']]);
297
+ if(any(layout==1)){
298
+ xlab=''
299
+ xaxt='n'
300
+ }else{
301
+ xlab=paste('Position in genome (',pos.units,')',sep='')
302
+ xaxt='s'
303
+ }
304
+ plot(1,t='n', bty='l', log='y',
305
+ xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
306
+ ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
307
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col)
308
+ pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
309
+ pos.f <- rep(seqdepth.in,each=2)
310
+ lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
311
+ lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
312
+ if(is.null(pos.splines) || pos.splines > 0){
313
+ pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
314
+ spar=pos.splines)
315
+ lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
316
+ }
317
+ if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
318
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
319
+ seqdepth.lim[1]*3/2, col=out.col, border=NA);
320
+ if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
321
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
322
+ seqdepth.lim[1]*3/2, col=in.col, border=NA);
323
+ if(exists('2',panel.fun)) panel.fun[['2']]();
324
+ }
325
+
326
+ # [3] Identity histogram
327
+ if(any(layout==3)){
328
+ par(mar=mar[['3']]);
329
+ if(any(layout==1)){
330
+ ylab=''
331
+ yaxt='n'
332
+ }else{
333
+ ylab=x$id.metric
334
+ yaxt='s'
335
+ }
336
+ if(sum(id.counts>0) >= 4){
337
+ id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
338
+ plot(1,t='n', bty='l', log='x',
339
+ xlim=id.counts.range, xlab='bps per bin', xaxs='i',
340
+ ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
341
+ if(underlay.group){
342
+ rect(id.counts.range[1], id.lim[1], id.counts.range[2],
343
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
344
+ rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
345
+ id.counts.range[2], id.lim[2], col=in.bg, border=NA);
346
+ }
347
+ id.f <- rep(id.counts,each=2)
348
+ id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
349
+ lines(id.f, id.x, lwd=id.lwd, col=id.col);
350
+ if(is.null(id.splines) || id.splines > 0){
351
+ id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
352
+ spar=id.splines)
353
+ lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
354
+ }
355
+ }else{
356
+ plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
357
+ text(1,1,labels='Insufficient data', srt=90)
358
+ }
359
+ if(exists('3',panel.fun)) panel.fun[['3']]();
360
+ }
361
+
362
+ # [4] Populations histogram
363
+ peaks <- NA;
364
+ if(any(layout==4)){
365
+ par(mar=mar[['4']]);
366
+ if(any(layout==2)){
367
+ ylab=''
368
+ yaxt='n'
369
+ }else{
370
+ ylab='Sequencing depth (X)'
371
+ yaxt='s'
372
+ }
373
+ h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
374
+ length.out=200);
375
+ h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
376
+ h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
377
+ plot(1, t='n', log='y',
378
+ xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
379
+ xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
380
+ yaxt=yaxt)
381
+ y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
382
+ y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
383
+ lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
384
+ y.tmp.out, col=out.col)
385
+ polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
386
+ y.tmp.in, border=NA, col=in.col)
387
+ if(!is.na(peaks.col)){
388
+ o <- peaks.opts; o$x = x;
389
+ if(missing(use.peaks)){
390
+ peaks <- do.call(enve.recplot2.findPeaks, o)
391
+ }else{
392
+ peaks <- use.peaks
393
+ }
394
+ h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
395
+ if(!is.null(peaks) & length(peaks)>0){
396
+ pf <- h.mids*0;
397
+ for(i in 1:length(peaks)){
398
+ cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
399
+ lines(cnt, h.mids, col='red');
400
+ pf <- pf+cnt;
401
+ axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
402
+ }
403
+ lines(pf, h.mids, col='red',lwd=1.5);
404
+ dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
405
+ frx <- signif(100*as.numeric(
406
+ lapply(peaks,
407
+ function(x) ifelse(length(x$values)==0, x$n.hat,
408
+ length(x$values))/x$n.total)), 2)
409
+ if(peaks[[1]]$err.res < 0){
410
+ err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
411
+ }else{
412
+ err <- paste(', err:',
413
+ signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
414
+ }
415
+ legend('topright', bty='n', cex=1/2,
416
+ legend=paste(letters[1:length(peaks)],'. ',
417
+ dpt,'X (', frx, '%', err, ')', sep=''))
418
+ }
419
+ }
420
+ if(exists('4',panel.fun)) panel.fun[['4']]();
421
+ }
422
+
423
+ # [5] Color scale of the counts matrix (vertical)
424
+ count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
425
+ length.out=1+length(palette))
426
+ if(any(layout==5)){
427
+ par(mar=mar[['5']]);
428
+ plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
429
+ ylim=range(count.bins), yaxs='i', ylab='')
430
+ rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
431
+ border=NA)
432
+ if(exists('5',panel.fun)) panel.fun[['5']]();
433
+ }
434
+
435
+ # [6] Color scale of the coutnts matrix (horizontal)
436
+ if(any(layout==6)){
437
+ par(mar=mar[['6']]);
438
+ plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
439
+ xlim=range(count.bins), xaxs='i',xlab='');
440
+ rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
441
+ border=NA);
442
+ if(exists('6',panel.fun)) panel.fun[['6']]();
443
+ }
444
+
445
+ par(mar=ori.mar);
446
+ return(peaks);
447
+ }
448
+
449
+ #==============> Define core functions
450
+
451
+ #' Enveomics: Recruitment Plot (2)
452
+ #'
453
+ #' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
454
+ #' been previously executed.
455
+ #'
456
+ #' @param prefix
457
+ #' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
458
+ #' least the files .rec and .lim must exist with this prefix.
459
+ #' @param plot
460
+ #' Should the object be plotted?
461
+ #' @param pos.breaks
462
+ #' Breaks in the positions histogram. It can also be a vector of break
463
+ #' points, and values outside the range are ignored. If zero (0), it
464
+ #' uses the sequence breaks as defined in the .lim file, which means
465
+ #' one bin per contig (or gene, if the mapping is agains genes). Ignored
466
+ #' if `pos.breaks.tsv` is passed.
467
+ #' @param pos.breaks.tsv
468
+ #' Path to a list of (absolute) coordinates to use as position breaks.
469
+ #' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
470
+ #' must contain at least one column: coordinates of the break positions of
471
+ #' each position bin. If it has a second column, this is used as the name
472
+ #' of the position bin that ends at the given coordinate (the first row is
473
+ #' ignored). Any additional columns are currently ignored. If \code{NA},
474
+ #' position bins are determined by \code{pos.breaks}.
475
+ #' @param id.breaks
476
+ #' Breaks in the identity histogram. It can also be a vector of break
477
+ #' points, and values outside the range are ignored.
478
+ #' @param id.free.range
479
+ #' Indicates that the range should be freely set from the observed
480
+ #' values. Otherwise, 70-100\% is included in the identity histogram
481
+ #' (default).
482
+ #' @param id.metric
483
+ #' Metric of identity to be used (Y-axis). Corrected identity is only
484
+ #' supported if the original BLAST file included sequence lengths.
485
+ #' @param id.summary
486
+ #' Function summarizing the identity bins. Other recommended options
487
+ #' include: \code{median} to estimate the median instead of total bins, and
488
+ #' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
489
+ #' @param id.cutoff
490
+ #' Cutoff of identity metric above which the hits are considered
491
+ #' \code{in-group}. The 95\% identity corresponds to the expectation of
492
+ #' ANI<95\% within species.
493
+ #' @param threads
494
+ #' Number of threads to use.
495
+ #' @param verbose
496
+ #' Indicates if the function should report the advance.
497
+ #' @param ...
498
+ #' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
499
+ #'
500
+ #' @return Returns an object of class \code{\link{enve.RecPlot2}}.
501
+ #'
502
+ #' @author Luis M. Rodriguez-R [aut, cre]
503
+ #' @author Kenji Gerhardt [aut]
504
+ #'
505
+ #' @export
506
+
507
+ enve.recplot2 <- function(
508
+ prefix,
509
+ plot = TRUE,
510
+ pos.breaks = 1e3,
511
+ pos.breaks.tsv = NA,
512
+ id.breaks = 60,
513
+ id.free.range = FALSE,
514
+ id.metric = c('identity', 'corrected identity', 'bit score'),
515
+ id.summary = sum,
516
+ id.cutoff = 95,
517
+ threads = 2,
518
+ verbose = TRUE,
519
+ ...
520
+ ){
521
+ # Settings
522
+ id.metric <- match.arg(id.metric);
523
+
524
+ #Read files
525
+ if (verbose) cat("Reading files.\n")
526
+ rec <- read.table(paste(prefix, ".rec", sep = ""),
527
+ sep = "\t", comment.char = "", quote = "");
528
+ lim <- read.table(paste(prefix, ".lim", sep = ""),
529
+ sep = "\t", comment.char = "", quote = "", as.is = TRUE);
530
+
531
+ # Build matrix
532
+ if (verbose) cat("Building counts matrix.\n")
533
+ if (id.metric == "corrected identity" & ncol(rec) < 6) {
534
+ stop("Requesting corr. identity, but .rec file doesn't have 6th column")
535
+ }
536
+ rec.idcol <- ifelse(id.metric == "identity", 3,
537
+ ifelse(id.metric == "corrected identity", 6, 4))
538
+ pos.names <- as.character(NULL)
539
+ if (!is.na(pos.breaks.tsv)){
540
+ tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
541
+ pos.breaks <- as.numeric(tmp[, 1])
542
+ if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
543
+ } else if (length(pos.breaks) == 1) {
544
+ if (pos.breaks > 0){
545
+ pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
546
+ } else {
547
+ pos.breaks <- c(lim[1, 2], lim[, 3])
548
+ pos.names <- lim[, 1]
549
+ }
550
+ }
551
+ if (length(id.breaks) == 1) {
552
+ id.range.v <- rec[, rec.idcol]
553
+ if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
554
+ id.range.v <- range(id.range.v)
555
+ id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
556
+ }
557
+
558
+ # Run in parallel
559
+ # If they already set threads to 1 manually, there's no point in launching
560
+ # clusters, it's just slower. Ditto for small files.
561
+ if (nrow(rec) < 75000 | threads == 1) {
562
+ # Coerces rec into a form that __counts is happy about
563
+ rec.l <- list()
564
+ rec.l[[1]] <- list(rec = rec, verbose = FALSE)
565
+
566
+ # No need to make a temporary variable, there's only one return for sure
567
+ # and it's not a list because it isn't coming back from an apply
568
+ counts <- enve.recplot2.__counts(
569
+ rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
570
+ rec.idcol = rec.idcol)
571
+ } else {
572
+ cl <- makeCluster(threads)
573
+ rec.l <- list()
574
+ thl <- ceiling(nrow(rec)/threads)
575
+ for (i in 0:(threads - 1)) {
576
+ rec.l[[i + 1]] <- list(
577
+ rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
578
+ verbose = ifelse(i == 0, verbose, FALSE))
579
+ }
580
+ counts.l <- clusterApply(
581
+ cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
582
+ id.breaks = id.breaks, rec.idcol = rec.idcol)
583
+ stopCluster(cl) # No spooky ghost clusters
584
+
585
+ counts <- counts.l[[1]]
586
+ for (i in 2:threads) counts <- counts + counts.l[[i]]
587
+ }
588
+
589
+ # Estimate 1D histograms
590
+ if (verbose) cat("Building histograms.\n")
591
+ id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
592
+ id.ingroup <- (id.mids > id.cutoff);
593
+ id.counts <- apply(counts, 2, id.summary);
594
+ pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
595
+ pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
596
+
597
+ # Plot and return
598
+ recplot <- new('enve.RecPlot2',
599
+ counts = counts, id.counts = id.counts,
600
+ pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
601
+ id.breaks = id.breaks, pos.breaks = pos.breaks,
602
+ pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
603
+ seq.names = lim[, 1], id.ingroup = id.ingroup,
604
+ id.metric = id.metric, call = match.call());
605
+ if (plot) {
606
+ if (verbose) cat("Plotting.\n")
607
+ peaks <- plot(recplot, ...);
608
+ attr(recplot, "peaks") <- peaks
609
+ }
610
+ return(recplot);
611
+ }
612
+
613
+ #' Enveomics: Recruitment Plot (2) Peak Finder
614
+ #'
615
+ #' Identifies peaks in the population histogram potentially indicating
616
+ #' sub-population mixtures.
617
+ #'
618
+ #' @param x
619
+ #' An \code{\link{enve.RecPlot2}} object.
620
+ #' @param method
621
+ #' Peak-finder method. This should be one of:
622
+ #' \itemize{
623
+ #' \item \strong{emauto}
624
+ #' (Expectation-Maximization with auto-selection of components)
625
+ #' \item \strong{em}
626
+ #' (Expectation-Maximization)
627
+ #' \item \strong{mower}
628
+ #' (Custom distribution-mowing method)
629
+ #' }
630
+ #' @param ...
631
+ #' Any additional parameters supported by
632
+ #' \code{\link{enve.recplot2.findPeaks}}.
633
+ #'
634
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
635
+ #'
636
+ #' @author Luis M. Rodriguez-R [aut, cre]
637
+ #'
638
+ #' export
639
+
640
+ enve.recplot2.findPeaks <- function(
641
+ x,
642
+ method="emauto",
643
+ ...
644
+ ){
645
+ if(method == "emauto"){
646
+ peaks <- enve.recplot2.findPeaks.emauto(x, ...)
647
+ }else if(method == "em"){
648
+ peaks <- enve.recplot2.findPeaks.em(x, ...)
649
+ }else if(method == "mower"){
650
+ peaks <- enve.recplot2.findPeaks.mower(x, ...)
651
+ }else{
652
+ stop("Invalid peak-finder method ", method)
653
+ }
654
+ return(peaks)
655
+ }
656
+
657
+ #' Enveomics: Recruitment Plot (2) Emauto Peak Finder
658
+ #'
659
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
660
+ #' Model Expectation Maximization (GMM-EM) method with number of components
661
+ #' automatically detected.
662
+ #'
663
+ #' @param x
664
+ #' An \code{\link{enve.RecPlot2}} object.
665
+ #' @param components
666
+ #' A vector of number of components to evaluate.
667
+ #' @param criterion
668
+ #' Criterion to use for components selection. Must be one of:
669
+ #' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
670
+ #' (Bayesian Information Criterion or Schwarz Criterion).
671
+ #' @param merge.tol
672
+ #' When attempting to merge peaks with very similar sequencing depth, use
673
+ #' this number of significant digits (in log-scale).
674
+ #' @param verbose
675
+ #' Display (mostly debugging) information.
676
+ #' @param ...
677
+ #' Any additional parameters supported by
678
+ #' \code{\link{enve.recplot2.findPeaks.em}}.
679
+ #'
680
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
681
+ #'
682
+ #' @author Luis M. Rodriguez-R [aut, cre]
683
+ #'
684
+ #' @export
685
+
686
+ enve.recplot2.findPeaks.emauto <- function(
687
+ x,
688
+ components = seq(1, 5),
689
+ criterion = 'aic',
690
+ merge.tol = 2L,
691
+ verbose = FALSE,
692
+ ...
693
+ ){
694
+ best <- list(crit=0, pstore=list())
695
+ if(criterion == 'aic'){
696
+ do_crit <- function(ll, k, n) 2*k - 2*ll
697
+ }else if(criterion %in% c('bic', 'sbc')){
698
+ do_crit <- function(ll, k, n) log(n)*k - 2*ll
699
+ }else{
700
+ stop('Invalid criterion ', criterion)
701
+ }
702
+ for(comp in components){
703
+ if(verbose) cat('Testing:',comp,'\n')
704
+ best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
705
+ verbose, ...)
706
+ }
707
+ if(length(best[['peaks']])==0) return(list())
708
+
709
+ seqdepths.r <- signif(log(sapply(best[['peaks']],
710
+ function(x) x$seq.depth)), merge.tol)
711
+ distinct <- length(unique(seqdepths.r))
712
+ if(distinct < length(best[['peaks']])){
713
+ if(verbose) cat('Attempting merge to', distinct, 'components\n')
714
+ init <- apply(sapply(best[['peaks']],
715
+ function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
716
+ init <- init[!duplicated(seqdepths.r),]
717
+ init <- list(mu=init[,'mean'], sd=init[,'sd'],
718
+ alpha=init[,'alpha']/sum(init[,'alpha']))
719
+ best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
720
+ verbose, ...)
721
+ }
722
+ return(best[['peaks']])
723
+ }
724
+
725
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder
726
+ #'
727
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
728
+ #' Model Expectation Maximization (GMM-EM) method.
729
+ #'
730
+ #' @param x
731
+ #' An \code{\link{enve.RecPlot2}} object.
732
+ #' @param max.iter
733
+ #' Maximum number of EM iterations.
734
+ #' @param ll.diff.res
735
+ #' Maximum Log-Likelihood difference to be considered as convergent.
736
+ #' @param components
737
+ #' Number of distributions assumed in the mixture.
738
+ #' @param rm.top
739
+ #' Top-values to remove before finding peaks, as a quantile probability.
740
+ #' This step is useful to remove highly conserved regions, but can be
741
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
742
+ #' \strong{after} removing zero-coverage windows.
743
+ #' @param verbose
744
+ #' Display (mostly debugging) information.
745
+ #' @param init
746
+ #' Initialization parameters. By default, these are derived from k-means
747
+ #' clustering. A named list with vectors for \code{mu}, \code{sd}, and
748
+ #' \code{alpha}, each of length \code{components}.
749
+ #' @param log
750
+ #' Logical value indicating if the estimations should be performed in
751
+ #' natural logarithm units. Do not change unless you know what you're
752
+ #' doing.
753
+ #'
754
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
755
+ #'
756
+ #' @author Luis M. Rodriguez-R [aut, cre]
757
+ #'
758
+ #' @export
759
+
760
+ enve.recplot2.findPeaks.em <- function(
761
+ x,
762
+ max.iter = 1000,
763
+ ll.diff.res = 1e-8,
764
+ components = 2,
765
+ rm.top = 0.05,
766
+ verbose = FALSE,
767
+ init,
768
+ log = TRUE
769
+ ){
770
+
771
+ # Essential vars
772
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
773
+ lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
774
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
775
+ if(log) lsd1 <- log(lsd1)
776
+
777
+ # 1. Initialize
778
+ if(missing(init)){
779
+ km.clust <- kmeans(lsd1, components)$cluster
780
+ init <- list(
781
+ mu = tapply(lsd1, km.clust, mean),
782
+ sd = tapply(lsd1, km.clust, sd),
783
+ alpha = table(km.clust) / length(km.clust)
784
+ )
785
+ }
786
+ m.step <- init
787
+ ll <- c()
788
+ cur.ll <- -Inf
789
+
790
+ for(i in 1:max.iter){
791
+ # 2/3. EM
792
+ e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
793
+ m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
794
+ # 4. Convergence
795
+ ll <- c(ll, e.step[["ll"]])
796
+ ll.diff <- abs(cur.ll - e.step[["ll"]])
797
+ cur.ll <- e.step[["ll"]]
798
+ if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
799
+ if(is.na(ll.diff) || ll.diff == Inf) break
800
+ if(ll.diff <= ll.diff.res) break
801
+ }
802
+
803
+ # Return
804
+ peaks <- list()
805
+ for(i in 1:components){
806
+ n.hat <- m.step[['alpha']][i]*length(lsd1)
807
+ peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
808
+ values.res=0, mode=m.step[['mu']][i],
809
+ param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
810
+ n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
811
+ merge.logdist=as.numeric(), log=log,
812
+ seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
813
+ }
814
+ return(peaks)
815
+ }
816
+
817
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder
818
+ #'
819
+ #' Identifies peaks in the population histogram potentially indicating
820
+ #' sub-population mixtures, using a custom distribution-mowing method.
821
+ #'
822
+ #' @param x
823
+ #' An \code{\link{enve.RecPlot2}} object.
824
+ #' @param min.points
825
+ #' Minimum number of points in the quantile-estimation-range
826
+ #' \code{(quant.est)} to estimate a peak.
827
+ #' @param quant.est
828
+ #' Range of quantiles to be used in the estimation of a peak's
829
+ #' parameters.
830
+ #' @param mlv.opts
831
+ #' Ignored. For backwards compatibility.
832
+ #' @param fitdist.opts.sn
833
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
834
+ #' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
835
+ #' ammended with \code{xi=estimated} mode for each peak.
836
+ #' @param fitdist.opts.norm
837
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
838
+ #' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
839
+ #' ammended with \code{mean=estimated} mode for each peak.
840
+ #' @param rm.top
841
+ #' Top-values to remove before finding peaks, as a quantile probability.
842
+ #' This step is useful to remove highly conserved regions, but can be
843
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
844
+ #' \strong{after} removing zero-coverage windows.
845
+ #' @param with.skewness
846
+ #' Allow skewness correction of the peaks. Typically, the
847
+ #' sequencing-depth distribution for a single peak is left-skewed, due
848
+ #' partly (but not exclusively) to fragmentation and mapping sensitivity.
849
+ #' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
850
+ #' alternative solution for the first problem (fragmentation) called
851
+ #' "tail distribution".
852
+ #' @param optim.rounds
853
+ #' Maximum rounds of peak optimization.
854
+ #' @param optim.epsilon
855
+ #' Trace change at which optimization stops (unless \code{optim.rounds} is
856
+ #' reached first). The trace change is estimated as the sum of square
857
+ #' differences between parameters in one round and those from two rounds
858
+ #' earlier (to avoid infinite loops from approximation).
859
+ #' @param merge.logdist
860
+ #' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
861
+ #' to attempt merging. The default of ~0.22 corresponds to a maximum
862
+ #' difference of 25\%.
863
+ #' @param verbose
864
+ #' Display (mostly debugging) information.
865
+ #' @param log
866
+ #' Logical value indicating if the estimations should be performed in
867
+ #' natural logarithm units. Do not change unless you know what you're
868
+ #' doing.
869
+ #'
870
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
871
+ #'
872
+ #' @author Luis M. Rodriguez-R [aut, cre]
873
+ #'
874
+ #' @export
875
+
876
+ enve.recplot2.findPeaks.mower <- function(
877
+ x,
878
+ min.points=10,
879
+ quant.est=c(0.002, 0.998),
880
+ mlv.opts=list(method='parzen'),
881
+ fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
882
+ start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
883
+ fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
884
+ start=list(sd=1), lower=c(0, -Inf)),
885
+ rm.top=0.05,
886
+ with.skewness=TRUE,
887
+ optim.rounds=200,
888
+ optim.epsilon=1e-4,
889
+ merge.logdist=log(1.75),
890
+ verbose=FALSE,
891
+ log=TRUE
892
+ ){
893
+
894
+ # Essential vars
895
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
896
+ seqdepth.in <- x$pos.counts.in/pos.binsize;
897
+ lsd1 <- seqdepth.in[seqdepth.in>0];
898
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
899
+ if(log) lsd1 <- log(lsd1)
900
+ if(with.skewness){
901
+ fitdist.opts <- fitdist.opts.sn
902
+ }else{
903
+ fitdist.opts <- fitdist.opts.norm
904
+ }
905
+ peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
906
+ mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
907
+ optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
908
+ n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
909
+
910
+ # Find seed peaks
911
+ if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
912
+ peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
913
+
914
+ # Merge overlapping peaks
915
+ if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
916
+ merged <- (length(peaks)>1)
917
+ while(merged){
918
+ merged <- FALSE
919
+ ignore <- c()
920
+ peaks2 <- list();
921
+ for(i in 1:length(peaks)){
922
+ if(i %in% ignore) next
923
+ p <- peaks[[ i ]]
924
+ j <- enve.recplot2.__whichClosestPeak(p, peaks)
925
+ p2 <- peaks[[ j ]]
926
+ dst.a <- p$param.hat[[ length(p$param.hat) ]]
927
+ dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
928
+ if( abs(log(dst.a/dst.b)) < merge.logdist ){
929
+ if(verbose) cat('==> Attempting a merge at',
930
+ p$param.hat[[ length(p$param.hat) ]],'&',
931
+ p2$param.hat[[ length(p2$param.hat) ]],'X\n');
932
+ peaks.opts$lsd1 <- c(p$values, p2$values)
933
+ p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
934
+ if(length(p.new)==1){
935
+ peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
936
+ ignore <- c(ignore, j)
937
+ merged <- TRUE
938
+ }
939
+ }
940
+ if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
941
+ }
942
+ peaks <- peaks2
943
+ if(length(peaks)==1) break
944
+ }
945
+
946
+ if(verbose) cat('Found',length(peaks),'peak(s)\n')
947
+ return(peaks);
948
+ }
949
+
950
+ #==============> Define utils
951
+
952
+ #' Enveomics: Recruitment Plot (2) Core Peak Finder
953
+ #'
954
+ #' Finds the peak in a list of peaks that is most likely to represent the
955
+ #' "core genome" of a population.
956
+ #'
957
+ #' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
958
+ #'
959
+ #' @author Luis M. Rodriguez-R [aut, cre]
960
+ #'
961
+ #' @export
962
+
963
+ enve.recplot2.corePeak <- function
964
+ (x
965
+ ){
966
+ # Find the peak with maximum depth (centrality)
967
+ maxPeak <- x[[
968
+ which.max(as.numeric(lapply(x,
969
+ function(y) y$param.hat[[ length(y$param.hat) ]])))
970
+ ]]
971
+ # If a "larger" peak (a peak explaining more bins of the genome) is within
972
+ # the default "merge.logdist" distance, take that one instead.
973
+ corePeak <- maxPeak
974
+ for(p in x){
975
+ p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
976
+ corePeak.len <- ifelse(
977
+ length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
978
+ sz.d <- log(p.len/corePeak.len)
979
+ if(is.nan(sz.d) || sz.d < 0) next
980
+ sq.d.a <- as.numeric(tail(p$param.hat, n=1))
981
+ sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
982
+ if(p$log) sq.d.a <- exp(sq.d.a)
983
+ if(corePeak$log) sq.d.b <- exp(sq.d.b)
984
+ if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
985
+ }
986
+ return(corePeak)
987
+ }
988
+
989
+ #' Enveomics: Recruitment Plot (2) Change Cutoff
990
+ #'
991
+ #' Change the intra-species cutoff of an existing recruitment plot.
992
+ #'
993
+ #' @param rp
994
+ #' \code{\link{enve.RecPlot2}} object.
995
+ #' @param new.cutoff
996
+ #' New cutoff to use.
997
+ #'
998
+ #' @author Luis M. Rodriguez-R [aut, cre]
999
+ #'
1000
+ #' @export
1001
+
1002
+ enve.recplot2.changeCutoff <- function
1003
+ (rp,
1004
+ new.cutoff=98
1005
+ ){
1006
+ # Re-calculate vectors
1007
+ id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
1008
+ id.ingroup <- (id.mids > new.cutoff)
1009
+ pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
1010
+ pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
1011
+ # Update object
1012
+ attr(rp, "id.ingroup") <- id.ingroup
1013
+ attr(rp, "pos.counts.in") <- pos.counts.in
1014
+ attr(rp, "pos.counts.out") <- pos.counts.out
1015
+ attr(rp, "call") <- match.call()
1016
+ return(rp)
1017
+ }
1018
+
1019
+ #' Enveomics: Recruitment Plot (2) Window Depth Threshold
1020
+ #'
1021
+ #' Identifies the threshold below which windows should be identified as
1022
+ #' variable or absent.
1023
+ #'
1024
+ #' @param rp
1025
+ #' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
1026
+ #' @param peak
1027
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1028
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1029
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1030
+ #' @param lower.tail
1031
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1032
+ #' sequencing depth.
1033
+ #' @param significance
1034
+ #' Significance threshold (alpha) to select windows.
1035
+ #'
1036
+ #' @return
1037
+ #' Returns a float. The units are depth if the peaks were estimated in
1038
+ #' linear scale, or log-depth otherwise (\code{peak$log}).
1039
+ #'
1040
+ #' @author Luis M. Rodriguez-R [aut, cre]
1041
+ #'
1042
+ #' @export
1043
+
1044
+ enve.recplot2.windowDepthThreshold <- function
1045
+ (rp,
1046
+ peak,
1047
+ lower.tail=TRUE,
1048
+ significance=0.05
1049
+ ){
1050
+ if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
1051
+ par <- peak$param.hat
1052
+ par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
1053
+ thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
1054
+ if(peak$log) thr <- exp(thr)
1055
+
1056
+ return(thr)
1057
+ }
1058
+
1059
+ #' Enveomics: Recruitment Plot (2) Extract Windows
1060
+ #'
1061
+ #' Extract windows significantly below (or above) the peak in sequencing
1062
+ #' depth.
1063
+ #'
1064
+ #' @param rp
1065
+ #' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
1066
+ #' @param peak
1067
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1068
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1069
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1070
+ #' @param lower.tail
1071
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1072
+ #' sequencing depth.
1073
+ #' @param significance
1074
+ #' Significance threshold (alpha) to select windows.
1075
+ #' @param seq.names
1076
+ #' Returns subject sequence names instead of a vector of Booleans. If
1077
+ #' the recruitment plot was generated with named position bins (e.g, using
1078
+ #' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
1079
+ #' vector of characters (the sequence identifiers), otherwise it returns a
1080
+ #' data.frame with a name column and two columns of coordinates.
1081
+ #'
1082
+ #' @return
1083
+ #' Returns a vector of logicals if \code{seq.names = FALSE}.
1084
+ #' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
1085
+ #' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
1086
+ #' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
1087
+ #'
1088
+ #' @author Luis M. Rodriguez-R [aut, cre]
1089
+ #'
1090
+ #' @export
1091
+
1092
+ enve.recplot2.extractWindows <- function
1093
+ (rp,
1094
+ peak,
1095
+ lower.tail = TRUE,
1096
+ significance = 0.05,
1097
+ seq.names = FALSE
1098
+ ){
1099
+ # Determine the threshold
1100
+ thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
1101
+
1102
+ # Select windows past the threshold
1103
+ seqdepth.in <- enve.recplot2.seqdepth(rp)
1104
+ if(lower.tail){
1105
+ sel <- seqdepth.in < thr
1106
+ }else{
1107
+ sel <- seqdepth.in > thr
1108
+ }
1109
+
1110
+ # seq.names = FALSE
1111
+ if(!seq.names) return(sel)
1112
+ # seq.names = TRUE
1113
+ return(enve.recplot2.coordinates(rp, sel))
1114
+ }
1115
+
1116
+ #' Enveomics: Recruitment Plot (2) Compare Identities
1117
+ #'
1118
+ #' Compare the distribution of identities between two
1119
+ #' \code{\link{enve.RecPlot2}} objects.
1120
+ #'
1121
+ #' @param x
1122
+ #' First \code{\link{enve.RecPlot2}} object.
1123
+ #' @param y
1124
+ #' Second \code{\link{enve.RecPlot2}} object.
1125
+ #' @param method
1126
+ #' Distance method to use. This should be (an unambiguous abbreviation of)
1127
+ #' one of:
1128
+ #' \itemize{
1129
+ #' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
1130
+ #' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
1131
+ #' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
1132
+ #' doi:10.1214/aoms/1177729694}), or}
1133
+ #' \item{"euclidean"}
1134
+ #' }
1135
+ #' @param smooth.par
1136
+ #' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
1137
+ #' Use \code{NULL} to automatically determine this value using leave-one-out
1138
+ #' cross-validation (see \code{smooth.spline} parameter \code{spar}).
1139
+ #' @param pseudocounts
1140
+ #' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
1141
+ #' 1 for add-one smoothing.
1142
+ #' @param max.deviation
1143
+ #' Maximum mean deviation between identity breaks tolerated (as percent
1144
+ #' identity). Difference in number of \code{id.breaks} is never tolerated.
1145
+ #'
1146
+ #' @author Luis M. Rodriguez-R [aut, cre]
1147
+ #'
1148
+ #' @export
1149
+
1150
+ enve.recplot2.compareIdentities <- function
1151
+ (x,
1152
+ y,
1153
+ method="hellinger",
1154
+ smooth.par=NULL,
1155
+ pseudocounts=0,
1156
+ max.deviation=0.75
1157
+ ){
1158
+ METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
1159
+ i.meth <- pmatch(method, METHODS)
1160
+ if (is.na(i.meth)) stop("Invalid distance ", method)
1161
+ if(!inherits(x, "enve.RecPlot2"))
1162
+ stop("'x' must inherit from class `enve.RecPlot2`")
1163
+ if(!inherits(y, "enve.RecPlot2"))
1164
+ stop("'y' must inherit from class `enve.RecPlot2`")
1165
+ if(length(x$id.breaks) != length(y$id.breaks))
1166
+ stop("'x' and 'y' must have the same number of `id.breaks`")
1167
+ dev <- mean(abs(x$id.breaks - y$id.breaks))
1168
+ if(dev > max.deviation)
1169
+ stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
1170
+ dev)
1171
+ x.cnt <- x$id.counts
1172
+ y.cnt <- y$id.counts
1173
+ if(is.null(smooth.par) || smooth.par > 0){
1174
+ x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
1175
+ y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
1176
+ p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
1177
+ q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
1178
+ x.cnt <- pmax(p.spline$y, 0)
1179
+ y.cnt <- pmax(q.spline$y, 0)
1180
+ }
1181
+ a <- as.numeric(pseudocounts)
1182
+ p <- (x.cnt + a) / sum(x.cnt + a)
1183
+ q <- (y.cnt + a) / sum(y.cnt + a)
1184
+ d <- NA
1185
+ if(i.meth %in% c(1L, 2L)){
1186
+ d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
1187
+ if(i.meth==2L) d <- 1 - d**2
1188
+ }else if(i.meth %in% c(3L, 4L)){
1189
+ sel <- p>0
1190
+ if(any(q[sel]==0))
1191
+ stop("Undefined distance without absolute continuity, use pseudocounts")
1192
+ d <- -sum(p[sel]*log(q[sel]/p[sel]))
1193
+ }else if(i.meth == 5L){
1194
+ d <- sqrt(sum((q-p)**2))
1195
+ }
1196
+ return(d)
1197
+ }
1198
+
1199
+ #' Enveomics: Recruitment Plot (2) Coordinates
1200
+ #'
1201
+ #' Returns the sequence name and coordinates of the requested position bins.
1202
+ #'
1203
+ #' @param x
1204
+ #' \code{\link{enve.RecPlot2}} object.
1205
+ #' @param bins
1206
+ #' Vector of selected bins to return. It can be a vector of logical values
1207
+ #' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
1208
+ #' missing, returns the coordinates of all windows.
1209
+ #'
1210
+ #' @return
1211
+ #' Returns a data.frame with five columns: \code{name.from} (character),
1212
+ #' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
1213
+ #' (numeric), and \code{seq.name} (character).
1214
+ #' The first two correspond to sequence and position of the start point of the
1215
+ #' bin. The next two correspond to the sequence and position of the end point of
1216
+ #' the bin. The last one indicates the name of the sequence (if defined).
1217
+ #'
1218
+ #' @author Luis M. Rodriguez-R [aut, cre]
1219
+ #'
1220
+ #' @export
1221
+
1222
+ enve.recplot2.coordinates <- function
1223
+ (x,
1224
+ bins
1225
+ ){
1226
+ if(!inherits(x, "enve.RecPlot2"))
1227
+ stop("'x' must inherit from class `enve.RecPlot2`")
1228
+ if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
1229
+ if(!is.vector(bins)) stop("'bins' must be a vector")
1230
+ if(inherits(bins, "logical")) bins <- which(bins)
1231
+
1232
+ y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
1233
+
1234
+ for(i in 1:length(bins)){
1235
+ j <- bins[i]
1236
+ # Concatenated coordinates
1237
+ cc <- x$pos.breaks[c(j, j+1)]
1238
+ # Find the corresponding `seq.breaks`
1239
+ sb.from <- which(
1240
+ cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
1241
+ cc[1] < x$seq.breaks[-1])
1242
+ sb.to <- which(
1243
+ cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
1244
+ cc[2] <= x$seq.breaks[-1])
1245
+ # Translate coordinates
1246
+ if(length(sb.from)==1 & length(sb.to)==1){
1247
+ y[i, 'name.from'] <- x$seq.names[sb.from]
1248
+ y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
1249
+ y[i, 'name.to'] <- x$seq.names[sb.to]
1250
+ y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
1251
+ y[i, 'seq.name'] <- x$pos.names[i]
1252
+ }
1253
+ }
1254
+
1255
+ return(y)
1256
+ }
1257
+
1258
+ #' Enveomics: Recruitment Plot (2) Sequencing Depth
1259
+ #'
1260
+ #' Calculate the sequencing depth of the given window(s).
1261
+ #'
1262
+ #' @param x
1263
+ #' \code{\link{enve.RecPlot2}} object.
1264
+ #' @param sel
1265
+ #' Window(s) for which the sequencing depth is to be calculated. If not
1266
+ #' passed, it returns the sequencing depth of all windows.
1267
+ #' @param low.identity
1268
+ #' A logical indicating if the sequencing depth is to be estimated only
1269
+ #' with low-identity matches. By default, only high-identity matches are
1270
+ #' used.
1271
+ #'
1272
+ #' @return
1273
+ #' Returns a numeric vector of sequencing depths (in bp/bp).
1274
+ #'
1275
+ #' @author Luis M. Rodriguez-R [aut, cre]
1276
+ #'
1277
+ #' @export
1278
+
1279
+ enve.recplot2.seqdepth <- function
1280
+
1281
+ (x,
1282
+ sel,
1283
+ low.identity=FALSE
1284
+ ){
1285
+ if(!inherits(x, "enve.RecPlot2"))
1286
+ stop("'x' must inherit from class `enve.RecPlot2`")
1287
+ if(low.identity){
1288
+ pos.cnts.in <- x$pos.counts.out
1289
+ }else{
1290
+ pos.cnts.in <- x$pos.counts.in
1291
+ }
1292
+ pos.breaks <- x$pos.breaks
1293
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
1294
+ seqdepth.in <- pos.cnts.in/pos.binsize
1295
+ if(missing(sel)) return(seqdepth.in)
1296
+ return(seqdepth.in[sel])
1297
+ }
1298
+
1299
+ #' Enveomics: Recruitment Plot (2) ANI Estimate
1300
+ #'
1301
+ #' Estimate the Average Nucleotide Identity from reads (ANIr) from a
1302
+ #' recruitment plot.
1303
+ #'
1304
+ #' @param x
1305
+ #' \code{\link{enve.RecPlot2}} object.
1306
+ #' @param range
1307
+ #' Range of identities to be considered. By default, the full range
1308
+ #' is used (note that the upper boundary is \code{Inf} and not 100 because
1309
+ #' recruitment plots can also be built with bit-scores). To use only
1310
+ #' intra-population matches (with identities), use c(95,100). To use only
1311
+ #' inter-population values, use c(0,95).
1312
+ #'
1313
+ #' @author Luis M. Rodriguez-R [aut, cre]
1314
+ #'
1315
+ #' @export
1316
+
1317
+ enve.recplot2.ANIr <- function
1318
+ (x,
1319
+ range=c(0,Inf)
1320
+ ){
1321
+ if(!inherits(x, "enve.RecPlot2"))
1322
+ stop("'x' must inherit from class `enve.RecPlot2`")
1323
+ id.b <- x$id.breaks
1324
+ id <- (id.b[-1]+id.b[-length(id.b)])/2
1325
+ cnt <- x$id.counts
1326
+ cnt[id < range[1]] <- 0
1327
+ cnt[id > range[2]] <- 0
1328
+ return(sum(id*cnt/sum(cnt)))
1329
+ }
1330
+
1331
+ #==============> Define internal functions
1332
+
1333
+ #' Enveomics: Recruitment Plot (2) Internal Ancillary Function
1334
+ #'
1335
+ #' Internal ancillary function (see \code{\link{enve.recplot2}}).
1336
+ #'
1337
+ #' @param x \code{\link{enve.RecPlot2}} object
1338
+ #' @param pos.breaks Position breaks
1339
+ #' @param id.breaks Identity breaks
1340
+ #' @param rec.idcol Identity column to use
1341
+ #'
1342
+ #' @author Luis M. Rodriguez-R [aut, cre]
1343
+ #' @author Kenji Gerhardt [aut]
1344
+ #'
1345
+ #' @export
1346
+
1347
+ enve.recplot2.__counts <- function
1348
+ (x, pos.breaks, id.breaks, rec.idcol) {
1349
+ rec2 <- x$rec
1350
+ verbose <- x$verbose
1351
+
1352
+ # get counts of how many occurrences of each genome pos.bin there are per read
1353
+ x.bins <- mapply(
1354
+ function(start, end) {
1355
+ list(rle(findInterval(start:end, pos.breaks, left.open = T)))
1356
+ }, rec2[, 1], rec2[, 2])
1357
+
1358
+ # find the single y bin for each row, replicates it at the correct places to
1359
+ # the number of distinct bins found in its row
1360
+ y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
1361
+ times = unlist(lapply(x.bins, function(a) length(a$lengths))))
1362
+
1363
+ # x.bins_counts is the number of occurrences of each bin a row contains,
1364
+ # per row, then unlisted
1365
+ x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
1366
+
1367
+ # these are the pos. in. genome bins that each count in x.bins_counts falls into
1368
+ x.bins <- unlist(lapply(x.bins, function(a) a$values))
1369
+
1370
+ # much more efficient counts implementation in R using lists instead of a matrix:
1371
+ counts <- lapply(
1372
+ 1:(length(pos.breaks) - 1),
1373
+ function(col_len) rep(0, length(id.breaks) - 1))
1374
+
1375
+ # accesses the correct list in counts by x.bin, then
1376
+ # accesses the position in that row by y.bins and adds the new count
1377
+ for(i in 1:length(x.bins)){
1378
+ counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
1379
+ }
1380
+
1381
+ counts <- do.call(rbind, counts)
1382
+ return(counts)
1383
+ }
1384
+
1385
+ #' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
1386
+ #'
1387
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
1388
+ #'
1389
+ #' @param x \code{\link{enve.RecPlot2}} object
1390
+ #' @param comp Components
1391
+ #' @param do_crit Function estimating the criterion
1392
+ #' @param best Best solution thus far
1393
+ #' @param verbose If verbose
1394
+ #' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
1395
+ #'
1396
+ #' @author Luis M. Rodriguez-R [aut, cre]
1397
+ #'
1398
+ #' @export
1399
+
1400
+ enve.recplot2.findPeaks.__emauto_one <- function
1401
+ (x, comp, do_crit, best, verbose, ...){
1402
+ peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
1403
+ if(length(peaks)==0) return(best)
1404
+ k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
1405
+ crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
1406
+ if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
1407
+ ifelse(crit > best[['crit']], '*', ''), '\n')
1408
+ if(crit > best[['crit']]){
1409
+ best[['crit']] <- crit
1410
+ best[['peaks']] <- peaks
1411
+ }
1412
+ best[['pstore']][[comp]] <- peaks
1413
+ return(best)
1414
+ }
1415
+
1416
+ #' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
1417
+ #'
1418
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1419
+ #'
1420
+ #' @param x Vector of log-transformed sequencing depths
1421
+ #' @param theta Parameters list
1422
+ #'
1423
+ #' @author Luis M. Rodriguez-R [aut, cre]
1424
+ #'
1425
+ #' @export
1426
+
1427
+ enve.recplot2.findPeaks.__em_e <- function
1428
+ (x, theta){
1429
+ components <- length(theta[['mu']])
1430
+ product <- do.call(cbind,
1431
+ lapply(1:components,
1432
+ function(i) dnorm(x, theta[['mu']][i],
1433
+ theta[['sd']][i])*theta[['alpha']][i]))
1434
+ sum.of.components <- rowSums(product)
1435
+ posterior <- product / sum.of.components
1436
+ for(i in which(sum.of.components == Inf)) {
1437
+ cat(i,'/',nrow(product), ':', product[i,], '\n')
1438
+ }
1439
+
1440
+ return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1441
+ }
1442
+
1443
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
1444
+ #'
1445
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1446
+ #'
1447
+ #' @param x Vector of log-transformed sequencing depths
1448
+ #' @param posterior Posterior probability
1449
+ #'
1450
+ #' @author Luis M. Rodriguez-R [aut, cre]
1451
+ #'
1452
+ #' @export
1453
+
1454
+ enve.recplot2.findPeaks.__em_m <- function
1455
+ (x, posterior){
1456
+ components <- ncol(posterior)
1457
+ n <- colSums(posterior)
1458
+ mu <- colSums(posterior * x) / n
1459
+ sd <- sqrt( colSums(
1460
+ posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1461
+ alpha <- n/length(x)
1462
+ return(list(mu=mu, sd=sd, alpha=alpha))
1463
+ }
1464
+
1465
+ #' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
1466
+ #'
1467
+ #' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
1468
+ #'
1469
+ #' @param x \code{\link{enve.RecPlot2.Peak}} object
1470
+ #' @param mids Midpoints
1471
+ #' @param counts Counts
1472
+ #'
1473
+ #' @author Luis M. Rodriguez-R [aut, cre]
1474
+ #'
1475
+ #' @export
1476
+
1477
+ enve.recplot2.__peakHist <- function
1478
+ (x, mids, counts=TRUE){
1479
+ d.o <- x$param.hat
1480
+ if(length(x$log)==0) x$log <- FALSE
1481
+ if(x$log){
1482
+ d.o$x <- log(mids)
1483
+ }else{
1484
+ d.o$x <- mids
1485
+ }
1486
+ prob <- do.call(paste('d', x$dist, sep=''), d.o)
1487
+ if(!counts) return(prob)
1488
+ if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
1489
+ return(prob*x$n.hat/sum(prob))
1490
+ }
1491
+
1492
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
1493
+ #'
1494
+ #' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1495
+ #'
1496
+ #' @param lsd1 Vector of log-transformed sequencing depths
1497
+ #' @param min.points Minimum number of points
1498
+ #' @param quant.est Quantile estimate
1499
+ #' @param mlv.opts List of options for \code{mlv}
1500
+ #' @param fitdist.opts List of options for \code{fitdist}
1501
+ #' @param with.skewness If skewed-normal should be used
1502
+ #' @param optim.rounds Maximum number of optimization rounds
1503
+ #' @param optim.epsilon Minimum difference considered negligible
1504
+ #' @param n.total Global number of windows
1505
+ #' @param merge.logdist Attempted \code{merge.logdist} parameter
1506
+ #' @param verbose If verbose
1507
+ #' @param log If log-transformed depths
1508
+ #'
1509
+ #' @author Luis M. Rodriguez-R [aut, cre]
1510
+ #'
1511
+ #' @export
1512
+
1513
+ enve.recplot2.findPeaks.__mow_one <- function
1514
+ (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
1515
+ optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
1516
+ ){
1517
+ dist <- ifelse(with.skewness, 'sn', 'norm');
1518
+
1519
+ # Find peak
1520
+ o <- mlv.opts; o$x = lsd1;
1521
+ mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1522
+ if(verbose) cat('Anchoring at mode =',mode1,'\n')
1523
+ param.hat <- fitdist.opts$start; last.hat <- param.hat;
1524
+ lim <- NA;
1525
+ if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
1526
+
1527
+ # Refine peak parameters
1528
+ for(round in 1:optim.rounds){
1529
+ param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
1530
+ lim.o <- param.hat
1531
+ lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
1532
+ lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
1533
+ if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
1534
+ 'and params =',as.numeric(param.hat),' \r')
1535
+ if(length(lsd1.pop) < min.points) break;
1536
+ o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
1537
+ last.last.hat <- last.hat
1538
+ last.hat <- param.hat
1539
+ param.hat <- as.list(do.call(fitdist, o)$estimate);
1540
+ if(any(is.na(param.hat))){
1541
+ if(round>1) param.hat <- last.hat;
1542
+ break;
1543
+ }
1544
+ if(round > 1){
1545
+ epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1546
+ if(epsilon1 < optim.epsilon) break;
1547
+ if(round > 2){
1548
+ epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1549
+ if(epsilon2 < optim.epsilon) break;
1550
+ }
1551
+ }
1552
+ }
1553
+ if(verbose) cat('\n')
1554
+ if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
1555
+
1556
+ # Mow distribution
1557
+ lsd2 <- c();
1558
+ lsd.pop <- c();
1559
+ n.hat <- length(lsd1.pop)/diff(quant.est)
1560
+ peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
1561
+ param.hat=param.hat, n.hat=n.hat, n.total=n.total,
1562
+ merge.logdist=merge.logdist, log=log)
1563
+ peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
1564
+ peak.cnt <- enve.recplot2.__peakHist(peak,
1565
+ (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
1566
+ for(i in 2:length(peak.breaks)){
1567
+ values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
1568
+ n.exp <- peak.cnt[i-1]
1569
+ if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
1570
+ if(length(values)==0) next
1571
+ in.peak <- runif(length(values)) <= n.exp/length(values)
1572
+ lsd2 <- c(lsd2, values[!in.peak])
1573
+ lsd.pop <- c(lsd.pop, values[in.peak])
1574
+ }
1575
+ if(length(lsd.pop) < min.points) return(NULL)
1576
+
1577
+ # Return peak
1578
+ attr(peak, 'values') <- lsd.pop
1579
+ attr(peak, 'values.res') <- lsd2
1580
+ attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
1581
+ plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
1582
+ plot=FALSE)$counts)+1)/2
1583
+ mu <- tail(param.hat, n=1)
1584
+ attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
1585
+ if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
1586
+ 'with expected n =',n.hat,'\n')
1587
+ return(peak)
1588
+ }
1589
+
1590
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
1591
+ #'
1592
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1593
+ #'
1594
+ #' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
1595
+ #'
1596
+ #' @author Luis M. Rodriguez-R [aut, cre]
1597
+ #'
1598
+ #' @export
1599
+
1600
+ enve.recplot2.findPeaks.__mower <- function
1601
+ (peaks.opts){
1602
+ peaks <- list()
1603
+ while(length(peaks.opts$lsd1) > peaks.opts$min.points){
1604
+ peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
1605
+ if(is.null(peak)) break
1606
+ peaks[[ length(peaks)+1 ]] <- peak
1607
+ peaks.opts$lsd1 <- peak$values.res
1608
+ }
1609
+ return(peaks)
1610
+ }
1611
+
1612
+ #' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
1613
+ #'
1614
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
1615
+ #'
1616
+ #' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
1617
+ #' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
1618
+ #'
1619
+ #' @author Luis M. Rodriguez-R [aut, cre]
1620
+ #'
1621
+ #' @export
1622
+
1623
+ enve.recplot2.__whichClosestPeak <- function
1624
+ (peak, peaks){
1625
+ dist <- as.numeric(lapply(peaks,
1626
+ function(x)
1627
+ abs(log(x$param.hat[[ length(x$param.hat) ]] /
1628
+ peak$param.hat[[ length(peak$param.hat) ]] ))))
1629
+ dist[ dist==0 ] <- Inf
1630
+ return(which.min(dist))
1631
+ }