miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,1631 @@
1
+ #==============> Define S4 classes
2
+
3
+ #' Enveomics: Recruitment Plot (2) - S4 Class
4
+ #'
5
+ #' Enve-omics representation of Recruitment plots. This object can
6
+ #' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
7
+ #'
8
+ #' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
9
+ #' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
10
+ #' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
11
+ #' @slot id.counts \code{(numeric)} Counts per ID bin.
12
+ #' @slot id.breaks \code{(numeric)} Breaks of identity bins.
13
+ #' @slot pos.breaks \code{(numeric)} Breaks of position bins.
14
+ #' @slot pos.names \code{(character)} Names of the position bins.
15
+ #' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
16
+ #' @slot peaks \code{(list)} Peaks identified in the recplot.
17
+ #' Limits of the subject sequences after concatenation.
18
+ #' @slot seq.names \code{(character}) Names of the subject sequences.
19
+ #' @slot id.metric \code{(character}) Metric used as 'identity'.
20
+ #' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
21
+ #' @slot call \code{(call)} Call producing this object.
22
+ #'
23
+ #' @author Luis M. Rodriguez-R [aut, cre]
24
+ #'
25
+ #' @exportClass
26
+
27
+ enve.RecPlot2 <- setClass("enve.RecPlot2",
28
+ representation(
29
+ # slots = list(
30
+ counts='matrix',
31
+ pos.counts.in='numeric',
32
+ pos.counts.out='numeric',
33
+ id.counts='numeric',
34
+ id.breaks='numeric',
35
+ pos.breaks='numeric',
36
+ pos.names='character',
37
+ seq.breaks='numeric',
38
+ peaks='list',
39
+ seq.names='character',
40
+ id.metric='character',
41
+ id.ingroup='logical',
42
+ call='call')
43
+ ,package='enveomics.R'
44
+ );
45
+
46
+ #' Enveomics: Recruitment Plot (2) Peak - S4 Class
47
+ #'
48
+ #' Enve-omics representation of a peak in the sequencing depth histogram
49
+ #' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
50
+ #'
51
+ #' @slot dist \code{(character)}
52
+ #' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
53
+ #' (skew-normal).
54
+ #' @slot values \code{(numeric)}
55
+ #' Sequencing depth values predicted to conform the peak.
56
+ #' @slot values.res \code{(numeric)}
57
+ #' Sequencing depth values not explained by this or previously identified
58
+ #' peaks.
59
+ #' @slot mode \code{(numeric)}
60
+ #' Seed-value of mode anchoring the peak.
61
+ #' @slot param.hat \code{(list)}
62
+ #' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
63
+ #' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
64
+ #' xi=location). Note that the "dispersion" parameter is always first and
65
+ #' the "location" parameter is always last.
66
+ #' @slot n.hat \code{(numeric)}
67
+ #' Number of bins estimated to be explained by this peak. This should
68
+ #' ideally be equal to the length of \code{values}, but it's not an integer.
69
+ #' @slot n.total \code{(numeric)}
70
+ #' Total number of bins from which the peak was extracted. I.e., total
71
+ #' number of position bins with non-zero sequencing depth in the recruitment
72
+ #' plot (regardless of peak count).
73
+ #' @slot err.res \code{(numeric)}
74
+ #' Error left after adding the peak (mower) or log-likelihood (em or emauto).
75
+ #' @slot merge.logdist \code{(numeric)}
76
+ #' Attempted \code{merge.logdist} parameter.
77
+ #' @slot seq.depth \code{(numeric)}
78
+ #' Best estimate available for the sequencing depth of the peak (centrality).
79
+ #' @slot log \code{(logical)}
80
+ #' Indicates if the estimation was performed in natural logarithm space.
81
+ #'
82
+ #' @author Luis M. Rodriguez-R [aut, cre]
83
+ #'
84
+ #' @exportClass
85
+
86
+ enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
87
+ representation(
88
+ # slots = list(
89
+ dist='character',
90
+ values='numeric',
91
+ values.res='numeric',
92
+ mode='numeric',
93
+ param.hat='list',
94
+ n.hat='numeric',
95
+ n.total='numeric',
96
+ err.res='numeric',
97
+ merge.logdist='numeric',
98
+ seq.depth='numeric',
99
+ log='logical'
100
+ ));
101
+
102
+ #' Attribute accessor
103
+ #'
104
+ #'
105
+ #' @param x Object
106
+ #' @param name Attribute name
107
+ setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
108
+
109
+ #' Attribute accessor
110
+ #'
111
+ #'
112
+ #' @param x Object
113
+ #' @param name Attribute name
114
+ setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
115
+
116
+ #==============> Define S4 methods
117
+
118
+ #' Enveomics: Recruitment Plot (2)
119
+ #'
120
+ #' Plots an \code{\link{enve.RecPlot2}} object.
121
+ #'
122
+ #' @param x
123
+ #' \code{\link{enve.RecPlot2}} object to plot.
124
+ #' @param layout
125
+ #' Matrix indicating the position of the different panels in the layout,
126
+ #' where:
127
+ #' \itemize{
128
+ #' \item 0: Empty space
129
+ #' \item 1: Counts matrix
130
+ #' \item 2: position histogram (sequencing depth)
131
+ #' \item 3: identity histogram
132
+ #' \item 4: Populations histogram (histogram of sequencing depths)
133
+ #' \item 5: Color scale for the counts matrix (vertical)
134
+ #' \item 6: Color scale of the counts matrix (horizontal)
135
+ #' }
136
+ #' Only panels indicated here will be plotted. To plot only one panel
137
+ #' simply set this to the number of the panel you want to plot.
138
+ #' @param panel.fun
139
+ #' List of functions to be executed after drawing each panel. Use the
140
+ #' indices in \code{layout} (as characters) as keys. Functions for indices
141
+ #' missing in \code{layout} are ignored. For example, to add a vertical line
142
+ #' at the 3Mbp mark in both the position histogram and the counts matrix:
143
+ #' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
144
+ #' Note that the X-axis in both panels is in Mbp by default. To change
145
+ #' this behavior, set \code{pos.units} accordingly.
146
+ #' @param widths
147
+ #' Relative widths of the columns of \code{layout}.
148
+ #' @param heights
149
+ #' Relative heights of the rows of \code{layout}.
150
+ #' @param palette
151
+ #' Colors to be used to represent the counts matrix, sorted from no hits
152
+ #' to the maximum sequencing depth.
153
+ #' @param underlay.group
154
+ #' If TRUE, it indicates the in-group and out-group areas couloured based
155
+ #' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
156
+ #' @param peaks.col
157
+ #' If not \code{NA}, it attempts to represent peaks in the population histogram
158
+ #' in the specified color. Set to \code{NA} to avoid peak-finding.
159
+ #' @param use.peaks
160
+ #' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
161
+ #' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
162
+ #' @param id.lim
163
+ #' Limits of identities to represent.
164
+ #' @param pos.lim
165
+ #' Limits of positions to represent (in bp, regardless of \code{pos.units}).
166
+ #' @param pos.units
167
+ #' Units in which the positions should be represented (powers of 1,000
168
+ #' base pairs).
169
+ #' @param mar
170
+ #' Margins of the panels as a list, with the character representation of
171
+ #' the number of the panel as index (see \code{layout}).
172
+ #' @param pos.splines
173
+ #' Smoothing parameter for the splines in the position histogram. Zero
174
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
175
+ #' cross-validation.
176
+ #' @param id.splines
177
+ #' Smoothing parameter for the splines in the identity histogram. Zero
178
+ #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
179
+ #' cross-validation.
180
+ #' @param in.lwd
181
+ #' Line width for the sequencing depth of in-group matches.
182
+ #' @param out.lwd
183
+ #' Line width for the sequencing depth of out-group matches.
184
+ #' @param id.lwd
185
+ #' Line width for the identity histogram.
186
+ #' @param in.col
187
+ #' Color associated to in-group matches.
188
+ #' @param out.col
189
+ #' Color associated to out-group matches.
190
+ #' @param id.col
191
+ #' Color for the identity histogram.
192
+ #' @param breaks.col
193
+ #' Color of the vertical lines indicating sequence breaks.
194
+ #' @param peaks.opts
195
+ #' Options passed to \code{\link{enve.recplot2.findPeaks}},
196
+ #' if \code{peaks.col} is not \code{NA}.
197
+ #' @param ...
198
+ #' Any other graphic parameters (currently ignored).
199
+ #'
200
+ #' @return
201
+ #' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
202
+ #' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
203
+ #' \code{layout} doesn't include 4, returns \code{NA}.
204
+ #'
205
+ #' @author Luis M. Rodriguez-R [aut, cre]
206
+ #'
207
+ #' @method plot enve.RecPlot2
208
+ #' @export
209
+
210
+ plot.enve.RecPlot2 <- function
211
+ (x,
212
+ layout=matrix(c(5,5,2,1,4,3), nrow=2),
213
+ panel.fun=list(),
214
+ widths=c(1,7,2),
215
+ heights=c(1,2),
216
+ palette=grey((100:0)/100),
217
+ underlay.group=TRUE,
218
+ peaks.col='darkred',
219
+ use.peaks,
220
+ id.lim=range(x$id.breaks),
221
+ pos.lim=range(x$pos.breaks),
222
+ pos.units=c('Mbp','Kbp','bp'),
223
+ mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
224
+ '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
225
+ '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
226
+ '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
227
+ pos.splines=0,
228
+ id.splines=1/2,
229
+ in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
230
+ out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
231
+ id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
232
+ in.col='darkblue',
233
+ out.col='lightblue',
234
+ id.col='black',
235
+ breaks.col='#AAAAAA40',
236
+ peaks.opts=list(),
237
+ ...
238
+ ){
239
+ pos.units <- match.arg(pos.units);
240
+ pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
241
+ pos.lim <- pos.lim/pos.factor;
242
+ lmat <- layout;
243
+ for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
244
+
245
+ layout(lmat, widths=widths, heights=heights);
246
+ ori.mar <- par('mar');
247
+
248
+ # Essential vars
249
+ counts <- x$counts
250
+
251
+ id.ingroup <- x$id.ingroup
252
+ id.counts <- x$id.counts
253
+ id.breaks <- x$id.breaks
254
+ id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
255
+ id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
256
+
257
+ pos.counts.in <- x$pos.counts.in
258
+ pos.counts.out <- x$pos.counts.out
259
+ pos.breaks <- x$pos.breaks/pos.factor
260
+ pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
261
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
262
+
263
+ seqdepth.in <- pos.counts.in/pos.binsize
264
+ seqdepth.out <- pos.counts.out/pos.binsize
265
+ seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
266
+ seqdepth.out[seqdepth.out>0]))*c(1/2,2)
267
+
268
+ if(underlay.group){
269
+ in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
270
+ list(maxColorValue=256, alpha=62)));
271
+ out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
272
+ list(maxColorValue=256, alpha=52)));
273
+ }
274
+
275
+ # [1] Counts matrix
276
+ if(any(layout==1)){
277
+ par(mar=mar[['1']]);
278
+ plot(1, t='n', bty='l',
279
+ xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
280
+ xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
281
+ if(underlay.group){
282
+ rect(pos.lim[1], id.lim[1], pos.lim[2],
283
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
284
+ rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
285
+ id.lim[2], col=in.bg, border=NA);
286
+ }
287
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col);
288
+ image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
289
+ bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
290
+ length.out=1+length(palette)), add=TRUE);
291
+ if(exists('1',panel.fun)) panel.fun[['1']]();
292
+ }
293
+
294
+ # [2] Position histogram
295
+ if(any(layout==2)){
296
+ par(mar=mar[['2']]);
297
+ if(any(layout==1)){
298
+ xlab=''
299
+ xaxt='n'
300
+ }else{
301
+ xlab=paste('Position in genome (',pos.units,')',sep='')
302
+ xaxt='s'
303
+ }
304
+ plot(1,t='n', bty='l', log='y',
305
+ xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
306
+ ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
307
+ abline(v=x$seq.breaks/pos.factor, col=breaks.col)
308
+ pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
309
+ pos.f <- rep(seqdepth.in,each=2)
310
+ lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
311
+ lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
312
+ if(is.null(pos.splines) || pos.splines > 0){
313
+ pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
314
+ spar=pos.splines)
315
+ lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
316
+ }
317
+ if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
318
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
319
+ seqdepth.lim[1]*3/2, col=out.col, border=NA);
320
+ if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
321
+ seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
322
+ seqdepth.lim[1]*3/2, col=in.col, border=NA);
323
+ if(exists('2',panel.fun)) panel.fun[['2']]();
324
+ }
325
+
326
+ # [3] Identity histogram
327
+ if(any(layout==3)){
328
+ par(mar=mar[['3']]);
329
+ if(any(layout==1)){
330
+ ylab=''
331
+ yaxt='n'
332
+ }else{
333
+ ylab=x$id.metric
334
+ yaxt='s'
335
+ }
336
+ if(sum(id.counts>0) >= 4){
337
+ id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
338
+ plot(1,t='n', bty='l', log='x',
339
+ xlim=id.counts.range, xlab='bps per bin', xaxs='i',
340
+ ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
341
+ if(underlay.group){
342
+ rect(id.counts.range[1], id.lim[1], id.counts.range[2],
343
+ min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
344
+ rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
345
+ id.counts.range[2], id.lim[2], col=in.bg, border=NA);
346
+ }
347
+ id.f <- rep(id.counts,each=2)
348
+ id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
349
+ lines(id.f, id.x, lwd=id.lwd, col=id.col);
350
+ if(is.null(id.splines) || id.splines > 0){
351
+ id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
352
+ spar=id.splines)
353
+ lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
354
+ }
355
+ }else{
356
+ plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
357
+ text(1,1,labels='Insufficient data', srt=90)
358
+ }
359
+ if(exists('3',panel.fun)) panel.fun[['3']]();
360
+ }
361
+
362
+ # [4] Populations histogram
363
+ peaks <- NA;
364
+ if(any(layout==4)){
365
+ par(mar=mar[['4']]);
366
+ if(any(layout==2)){
367
+ ylab=''
368
+ yaxt='n'
369
+ }else{
370
+ ylab='Sequencing depth (X)'
371
+ yaxt='s'
372
+ }
373
+ h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
374
+ length.out=200);
375
+ h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
376
+ h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
377
+ plot(1, t='n', log='y',
378
+ xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
379
+ xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
380
+ yaxt=yaxt)
381
+ y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
382
+ y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
383
+ lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
384
+ y.tmp.out, col=out.col)
385
+ polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
386
+ y.tmp.in, border=NA, col=in.col)
387
+ if(!is.na(peaks.col)){
388
+ o <- peaks.opts; o$x = x;
389
+ if(missing(use.peaks)){
390
+ peaks <- do.call(enve.recplot2.findPeaks, o)
391
+ }else{
392
+ peaks <- use.peaks
393
+ }
394
+ h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
395
+ if(!is.null(peaks) & length(peaks)>0){
396
+ pf <- h.mids*0;
397
+ for(i in 1:length(peaks)){
398
+ cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
399
+ lines(cnt, h.mids, col='red');
400
+ pf <- pf+cnt;
401
+ axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
402
+ }
403
+ lines(pf, h.mids, col='red',lwd=1.5);
404
+ dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
405
+ frx <- signif(100*as.numeric(
406
+ lapply(peaks,
407
+ function(x) ifelse(length(x$values)==0, x$n.hat,
408
+ length(x$values))/x$n.total)), 2)
409
+ if(peaks[[1]]$err.res < 0){
410
+ err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
411
+ }else{
412
+ err <- paste(', err:',
413
+ signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
414
+ }
415
+ legend('topright', bty='n', cex=1/2,
416
+ legend=paste(letters[1:length(peaks)],'. ',
417
+ dpt,'X (', frx, '%', err, ')', sep=''))
418
+ }
419
+ }
420
+ if(exists('4',panel.fun)) panel.fun[['4']]();
421
+ }
422
+
423
+ # [5] Color scale of the counts matrix (vertical)
424
+ count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
425
+ length.out=1+length(palette))
426
+ if(any(layout==5)){
427
+ par(mar=mar[['5']]);
428
+ plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
429
+ ylim=range(count.bins), yaxs='i', ylab='')
430
+ rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
431
+ border=NA)
432
+ if(exists('5',panel.fun)) panel.fun[['5']]();
433
+ }
434
+
435
+ # [6] Color scale of the coutnts matrix (horizontal)
436
+ if(any(layout==6)){
437
+ par(mar=mar[['6']]);
438
+ plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
439
+ xlim=range(count.bins), xaxs='i',xlab='');
440
+ rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
441
+ border=NA);
442
+ if(exists('6',panel.fun)) panel.fun[['6']]();
443
+ }
444
+
445
+ par(mar=ori.mar);
446
+ return(peaks);
447
+ }
448
+
449
+ #==============> Define core functions
450
+
451
+ #' Enveomics: Recruitment Plot (2)
452
+ #'
453
+ #' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
454
+ #' been previously executed.
455
+ #'
456
+ #' @param prefix
457
+ #' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
458
+ #' least the files .rec and .lim must exist with this prefix.
459
+ #' @param plot
460
+ #' Should the object be plotted?
461
+ #' @param pos.breaks
462
+ #' Breaks in the positions histogram. It can also be a vector of break
463
+ #' points, and values outside the range are ignored. If zero (0), it
464
+ #' uses the sequence breaks as defined in the .lim file, which means
465
+ #' one bin per contig (or gene, if the mapping is agains genes). Ignored
466
+ #' if `pos.breaks.tsv` is passed.
467
+ #' @param pos.breaks.tsv
468
+ #' Path to a list of (absolute) coordinates to use as position breaks.
469
+ #' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
470
+ #' must contain at least one column: coordinates of the break positions of
471
+ #' each position bin. If it has a second column, this is used as the name
472
+ #' of the position bin that ends at the given coordinate (the first row is
473
+ #' ignored). Any additional columns are currently ignored. If \code{NA},
474
+ #' position bins are determined by \code{pos.breaks}.
475
+ #' @param id.breaks
476
+ #' Breaks in the identity histogram. It can also be a vector of break
477
+ #' points, and values outside the range are ignored.
478
+ #' @param id.free.range
479
+ #' Indicates that the range should be freely set from the observed
480
+ #' values. Otherwise, 70-100\% is included in the identity histogram
481
+ #' (default).
482
+ #' @param id.metric
483
+ #' Metric of identity to be used (Y-axis). Corrected identity is only
484
+ #' supported if the original BLAST file included sequence lengths.
485
+ #' @param id.summary
486
+ #' Function summarizing the identity bins. Other recommended options
487
+ #' include: \code{median} to estimate the median instead of total bins, and
488
+ #' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
489
+ #' @param id.cutoff
490
+ #' Cutoff of identity metric above which the hits are considered
491
+ #' \code{in-group}. The 95\% identity corresponds to the expectation of
492
+ #' ANI<95\% within species.
493
+ #' @param threads
494
+ #' Number of threads to use.
495
+ #' @param verbose
496
+ #' Indicates if the function should report the advance.
497
+ #' @param ...
498
+ #' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
499
+ #'
500
+ #' @return Returns an object of class \code{\link{enve.RecPlot2}}.
501
+ #'
502
+ #' @author Luis M. Rodriguez-R [aut, cre]
503
+ #' @author Kenji Gerhardt [aut]
504
+ #'
505
+ #' @export
506
+
507
+ enve.recplot2 <- function(
508
+ prefix,
509
+ plot = TRUE,
510
+ pos.breaks = 1e3,
511
+ pos.breaks.tsv = NA,
512
+ id.breaks = 60,
513
+ id.free.range = FALSE,
514
+ id.metric = c('identity', 'corrected identity', 'bit score'),
515
+ id.summary = sum,
516
+ id.cutoff = 95,
517
+ threads = 2,
518
+ verbose = TRUE,
519
+ ...
520
+ ){
521
+ # Settings
522
+ id.metric <- match.arg(id.metric);
523
+
524
+ #Read files
525
+ if (verbose) cat("Reading files.\n")
526
+ rec <- read.table(paste(prefix, ".rec", sep = ""),
527
+ sep = "\t", comment.char = "", quote = "");
528
+ lim <- read.table(paste(prefix, ".lim", sep = ""),
529
+ sep = "\t", comment.char = "", quote = "", as.is = TRUE);
530
+
531
+ # Build matrix
532
+ if (verbose) cat("Building counts matrix.\n")
533
+ if (id.metric == "corrected identity" & ncol(rec) < 6) {
534
+ stop("Requesting corr. identity, but .rec file doesn't have 6th column")
535
+ }
536
+ rec.idcol <- ifelse(id.metric == "identity", 3,
537
+ ifelse(id.metric == "corrected identity", 6, 4))
538
+ pos.names <- as.character(NULL)
539
+ if (!is.na(pos.breaks.tsv)){
540
+ tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
541
+ pos.breaks <- as.numeric(tmp[, 1])
542
+ if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
543
+ } else if (length(pos.breaks) == 1) {
544
+ if (pos.breaks > 0){
545
+ pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
546
+ } else {
547
+ pos.breaks <- c(lim[1, 2], lim[, 3])
548
+ pos.names <- lim[, 1]
549
+ }
550
+ }
551
+ if (length(id.breaks) == 1) {
552
+ id.range.v <- rec[, rec.idcol]
553
+ if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
554
+ id.range.v <- range(id.range.v)
555
+ id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
556
+ }
557
+
558
+ # Run in parallel
559
+ # If they already set threads to 1 manually, there's no point in launching
560
+ # clusters, it's just slower. Ditto for small files.
561
+ if (nrow(rec) < 75000 | threads == 1) {
562
+ # Coerces rec into a form that __counts is happy about
563
+ rec.l <- list()
564
+ rec.l[[1]] <- list(rec = rec, verbose = FALSE)
565
+
566
+ # No need to make a temporary variable, there's only one return for sure
567
+ # and it's not a list because it isn't coming back from an apply
568
+ counts <- enve.recplot2.__counts(
569
+ rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
570
+ rec.idcol = rec.idcol)
571
+ } else {
572
+ cl <- makeCluster(threads)
573
+ rec.l <- list()
574
+ thl <- ceiling(nrow(rec)/threads)
575
+ for (i in 0:(threads - 1)) {
576
+ rec.l[[i + 1]] <- list(
577
+ rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
578
+ verbose = ifelse(i == 0, verbose, FALSE))
579
+ }
580
+ counts.l <- clusterApply(
581
+ cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
582
+ id.breaks = id.breaks, rec.idcol = rec.idcol)
583
+ stopCluster(cl) # No spooky ghost clusters
584
+
585
+ counts <- counts.l[[1]]
586
+ for (i in 2:threads) counts <- counts + counts.l[[i]]
587
+ }
588
+
589
+ # Estimate 1D histograms
590
+ if (verbose) cat("Building histograms.\n")
591
+ id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
592
+ id.ingroup <- (id.mids > id.cutoff);
593
+ id.counts <- apply(counts, 2, id.summary);
594
+ pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
595
+ pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
596
+
597
+ # Plot and return
598
+ recplot <- new('enve.RecPlot2',
599
+ counts = counts, id.counts = id.counts,
600
+ pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
601
+ id.breaks = id.breaks, pos.breaks = pos.breaks,
602
+ pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
603
+ seq.names = lim[, 1], id.ingroup = id.ingroup,
604
+ id.metric = id.metric, call = match.call());
605
+ if (plot) {
606
+ if (verbose) cat("Plotting.\n")
607
+ peaks <- plot(recplot, ...);
608
+ attr(recplot, "peaks") <- peaks
609
+ }
610
+ return(recplot);
611
+ }
612
+
613
+ #' Enveomics: Recruitment Plot (2) Peak Finder
614
+ #'
615
+ #' Identifies peaks in the population histogram potentially indicating
616
+ #' sub-population mixtures.
617
+ #'
618
+ #' @param x
619
+ #' An \code{\link{enve.RecPlot2}} object.
620
+ #' @param method
621
+ #' Peak-finder method. This should be one of:
622
+ #' \itemize{
623
+ #' \item \strong{emauto}
624
+ #' (Expectation-Maximization with auto-selection of components)
625
+ #' \item \strong{em}
626
+ #' (Expectation-Maximization)
627
+ #' \item \strong{mower}
628
+ #' (Custom distribution-mowing method)
629
+ #' }
630
+ #' @param ...
631
+ #' Any additional parameters supported by
632
+ #' \code{\link{enve.recplot2.findPeaks}}.
633
+ #'
634
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
635
+ #'
636
+ #' @author Luis M. Rodriguez-R [aut, cre]
637
+ #'
638
+ #' export
639
+
640
+ enve.recplot2.findPeaks <- function(
641
+ x,
642
+ method="emauto",
643
+ ...
644
+ ){
645
+ if(method == "emauto"){
646
+ peaks <- enve.recplot2.findPeaks.emauto(x, ...)
647
+ }else if(method == "em"){
648
+ peaks <- enve.recplot2.findPeaks.em(x, ...)
649
+ }else if(method == "mower"){
650
+ peaks <- enve.recplot2.findPeaks.mower(x, ...)
651
+ }else{
652
+ stop("Invalid peak-finder method ", method)
653
+ }
654
+ return(peaks)
655
+ }
656
+
657
+ #' Enveomics: Recruitment Plot (2) Emauto Peak Finder
658
+ #'
659
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
660
+ #' Model Expectation Maximization (GMM-EM) method with number of components
661
+ #' automatically detected.
662
+ #'
663
+ #' @param x
664
+ #' An \code{\link{enve.RecPlot2}} object.
665
+ #' @param components
666
+ #' A vector of number of components to evaluate.
667
+ #' @param criterion
668
+ #' Criterion to use for components selection. Must be one of:
669
+ #' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
670
+ #' (Bayesian Information Criterion or Schwarz Criterion).
671
+ #' @param merge.tol
672
+ #' When attempting to merge peaks with very similar sequencing depth, use
673
+ #' this number of significant digits (in log-scale).
674
+ #' @param verbose
675
+ #' Display (mostly debugging) information.
676
+ #' @param ...
677
+ #' Any additional parameters supported by
678
+ #' \code{\link{enve.recplot2.findPeaks.em}}.
679
+ #'
680
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
681
+ #'
682
+ #' @author Luis M. Rodriguez-R [aut, cre]
683
+ #'
684
+ #' @export
685
+
686
+ enve.recplot2.findPeaks.emauto <- function(
687
+ x,
688
+ components = seq(1, 5),
689
+ criterion = 'aic',
690
+ merge.tol = 2L,
691
+ verbose = FALSE,
692
+ ...
693
+ ){
694
+ best <- list(crit=0, pstore=list())
695
+ if(criterion == 'aic'){
696
+ do_crit <- function(ll, k, n) 2*k - 2*ll
697
+ }else if(criterion %in% c('bic', 'sbc')){
698
+ do_crit <- function(ll, k, n) log(n)*k - 2*ll
699
+ }else{
700
+ stop('Invalid criterion ', criterion)
701
+ }
702
+ for(comp in components){
703
+ if(verbose) cat('Testing:',comp,'\n')
704
+ best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
705
+ verbose, ...)
706
+ }
707
+ if(length(best[['peaks']])==0) return(list())
708
+
709
+ seqdepths.r <- signif(log(sapply(best[['peaks']],
710
+ function(x) x$seq.depth)), merge.tol)
711
+ distinct <- length(unique(seqdepths.r))
712
+ if(distinct < length(best[['peaks']])){
713
+ if(verbose) cat('Attempting merge to', distinct, 'components\n')
714
+ init <- apply(sapply(best[['peaks']],
715
+ function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
716
+ init <- init[!duplicated(seqdepths.r),]
717
+ init <- list(mu=init[,'mean'], sd=init[,'sd'],
718
+ alpha=init[,'alpha']/sum(init[,'alpha']))
719
+ best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
720
+ verbose, ...)
721
+ }
722
+ return(best[['peaks']])
723
+ }
724
+
725
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder
726
+ #'
727
+ #' Identifies peaks in the population histogram using a Gaussian Mixture
728
+ #' Model Expectation Maximization (GMM-EM) method.
729
+ #'
730
+ #' @param x
731
+ #' An \code{\link{enve.RecPlot2}} object.
732
+ #' @param max.iter
733
+ #' Maximum number of EM iterations.
734
+ #' @param ll.diff.res
735
+ #' Maximum Log-Likelihood difference to be considered as convergent.
736
+ #' @param components
737
+ #' Number of distributions assumed in the mixture.
738
+ #' @param rm.top
739
+ #' Top-values to remove before finding peaks, as a quantile probability.
740
+ #' This step is useful to remove highly conserved regions, but can be
741
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
742
+ #' \strong{after} removing zero-coverage windows.
743
+ #' @param verbose
744
+ #' Display (mostly debugging) information.
745
+ #' @param init
746
+ #' Initialization parameters. By default, these are derived from k-means
747
+ #' clustering. A named list with vectors for \code{mu}, \code{sd}, and
748
+ #' \code{alpha}, each of length \code{components}.
749
+ #' @param log
750
+ #' Logical value indicating if the estimations should be performed in
751
+ #' natural logarithm units. Do not change unless you know what you're
752
+ #' doing.
753
+ #'
754
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
755
+ #'
756
+ #' @author Luis M. Rodriguez-R [aut, cre]
757
+ #'
758
+ #' @export
759
+
760
+ enve.recplot2.findPeaks.em <- function(
761
+ x,
762
+ max.iter = 1000,
763
+ ll.diff.res = 1e-8,
764
+ components = 2,
765
+ rm.top = 0.05,
766
+ verbose = FALSE,
767
+ init,
768
+ log = TRUE
769
+ ){
770
+
771
+ # Essential vars
772
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
773
+ lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
774
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
775
+ if(log) lsd1 <- log(lsd1)
776
+
777
+ # 1. Initialize
778
+ if(missing(init)){
779
+ km.clust <- kmeans(lsd1, components)$cluster
780
+ init <- list(
781
+ mu = tapply(lsd1, km.clust, mean),
782
+ sd = tapply(lsd1, km.clust, sd),
783
+ alpha = table(km.clust) / length(km.clust)
784
+ )
785
+ }
786
+ m.step <- init
787
+ ll <- c()
788
+ cur.ll <- -Inf
789
+
790
+ for(i in 1:max.iter){
791
+ # 2/3. EM
792
+ e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
793
+ m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
794
+ # 4. Convergence
795
+ ll <- c(ll, e.step[["ll"]])
796
+ ll.diff <- abs(cur.ll - e.step[["ll"]])
797
+ cur.ll <- e.step[["ll"]]
798
+ if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
799
+ if(is.na(ll.diff) || ll.diff == Inf) break
800
+ if(ll.diff <= ll.diff.res) break
801
+ }
802
+
803
+ # Return
804
+ peaks <- list()
805
+ for(i in 1:components){
806
+ n.hat <- m.step[['alpha']][i]*length(lsd1)
807
+ peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
808
+ values.res=0, mode=m.step[['mu']][i],
809
+ param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
810
+ n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
811
+ merge.logdist=as.numeric(), log=log,
812
+ seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
813
+ }
814
+ return(peaks)
815
+ }
816
+
817
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder
818
+ #'
819
+ #' Identifies peaks in the population histogram potentially indicating
820
+ #' sub-population mixtures, using a custom distribution-mowing method.
821
+ #'
822
+ #' @param x
823
+ #' An \code{\link{enve.RecPlot2}} object.
824
+ #' @param min.points
825
+ #' Minimum number of points in the quantile-estimation-range
826
+ #' \code{(quant.est)} to estimate a peak.
827
+ #' @param quant.est
828
+ #' Range of quantiles to be used in the estimation of a peak's
829
+ #' parameters.
830
+ #' @param mlv.opts
831
+ #' Ignored. For backwards compatibility.
832
+ #' @param fitdist.opts.sn
833
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
834
+ #' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
835
+ #' ammended with \code{xi=estimated} mode for each peak.
836
+ #' @param fitdist.opts.norm
837
+ #' Options passed to \code{fitdist} to estimate the standard deviation if
838
+ #' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
839
+ #' ammended with \code{mean=estimated} mode for each peak.
840
+ #' @param rm.top
841
+ #' Top-values to remove before finding peaks, as a quantile probability.
842
+ #' This step is useful to remove highly conserved regions, but can be
843
+ #' turned off by setting \code{rm.top=0}. The quantile is determined
844
+ #' \strong{after} removing zero-coverage windows.
845
+ #' @param with.skewness
846
+ #' Allow skewness correction of the peaks. Typically, the
847
+ #' sequencing-depth distribution for a single peak is left-skewed, due
848
+ #' partly (but not exclusively) to fragmentation and mapping sensitivity.
849
+ #' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
850
+ #' alternative solution for the first problem (fragmentation) called
851
+ #' "tail distribution".
852
+ #' @param optim.rounds
853
+ #' Maximum rounds of peak optimization.
854
+ #' @param optim.epsilon
855
+ #' Trace change at which optimization stops (unless \code{optim.rounds} is
856
+ #' reached first). The trace change is estimated as the sum of square
857
+ #' differences between parameters in one round and those from two rounds
858
+ #' earlier (to avoid infinite loops from approximation).
859
+ #' @param merge.logdist
860
+ #' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
861
+ #' to attempt merging. The default of ~0.22 corresponds to a maximum
862
+ #' difference of 25\%.
863
+ #' @param verbose
864
+ #' Display (mostly debugging) information.
865
+ #' @param log
866
+ #' Logical value indicating if the estimations should be performed in
867
+ #' natural logarithm units. Do not change unless you know what you're
868
+ #' doing.
869
+ #'
870
+ #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
871
+ #'
872
+ #' @author Luis M. Rodriguez-R [aut, cre]
873
+ #'
874
+ #' @export
875
+
876
+ enve.recplot2.findPeaks.mower <- function(
877
+ x,
878
+ min.points=10,
879
+ quant.est=c(0.002, 0.998),
880
+ mlv.opts=list(method='parzen'),
881
+ fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
882
+ start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
883
+ fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
884
+ start=list(sd=1), lower=c(0, -Inf)),
885
+ rm.top=0.05,
886
+ with.skewness=TRUE,
887
+ optim.rounds=200,
888
+ optim.epsilon=1e-4,
889
+ merge.logdist=log(1.75),
890
+ verbose=FALSE,
891
+ log=TRUE
892
+ ){
893
+
894
+ # Essential vars
895
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
896
+ seqdepth.in <- x$pos.counts.in/pos.binsize;
897
+ lsd1 <- seqdepth.in[seqdepth.in>0];
898
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
899
+ if(log) lsd1 <- log(lsd1)
900
+ if(with.skewness){
901
+ fitdist.opts <- fitdist.opts.sn
902
+ }else{
903
+ fitdist.opts <- fitdist.opts.norm
904
+ }
905
+ peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
906
+ mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
907
+ optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
908
+ n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
909
+
910
+ # Find seed peaks
911
+ if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
912
+ peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
913
+
914
+ # Merge overlapping peaks
915
+ if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
916
+ merged <- (length(peaks)>1)
917
+ while(merged){
918
+ merged <- FALSE
919
+ ignore <- c()
920
+ peaks2 <- list();
921
+ for(i in 1:length(peaks)){
922
+ if(i %in% ignore) next
923
+ p <- peaks[[ i ]]
924
+ j <- enve.recplot2.__whichClosestPeak(p, peaks)
925
+ p2 <- peaks[[ j ]]
926
+ dst.a <- p$param.hat[[ length(p$param.hat) ]]
927
+ dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
928
+ if( abs(log(dst.a/dst.b)) < merge.logdist ){
929
+ if(verbose) cat('==> Attempting a merge at',
930
+ p$param.hat[[ length(p$param.hat) ]],'&',
931
+ p2$param.hat[[ length(p2$param.hat) ]],'X\n');
932
+ peaks.opts$lsd1 <- c(p$values, p2$values)
933
+ p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
934
+ if(length(p.new)==1){
935
+ peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
936
+ ignore <- c(ignore, j)
937
+ merged <- TRUE
938
+ }
939
+ }
940
+ if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
941
+ }
942
+ peaks <- peaks2
943
+ if(length(peaks)==1) break
944
+ }
945
+
946
+ if(verbose) cat('Found',length(peaks),'peak(s)\n')
947
+ return(peaks);
948
+ }
949
+
950
+ #==============> Define utils
951
+
952
+ #' Enveomics: Recruitment Plot (2) Core Peak Finder
953
+ #'
954
+ #' Finds the peak in a list of peaks that is most likely to represent the
955
+ #' "core genome" of a population.
956
+ #'
957
+ #' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
958
+ #'
959
+ #' @author Luis M. Rodriguez-R [aut, cre]
960
+ #'
961
+ #' @export
962
+
963
+ enve.recplot2.corePeak <- function
964
+ (x
965
+ ){
966
+ # Find the peak with maximum depth (centrality)
967
+ maxPeak <- x[[
968
+ which.max(as.numeric(lapply(x,
969
+ function(y) y$param.hat[[ length(y$param.hat) ]])))
970
+ ]]
971
+ # If a "larger" peak (a peak explaining more bins of the genome) is within
972
+ # the default "merge.logdist" distance, take that one instead.
973
+ corePeak <- maxPeak
974
+ for(p in x){
975
+ p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
976
+ corePeak.len <- ifelse(
977
+ length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
978
+ sz.d <- log(p.len/corePeak.len)
979
+ if(is.nan(sz.d) || sz.d < 0) next
980
+ sq.d.a <- as.numeric(tail(p$param.hat, n=1))
981
+ sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
982
+ if(p$log) sq.d.a <- exp(sq.d.a)
983
+ if(corePeak$log) sq.d.b <- exp(sq.d.b)
984
+ if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
985
+ }
986
+ return(corePeak)
987
+ }
988
+
989
+ #' Enveomics: Recruitment Plot (2) Change Cutoff
990
+ #'
991
+ #' Change the intra-species cutoff of an existing recruitment plot.
992
+ #'
993
+ #' @param rp
994
+ #' \code{\link{enve.RecPlot2}} object.
995
+ #' @param new.cutoff
996
+ #' New cutoff to use.
997
+ #'
998
+ #' @author Luis M. Rodriguez-R [aut, cre]
999
+ #'
1000
+ #' @export
1001
+
1002
+ enve.recplot2.changeCutoff <- function
1003
+ (rp,
1004
+ new.cutoff=98
1005
+ ){
1006
+ # Re-calculate vectors
1007
+ id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
1008
+ id.ingroup <- (id.mids > new.cutoff)
1009
+ pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
1010
+ pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
1011
+ # Update object
1012
+ attr(rp, "id.ingroup") <- id.ingroup
1013
+ attr(rp, "pos.counts.in") <- pos.counts.in
1014
+ attr(rp, "pos.counts.out") <- pos.counts.out
1015
+ attr(rp, "call") <- match.call()
1016
+ return(rp)
1017
+ }
1018
+
1019
+ #' Enveomics: Recruitment Plot (2) Window Depth Threshold
1020
+ #'
1021
+ #' Identifies the threshold below which windows should be identified as
1022
+ #' variable or absent.
1023
+ #'
1024
+ #' @param rp
1025
+ #' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
1026
+ #' @param peak
1027
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1028
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1029
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1030
+ #' @param lower.tail
1031
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1032
+ #' sequencing depth.
1033
+ #' @param significance
1034
+ #' Significance threshold (alpha) to select windows.
1035
+ #'
1036
+ #' @return
1037
+ #' Returns a float. The units are depth if the peaks were estimated in
1038
+ #' linear scale, or log-depth otherwise (\code{peak$log}).
1039
+ #'
1040
+ #' @author Luis M. Rodriguez-R [aut, cre]
1041
+ #'
1042
+ #' @export
1043
+
1044
+ enve.recplot2.windowDepthThreshold <- function
1045
+ (rp,
1046
+ peak,
1047
+ lower.tail=TRUE,
1048
+ significance=0.05
1049
+ ){
1050
+ if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
1051
+ par <- peak$param.hat
1052
+ par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
1053
+ thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
1054
+ if(peak$log) thr <- exp(thr)
1055
+
1056
+ return(thr)
1057
+ }
1058
+
1059
+ #' Enveomics: Recruitment Plot (2) Extract Windows
1060
+ #'
1061
+ #' Extract windows significantly below (or above) the peak in sequencing
1062
+ #' depth.
1063
+ #'
1064
+ #' @param rp
1065
+ #' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
1066
+ #' @param peak
1067
+ #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1068
+ #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1069
+ #' used (see \code{\link{enve.recplot2.corePeak}}).
1070
+ #' @param lower.tail
1071
+ #' If \code{FALSE}, it returns windows significantly above the peak in
1072
+ #' sequencing depth.
1073
+ #' @param significance
1074
+ #' Significance threshold (alpha) to select windows.
1075
+ #' @param seq.names
1076
+ #' Returns subject sequence names instead of a vector of Booleans. If
1077
+ #' the recruitment plot was generated with named position bins (e.g, using
1078
+ #' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
1079
+ #' vector of characters (the sequence identifiers), otherwise it returns a
1080
+ #' data.frame with a name column and two columns of coordinates.
1081
+ #'
1082
+ #' @return
1083
+ #' Returns a vector of logicals if \code{seq.names = FALSE}.
1084
+ #' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
1085
+ #' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
1086
+ #' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
1087
+ #'
1088
+ #' @author Luis M. Rodriguez-R [aut, cre]
1089
+ #'
1090
+ #' @export
1091
+
1092
+ enve.recplot2.extractWindows <- function
1093
+ (rp,
1094
+ peak,
1095
+ lower.tail = TRUE,
1096
+ significance = 0.05,
1097
+ seq.names = FALSE
1098
+ ){
1099
+ # Determine the threshold
1100
+ thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
1101
+
1102
+ # Select windows past the threshold
1103
+ seqdepth.in <- enve.recplot2.seqdepth(rp)
1104
+ if(lower.tail){
1105
+ sel <- seqdepth.in < thr
1106
+ }else{
1107
+ sel <- seqdepth.in > thr
1108
+ }
1109
+
1110
+ # seq.names = FALSE
1111
+ if(!seq.names) return(sel)
1112
+ # seq.names = TRUE
1113
+ return(enve.recplot2.coordinates(rp, sel))
1114
+ }
1115
+
1116
+ #' Enveomics: Recruitment Plot (2) Compare Identities
1117
+ #'
1118
+ #' Compare the distribution of identities between two
1119
+ #' \code{\link{enve.RecPlot2}} objects.
1120
+ #'
1121
+ #' @param x
1122
+ #' First \code{\link{enve.RecPlot2}} object.
1123
+ #' @param y
1124
+ #' Second \code{\link{enve.RecPlot2}} object.
1125
+ #' @param method
1126
+ #' Distance method to use. This should be (an unambiguous abbreviation of)
1127
+ #' one of:
1128
+ #' \itemize{
1129
+ #' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
1130
+ #' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
1131
+ #' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
1132
+ #' doi:10.1214/aoms/1177729694}), or}
1133
+ #' \item{"euclidean"}
1134
+ #' }
1135
+ #' @param smooth.par
1136
+ #' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
1137
+ #' Use \code{NULL} to automatically determine this value using leave-one-out
1138
+ #' cross-validation (see \code{smooth.spline} parameter \code{spar}).
1139
+ #' @param pseudocounts
1140
+ #' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
1141
+ #' 1 for add-one smoothing.
1142
+ #' @param max.deviation
1143
+ #' Maximum mean deviation between identity breaks tolerated (as percent
1144
+ #' identity). Difference in number of \code{id.breaks} is never tolerated.
1145
+ #'
1146
+ #' @author Luis M. Rodriguez-R [aut, cre]
1147
+ #'
1148
+ #' @export
1149
+
1150
+ enve.recplot2.compareIdentities <- function
1151
+ (x,
1152
+ y,
1153
+ method="hellinger",
1154
+ smooth.par=NULL,
1155
+ pseudocounts=0,
1156
+ max.deviation=0.75
1157
+ ){
1158
+ METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
1159
+ i.meth <- pmatch(method, METHODS)
1160
+ if (is.na(i.meth)) stop("Invalid distance ", method)
1161
+ if(!inherits(x, "enve.RecPlot2"))
1162
+ stop("'x' must inherit from class `enve.RecPlot2`")
1163
+ if(!inherits(y, "enve.RecPlot2"))
1164
+ stop("'y' must inherit from class `enve.RecPlot2`")
1165
+ if(length(x$id.breaks) != length(y$id.breaks))
1166
+ stop("'x' and 'y' must have the same number of `id.breaks`")
1167
+ dev <- mean(abs(x$id.breaks - y$id.breaks))
1168
+ if(dev > max.deviation)
1169
+ stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
1170
+ dev)
1171
+ x.cnt <- x$id.counts
1172
+ y.cnt <- y$id.counts
1173
+ if(is.null(smooth.par) || smooth.par > 0){
1174
+ x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
1175
+ y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
1176
+ p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
1177
+ q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
1178
+ x.cnt <- pmax(p.spline$y, 0)
1179
+ y.cnt <- pmax(q.spline$y, 0)
1180
+ }
1181
+ a <- as.numeric(pseudocounts)
1182
+ p <- (x.cnt + a) / sum(x.cnt + a)
1183
+ q <- (y.cnt + a) / sum(y.cnt + a)
1184
+ d <- NA
1185
+ if(i.meth %in% c(1L, 2L)){
1186
+ d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
1187
+ if(i.meth==2L) d <- 1 - d**2
1188
+ }else if(i.meth %in% c(3L, 4L)){
1189
+ sel <- p>0
1190
+ if(any(q[sel]==0))
1191
+ stop("Undefined distance without absolute continuity, use pseudocounts")
1192
+ d <- -sum(p[sel]*log(q[sel]/p[sel]))
1193
+ }else if(i.meth == 5L){
1194
+ d <- sqrt(sum((q-p)**2))
1195
+ }
1196
+ return(d)
1197
+ }
1198
+
1199
+ #' Enveomics: Recruitment Plot (2) Coordinates
1200
+ #'
1201
+ #' Returns the sequence name and coordinates of the requested position bins.
1202
+ #'
1203
+ #' @param x
1204
+ #' \code{\link{enve.RecPlot2}} object.
1205
+ #' @param bins
1206
+ #' Vector of selected bins to return. It can be a vector of logical values
1207
+ #' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
1208
+ #' missing, returns the coordinates of all windows.
1209
+ #'
1210
+ #' @return
1211
+ #' Returns a data.frame with five columns: \code{name.from} (character),
1212
+ #' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
1213
+ #' (numeric), and \code{seq.name} (character).
1214
+ #' The first two correspond to sequence and position of the start point of the
1215
+ #' bin. The next two correspond to the sequence and position of the end point of
1216
+ #' the bin. The last one indicates the name of the sequence (if defined).
1217
+ #'
1218
+ #' @author Luis M. Rodriguez-R [aut, cre]
1219
+ #'
1220
+ #' @export
1221
+
1222
+ enve.recplot2.coordinates <- function
1223
+ (x,
1224
+ bins
1225
+ ){
1226
+ if(!inherits(x, "enve.RecPlot2"))
1227
+ stop("'x' must inherit from class `enve.RecPlot2`")
1228
+ if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
1229
+ if(!is.vector(bins)) stop("'bins' must be a vector")
1230
+ if(inherits(bins, "logical")) bins <- which(bins)
1231
+
1232
+ y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
1233
+
1234
+ for(i in 1:length(bins)){
1235
+ j <- bins[i]
1236
+ # Concatenated coordinates
1237
+ cc <- x$pos.breaks[c(j, j+1)]
1238
+ # Find the corresponding `seq.breaks`
1239
+ sb.from <- which(
1240
+ cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
1241
+ cc[1] < x$seq.breaks[-1])
1242
+ sb.to <- which(
1243
+ cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
1244
+ cc[2] <= x$seq.breaks[-1])
1245
+ # Translate coordinates
1246
+ if(length(sb.from)==1 & length(sb.to)==1){
1247
+ y[i, 'name.from'] <- x$seq.names[sb.from]
1248
+ y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
1249
+ y[i, 'name.to'] <- x$seq.names[sb.to]
1250
+ y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
1251
+ y[i, 'seq.name'] <- x$pos.names[i]
1252
+ }
1253
+ }
1254
+
1255
+ return(y)
1256
+ }
1257
+
1258
+ #' Enveomics: Recruitment Plot (2) Sequencing Depth
1259
+ #'
1260
+ #' Calculate the sequencing depth of the given window(s).
1261
+ #'
1262
+ #' @param x
1263
+ #' \code{\link{enve.RecPlot2}} object.
1264
+ #' @param sel
1265
+ #' Window(s) for which the sequencing depth is to be calculated. If not
1266
+ #' passed, it returns the sequencing depth of all windows.
1267
+ #' @param low.identity
1268
+ #' A logical indicating if the sequencing depth is to be estimated only
1269
+ #' with low-identity matches. By default, only high-identity matches are
1270
+ #' used.
1271
+ #'
1272
+ #' @return
1273
+ #' Returns a numeric vector of sequencing depths (in bp/bp).
1274
+ #'
1275
+ #' @author Luis M. Rodriguez-R [aut, cre]
1276
+ #'
1277
+ #' @export
1278
+
1279
+ enve.recplot2.seqdepth <- function
1280
+
1281
+ (x,
1282
+ sel,
1283
+ low.identity=FALSE
1284
+ ){
1285
+ if(!inherits(x, "enve.RecPlot2"))
1286
+ stop("'x' must inherit from class `enve.RecPlot2`")
1287
+ if(low.identity){
1288
+ pos.cnts.in <- x$pos.counts.out
1289
+ }else{
1290
+ pos.cnts.in <- x$pos.counts.in
1291
+ }
1292
+ pos.breaks <- x$pos.breaks
1293
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
1294
+ seqdepth.in <- pos.cnts.in/pos.binsize
1295
+ if(missing(sel)) return(seqdepth.in)
1296
+ return(seqdepth.in[sel])
1297
+ }
1298
+
1299
+ #' Enveomics: Recruitment Plot (2) ANI Estimate
1300
+ #'
1301
+ #' Estimate the Average Nucleotide Identity from reads (ANIr) from a
1302
+ #' recruitment plot.
1303
+ #'
1304
+ #' @param x
1305
+ #' \code{\link{enve.RecPlot2}} object.
1306
+ #' @param range
1307
+ #' Range of identities to be considered. By default, the full range
1308
+ #' is used (note that the upper boundary is \code{Inf} and not 100 because
1309
+ #' recruitment plots can also be built with bit-scores). To use only
1310
+ #' intra-population matches (with identities), use c(95,100). To use only
1311
+ #' inter-population values, use c(0,95).
1312
+ #'
1313
+ #' @author Luis M. Rodriguez-R [aut, cre]
1314
+ #'
1315
+ #' @export
1316
+
1317
+ enve.recplot2.ANIr <- function
1318
+ (x,
1319
+ range=c(0,Inf)
1320
+ ){
1321
+ if(!inherits(x, "enve.RecPlot2"))
1322
+ stop("'x' must inherit from class `enve.RecPlot2`")
1323
+ id.b <- x$id.breaks
1324
+ id <- (id.b[-1]+id.b[-length(id.b)])/2
1325
+ cnt <- x$id.counts
1326
+ cnt[id < range[1]] <- 0
1327
+ cnt[id > range[2]] <- 0
1328
+ return(sum(id*cnt/sum(cnt)))
1329
+ }
1330
+
1331
+ #==============> Define internal functions
1332
+
1333
+ #' Enveomics: Recruitment Plot (2) Internal Ancillary Function
1334
+ #'
1335
+ #' Internal ancillary function (see \code{\link{enve.recplot2}}).
1336
+ #'
1337
+ #' @param x \code{\link{enve.RecPlot2}} object
1338
+ #' @param pos.breaks Position breaks
1339
+ #' @param id.breaks Identity breaks
1340
+ #' @param rec.idcol Identity column to use
1341
+ #'
1342
+ #' @author Luis M. Rodriguez-R [aut, cre]
1343
+ #' @author Kenji Gerhardt [aut]
1344
+ #'
1345
+ #' @export
1346
+
1347
+ enve.recplot2.__counts <- function
1348
+ (x, pos.breaks, id.breaks, rec.idcol) {
1349
+ rec2 <- x$rec
1350
+ verbose <- x$verbose
1351
+
1352
+ # get counts of how many occurrences of each genome pos.bin there are per read
1353
+ x.bins <- mapply(
1354
+ function(start, end) {
1355
+ list(rle(findInterval(start:end, pos.breaks, left.open = T)))
1356
+ }, rec2[, 1], rec2[, 2])
1357
+
1358
+ # find the single y bin for each row, replicates it at the correct places to
1359
+ # the number of distinct bins found in its row
1360
+ y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
1361
+ times = unlist(lapply(x.bins, function(a) length(a$lengths))))
1362
+
1363
+ # x.bins_counts is the number of occurrences of each bin a row contains,
1364
+ # per row, then unlisted
1365
+ x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
1366
+
1367
+ # these are the pos. in. genome bins that each count in x.bins_counts falls into
1368
+ x.bins <- unlist(lapply(x.bins, function(a) a$values))
1369
+
1370
+ # much more efficient counts implementation in R using lists instead of a matrix:
1371
+ counts <- lapply(
1372
+ 1:(length(pos.breaks) - 1),
1373
+ function(col_len) rep(0, length(id.breaks) - 1))
1374
+
1375
+ # accesses the correct list in counts by x.bin, then
1376
+ # accesses the position in that row by y.bins and adds the new count
1377
+ for(i in 1:length(x.bins)){
1378
+ counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
1379
+ }
1380
+
1381
+ counts <- do.call(rbind, counts)
1382
+ return(counts)
1383
+ }
1384
+
1385
+ #' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
1386
+ #'
1387
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
1388
+ #'
1389
+ #' @param x \code{\link{enve.RecPlot2}} object
1390
+ #' @param comp Components
1391
+ #' @param do_crit Function estimating the criterion
1392
+ #' @param best Best solution thus far
1393
+ #' @param verbose If verbose
1394
+ #' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
1395
+ #'
1396
+ #' @author Luis M. Rodriguez-R [aut, cre]
1397
+ #'
1398
+ #' @export
1399
+
1400
+ enve.recplot2.findPeaks.__emauto_one <- function
1401
+ (x, comp, do_crit, best, verbose, ...){
1402
+ peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
1403
+ if(length(peaks)==0) return(best)
1404
+ k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
1405
+ crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
1406
+ if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
1407
+ ifelse(crit > best[['crit']], '*', ''), '\n')
1408
+ if(crit > best[['crit']]){
1409
+ best[['crit']] <- crit
1410
+ best[['peaks']] <- peaks
1411
+ }
1412
+ best[['pstore']][[comp]] <- peaks
1413
+ return(best)
1414
+ }
1415
+
1416
+ #' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
1417
+ #'
1418
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1419
+ #'
1420
+ #' @param x Vector of log-transformed sequencing depths
1421
+ #' @param theta Parameters list
1422
+ #'
1423
+ #' @author Luis M. Rodriguez-R [aut, cre]
1424
+ #'
1425
+ #' @export
1426
+
1427
+ enve.recplot2.findPeaks.__em_e <- function
1428
+ (x, theta){
1429
+ components <- length(theta[['mu']])
1430
+ product <- do.call(cbind,
1431
+ lapply(1:components,
1432
+ function(i) dnorm(x, theta[['mu']][i],
1433
+ theta[['sd']][i])*theta[['alpha']][i]))
1434
+ sum.of.components <- rowSums(product)
1435
+ posterior <- product / sum.of.components
1436
+ for(i in which(sum.of.components == Inf)) {
1437
+ cat(i,'/',nrow(product), ':', product[i,], '\n')
1438
+ }
1439
+
1440
+ return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1441
+ }
1442
+
1443
+ #' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
1444
+ #'
1445
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1446
+ #'
1447
+ #' @param x Vector of log-transformed sequencing depths
1448
+ #' @param posterior Posterior probability
1449
+ #'
1450
+ #' @author Luis M. Rodriguez-R [aut, cre]
1451
+ #'
1452
+ #' @export
1453
+
1454
+ enve.recplot2.findPeaks.__em_m <- function
1455
+ (x, posterior){
1456
+ components <- ncol(posterior)
1457
+ n <- colSums(posterior)
1458
+ mu <- colSums(posterior * x) / n
1459
+ sd <- sqrt( colSums(
1460
+ posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1461
+ alpha <- n/length(x)
1462
+ return(list(mu=mu, sd=sd, alpha=alpha))
1463
+ }
1464
+
1465
+ #' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
1466
+ #'
1467
+ #' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
1468
+ #'
1469
+ #' @param x \code{\link{enve.RecPlot2.Peak}} object
1470
+ #' @param mids Midpoints
1471
+ #' @param counts Counts
1472
+ #'
1473
+ #' @author Luis M. Rodriguez-R [aut, cre]
1474
+ #'
1475
+ #' @export
1476
+
1477
+ enve.recplot2.__peakHist <- function
1478
+ (x, mids, counts=TRUE){
1479
+ d.o <- x$param.hat
1480
+ if(length(x$log)==0) x$log <- FALSE
1481
+ if(x$log){
1482
+ d.o$x <- log(mids)
1483
+ }else{
1484
+ d.o$x <- mids
1485
+ }
1486
+ prob <- do.call(paste('d', x$dist, sep=''), d.o)
1487
+ if(!counts) return(prob)
1488
+ if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
1489
+ return(prob*x$n.hat/sum(prob))
1490
+ }
1491
+
1492
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
1493
+ #'
1494
+ #' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1495
+ #'
1496
+ #' @param lsd1 Vector of log-transformed sequencing depths
1497
+ #' @param min.points Minimum number of points
1498
+ #' @param quant.est Quantile estimate
1499
+ #' @param mlv.opts List of options for \code{mlv}
1500
+ #' @param fitdist.opts List of options for \code{fitdist}
1501
+ #' @param with.skewness If skewed-normal should be used
1502
+ #' @param optim.rounds Maximum number of optimization rounds
1503
+ #' @param optim.epsilon Minimum difference considered negligible
1504
+ #' @param n.total Global number of windows
1505
+ #' @param merge.logdist Attempted \code{merge.logdist} parameter
1506
+ #' @param verbose If verbose
1507
+ #' @param log If log-transformed depths
1508
+ #'
1509
+ #' @author Luis M. Rodriguez-R [aut, cre]
1510
+ #'
1511
+ #' @export
1512
+
1513
+ enve.recplot2.findPeaks.__mow_one <- function
1514
+ (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
1515
+ optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
1516
+ ){
1517
+ dist <- ifelse(with.skewness, 'sn', 'norm');
1518
+
1519
+ # Find peak
1520
+ o <- mlv.opts; o$x = lsd1;
1521
+ mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1522
+ if(verbose) cat('Anchoring at mode =',mode1,'\n')
1523
+ param.hat <- fitdist.opts$start; last.hat <- param.hat;
1524
+ lim <- NA;
1525
+ if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
1526
+
1527
+ # Refine peak parameters
1528
+ for(round in 1:optim.rounds){
1529
+ param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
1530
+ lim.o <- param.hat
1531
+ lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
1532
+ lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
1533
+ if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
1534
+ 'and params =',as.numeric(param.hat),' \r')
1535
+ if(length(lsd1.pop) < min.points) break;
1536
+ o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
1537
+ last.last.hat <- last.hat
1538
+ last.hat <- param.hat
1539
+ param.hat <- as.list(do.call(fitdist, o)$estimate);
1540
+ if(any(is.na(param.hat))){
1541
+ if(round>1) param.hat <- last.hat;
1542
+ break;
1543
+ }
1544
+ if(round > 1){
1545
+ epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1546
+ if(epsilon1 < optim.epsilon) break;
1547
+ if(round > 2){
1548
+ epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1549
+ if(epsilon2 < optim.epsilon) break;
1550
+ }
1551
+ }
1552
+ }
1553
+ if(verbose) cat('\n')
1554
+ if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
1555
+
1556
+ # Mow distribution
1557
+ lsd2 <- c();
1558
+ lsd.pop <- c();
1559
+ n.hat <- length(lsd1.pop)/diff(quant.est)
1560
+ peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
1561
+ param.hat=param.hat, n.hat=n.hat, n.total=n.total,
1562
+ merge.logdist=merge.logdist, log=log)
1563
+ peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
1564
+ peak.cnt <- enve.recplot2.__peakHist(peak,
1565
+ (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
1566
+ for(i in 2:length(peak.breaks)){
1567
+ values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
1568
+ n.exp <- peak.cnt[i-1]
1569
+ if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
1570
+ if(length(values)==0) next
1571
+ in.peak <- runif(length(values)) <= n.exp/length(values)
1572
+ lsd2 <- c(lsd2, values[!in.peak])
1573
+ lsd.pop <- c(lsd.pop, values[in.peak])
1574
+ }
1575
+ if(length(lsd.pop) < min.points) return(NULL)
1576
+
1577
+ # Return peak
1578
+ attr(peak, 'values') <- lsd.pop
1579
+ attr(peak, 'values.res') <- lsd2
1580
+ attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
1581
+ plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
1582
+ plot=FALSE)$counts)+1)/2
1583
+ mu <- tail(param.hat, n=1)
1584
+ attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
1585
+ if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
1586
+ 'with expected n =',n.hat,'\n')
1587
+ return(peak)
1588
+ }
1589
+
1590
+ #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
1591
+ #'
1592
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1593
+ #'
1594
+ #' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
1595
+ #'
1596
+ #' @author Luis M. Rodriguez-R [aut, cre]
1597
+ #'
1598
+ #' @export
1599
+
1600
+ enve.recplot2.findPeaks.__mower <- function
1601
+ (peaks.opts){
1602
+ peaks <- list()
1603
+ while(length(peaks.opts$lsd1) > peaks.opts$min.points){
1604
+ peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
1605
+ if(is.null(peak)) break
1606
+ peaks[[ length(peaks)+1 ]] <- peak
1607
+ peaks.opts$lsd1 <- peak$values.res
1608
+ }
1609
+ return(peaks)
1610
+ }
1611
+
1612
+ #' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
1613
+ #'
1614
+ #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
1615
+ #'
1616
+ #' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
1617
+ #' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
1618
+ #'
1619
+ #' @author Luis M. Rodriguez-R [aut, cre]
1620
+ #'
1621
+ #' @export
1622
+
1623
+ enve.recplot2.__whichClosestPeak <- function
1624
+ (peak, peaks){
1625
+ dist <- as.numeric(lapply(peaks,
1626
+ function(x)
1627
+ abs(log(x$param.hat[[ length(x$param.hat) ]] /
1628
+ peak$param.hat[[ length(peak$param.hat) ]] ))))
1629
+ dist[ dist==0 ] <- Inf
1630
+ return(which.min(dist))
1631
+ }