miga-base 1.2.15.2 → 1.2.15.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (306) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/download/gtdb.rb +4 -1
  3. data/lib/miga/cli/action/gtdb_get.rb +4 -0
  4. data/lib/miga/daemon.rb +4 -1
  5. data/lib/miga/lair.rb +6 -4
  6. data/lib/miga/remote_dataset/download.rb +3 -2
  7. data/lib/miga/remote_dataset.rb +25 -7
  8. data/lib/miga/taxonomy.rb +6 -0
  9. data/lib/miga/version.rb +2 -2
  10. metadata +6 -302
  11. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +0 -41964
  12. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +0 -32439
  13. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -62056
  14. data/utils/FastAAI/FastAAI +0 -3659
  15. data/utils/FastAAI/FastAAI-legacy/FastAAI +0 -1336
  16. data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +0 -1296
  17. data/utils/FastAAI/README.md +0 -84
  18. data/utils/enveomics/Docs/recplot2.md +0 -244
  19. data/utils/enveomics/Examples/aai-matrix.bash +0 -66
  20. data/utils/enveomics/Examples/ani-matrix.bash +0 -66
  21. data/utils/enveomics/Examples/essential-phylogeny.bash +0 -105
  22. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +0 -100
  23. data/utils/enveomics/LICENSE.txt +0 -73
  24. data/utils/enveomics/Makefile +0 -52
  25. data/utils/enveomics/Manifest/Tasks/aasubs.json +0 -103
  26. data/utils/enveomics/Manifest/Tasks/blasttab.json +0 -790
  27. data/utils/enveomics/Manifest/Tasks/distances.json +0 -161
  28. data/utils/enveomics/Manifest/Tasks/fasta.json +0 -802
  29. data/utils/enveomics/Manifest/Tasks/fastq.json +0 -291
  30. data/utils/enveomics/Manifest/Tasks/graphics.json +0 -126
  31. data/utils/enveomics/Manifest/Tasks/mapping.json +0 -137
  32. data/utils/enveomics/Manifest/Tasks/ogs.json +0 -382
  33. data/utils/enveomics/Manifest/Tasks/other.json +0 -906
  34. data/utils/enveomics/Manifest/Tasks/remote.json +0 -355
  35. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +0 -650
  36. data/utils/enveomics/Manifest/Tasks/tables.json +0 -308
  37. data/utils/enveomics/Manifest/Tasks/trees.json +0 -68
  38. data/utils/enveomics/Manifest/Tasks/variants.json +0 -111
  39. data/utils/enveomics/Manifest/categories.json +0 -165
  40. data/utils/enveomics/Manifest/examples.json +0 -162
  41. data/utils/enveomics/Manifest/tasks.json +0 -4
  42. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
  43. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
  44. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
  45. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
  46. data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
  47. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
  48. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
  49. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
  50. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
  51. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
  52. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
  53. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
  54. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
  55. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
  56. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
  57. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
  58. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
  59. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
  60. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
  61. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
  62. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
  63. data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
  64. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
  65. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
  66. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
  67. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
  68. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
  69. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
  70. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
  71. data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
  72. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
  73. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
  74. data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
  75. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
  76. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
  77. data/utils/enveomics/README.md +0 -42
  78. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +0 -171
  79. data/utils/enveomics/Scripts/Aln.cat.rb +0 -221
  80. data/utils/enveomics/Scripts/Aln.convert.pl +0 -35
  81. data/utils/enveomics/Scripts/AlphaDiversity.pl +0 -152
  82. data/utils/enveomics/Scripts/BedGraph.tad.rb +0 -93
  83. data/utils/enveomics/Scripts/BedGraph.window.rb +0 -71
  84. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +0 -102
  85. data/utils/enveomics/Scripts/BlastTab.addlen.rb +0 -63
  86. data/utils/enveomics/Scripts/BlastTab.advance.bash +0 -48
  87. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +0 -55
  88. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +0 -104
  89. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +0 -76
  90. data/utils/enveomics/Scripts/BlastTab.filter.pl +0 -47
  91. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +0 -194
  92. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +0 -104
  93. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +0 -157
  94. data/utils/enveomics/Scripts/BlastTab.recplot2.R +0 -48
  95. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +0 -86
  96. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +0 -119
  97. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +0 -86
  98. data/utils/enveomics/Scripts/BlastTab.subsample.pl +0 -47
  99. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +0 -114
  100. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +0 -90
  101. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +0 -123
  102. data/utils/enveomics/Scripts/Chao1.pl +0 -97
  103. data/utils/enveomics/Scripts/CharTable.classify.rb +0 -234
  104. data/utils/enveomics/Scripts/EBIseq2tax.rb +0 -83
  105. data/utils/enveomics/Scripts/FastA.N50.pl +0 -60
  106. data/utils/enveomics/Scripts/FastA.extract.rb +0 -152
  107. data/utils/enveomics/Scripts/FastA.filter.pl +0 -52
  108. data/utils/enveomics/Scripts/FastA.filterLen.pl +0 -28
  109. data/utils/enveomics/Scripts/FastA.filterN.pl +0 -60
  110. data/utils/enveomics/Scripts/FastA.fragment.rb +0 -100
  111. data/utils/enveomics/Scripts/FastA.gc.pl +0 -42
  112. data/utils/enveomics/Scripts/FastA.interpose.pl +0 -93
  113. data/utils/enveomics/Scripts/FastA.length.pl +0 -38
  114. data/utils/enveomics/Scripts/FastA.mask.rb +0 -89
  115. data/utils/enveomics/Scripts/FastA.per_file.pl +0 -36
  116. data/utils/enveomics/Scripts/FastA.qlen.pl +0 -57
  117. data/utils/enveomics/Scripts/FastA.rename.pl +0 -65
  118. data/utils/enveomics/Scripts/FastA.revcom.pl +0 -23
  119. data/utils/enveomics/Scripts/FastA.sample.rb +0 -98
  120. data/utils/enveomics/Scripts/FastA.slider.pl +0 -85
  121. data/utils/enveomics/Scripts/FastA.split.pl +0 -55
  122. data/utils/enveomics/Scripts/FastA.split.rb +0 -79
  123. data/utils/enveomics/Scripts/FastA.subsample.pl +0 -131
  124. data/utils/enveomics/Scripts/FastA.tag.rb +0 -65
  125. data/utils/enveomics/Scripts/FastA.toFastQ.rb +0 -69
  126. data/utils/enveomics/Scripts/FastA.wrap.rb +0 -48
  127. data/utils/enveomics/Scripts/FastQ.filter.pl +0 -54
  128. data/utils/enveomics/Scripts/FastQ.interpose.pl +0 -90
  129. data/utils/enveomics/Scripts/FastQ.maskQual.rb +0 -89
  130. data/utils/enveomics/Scripts/FastQ.offset.pl +0 -90
  131. data/utils/enveomics/Scripts/FastQ.split.pl +0 -53
  132. data/utils/enveomics/Scripts/FastQ.tag.rb +0 -70
  133. data/utils/enveomics/Scripts/FastQ.test-error.rb +0 -81
  134. data/utils/enveomics/Scripts/FastQ.toFastA.awk +0 -24
  135. data/utils/enveomics/Scripts/GFF.catsbj.pl +0 -127
  136. data/utils/enveomics/Scripts/GenBank.add_fields.rb +0 -84
  137. data/utils/enveomics/Scripts/HMM.essential.rb +0 -351
  138. data/utils/enveomics/Scripts/HMM.haai.rb +0 -168
  139. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +0 -83
  140. data/utils/enveomics/Scripts/JPlace.distances.rb +0 -88
  141. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +0 -320
  142. data/utils/enveomics/Scripts/M5nr.getSequences.rb +0 -81
  143. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +0 -198
  144. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +0 -35
  145. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +0 -49
  146. data/utils/enveomics/Scripts/NCBIacc2tax.rb +0 -92
  147. data/utils/enveomics/Scripts/Newick.autoprune.R +0 -27
  148. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +0 -228
  149. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +0 -32
  150. data/utils/enveomics/Scripts/RefSeq.download.bash +0 -48
  151. data/utils/enveomics/Scripts/SRA.download.bash +0 -55
  152. data/utils/enveomics/Scripts/TRIBS.plot-test.R +0 -36
  153. data/utils/enveomics/Scripts/TRIBS.test.R +0 -39
  154. data/utils/enveomics/Scripts/Table.barplot.R +0 -31
  155. data/utils/enveomics/Scripts/Table.df2dist.R +0 -30
  156. data/utils/enveomics/Scripts/Table.filter.pl +0 -61
  157. data/utils/enveomics/Scripts/Table.merge.pl +0 -77
  158. data/utils/enveomics/Scripts/Table.prefScore.R +0 -60
  159. data/utils/enveomics/Scripts/Table.replace.rb +0 -69
  160. data/utils/enveomics/Scripts/Table.round.rb +0 -63
  161. data/utils/enveomics/Scripts/Table.split.pl +0 -57
  162. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +0 -227
  163. data/utils/enveomics/Scripts/VCF.KaKs.rb +0 -147
  164. data/utils/enveomics/Scripts/VCF.SNPs.rb +0 -88
  165. data/utils/enveomics/Scripts/aai.rb +0 -421
  166. data/utils/enveomics/Scripts/ani.rb +0 -362
  167. data/utils/enveomics/Scripts/anir.rb +0 -137
  168. data/utils/enveomics/Scripts/clust.rand.rb +0 -102
  169. data/utils/enveomics/Scripts/gi2tax.rb +0 -103
  170. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +0 -96
  171. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  172. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  173. data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
  174. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +0 -293
  175. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +0 -175
  176. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +0 -24
  177. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +0 -17
  178. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +0 -30
  179. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +0 -253
  180. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +0 -88
  181. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +0 -182
  182. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +0 -49
  183. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +0 -74
  184. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +0 -237
  185. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +0 -31
  186. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +0 -152
  187. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +0 -3
  188. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +0 -74
  189. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +0 -135
  190. data/utils/enveomics/Scripts/ogs.annotate.rb +0 -88
  191. data/utils/enveomics/Scripts/ogs.core-pan.rb +0 -160
  192. data/utils/enveomics/Scripts/ogs.extract.rb +0 -125
  193. data/utils/enveomics/Scripts/ogs.mcl.rb +0 -186
  194. data/utils/enveomics/Scripts/ogs.rb +0 -104
  195. data/utils/enveomics/Scripts/ogs.stats.rb +0 -131
  196. data/utils/enveomics/Scripts/rbm-legacy.rb +0 -172
  197. data/utils/enveomics/Scripts/rbm.rb +0 -108
  198. data/utils/enveomics/Scripts/sam.filter.rb +0 -148
  199. data/utils/enveomics/Tests/Makefile +0 -10
  200. data/utils/enveomics/Tests/Mgen_M2288.faa +0 -3189
  201. data/utils/enveomics/Tests/Mgen_M2288.fna +0 -8282
  202. data/utils/enveomics/Tests/Mgen_M2321.fna +0 -8288
  203. data/utils/enveomics/Tests/Nequ_Kin4M.faa +0 -2970
  204. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  205. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +0 -7
  206. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +0 -17
  207. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +0 -137
  208. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +0 -123
  209. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +0 -200
  210. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +0 -55
  211. data/utils/enveomics/Tests/alkB.nwk +0 -1
  212. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +0 -13
  213. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +0 -17
  214. data/utils/enveomics/Tests/hiv1.faa +0 -59
  215. data/utils/enveomics/Tests/hiv1.fna +0 -134
  216. data/utils/enveomics/Tests/hiv2.faa +0 -70
  217. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +0 -233
  218. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +0 -1
  219. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +0 -233
  220. data/utils/enveomics/Tests/phyla_counts.tsv +0 -10
  221. data/utils/enveomics/Tests/primate_lentivirus.ogs +0 -11
  222. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +0 -9
  223. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +0 -8
  224. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +0 -6
  225. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +0 -9
  226. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +0 -6
  227. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +0 -6
  228. data/utils/enveomics/build_enveomics_r.bash +0 -45
  229. data/utils/enveomics/enveomics.R/DESCRIPTION +0 -31
  230. data/utils/enveomics/enveomics.R/NAMESPACE +0 -39
  231. data/utils/enveomics/enveomics.R/R/autoprune.R +0 -155
  232. data/utils/enveomics/enveomics.R/R/barplot.R +0 -184
  233. data/utils/enveomics/enveomics.R/R/cliopts.R +0 -135
  234. data/utils/enveomics/enveomics.R/R/df2dist.R +0 -154
  235. data/utils/enveomics/enveomics.R/R/growthcurve.R +0 -331
  236. data/utils/enveomics/enveomics.R/R/prefscore.R +0 -79
  237. data/utils/enveomics/enveomics.R/R/recplot.R +0 -354
  238. data/utils/enveomics/enveomics.R/R/recplot2.R +0 -1631
  239. data/utils/enveomics/enveomics.R/R/tribs.R +0 -583
  240. data/utils/enveomics/enveomics.R/R/utils.R +0 -80
  241. data/utils/enveomics/enveomics.R/README.md +0 -81
  242. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  243. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  244. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -16
  245. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -16
  246. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -16
  247. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +0 -25
  248. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +0 -46
  249. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +0 -23
  250. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +0 -47
  251. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +0 -23
  252. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +0 -23
  253. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +0 -40
  254. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +0 -103
  255. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +0 -67
  256. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +0 -24
  257. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +0 -19
  258. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +0 -45
  259. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +0 -44
  260. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +0 -47
  261. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +0 -75
  262. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +0 -50
  263. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +0 -44
  264. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +0 -139
  265. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +0 -45
  266. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +0 -24
  267. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +0 -77
  268. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +0 -25
  269. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +0 -21
  270. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +0 -19
  271. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +0 -19
  272. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +0 -47
  273. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +0 -29
  274. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +0 -18
  275. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +0 -45
  276. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -36
  277. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +0 -19
  278. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +0 -19
  279. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +0 -27
  280. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +0 -52
  281. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +0 -17
  282. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +0 -51
  283. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +0 -43
  284. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +0 -82
  285. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +0 -59
  286. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +0 -27
  287. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +0 -36
  288. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +0 -23
  289. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +0 -68
  290. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +0 -28
  291. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +0 -27
  292. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +0 -14
  293. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +0 -13
  294. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +0 -78
  295. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +0 -46
  296. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +0 -45
  297. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +0 -125
  298. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +0 -19
  299. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +0 -19
  300. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +0 -19
  301. data/utils/enveomics/globals.mk +0 -8
  302. data/utils/enveomics/manifest.json +0 -9
  303. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  304. data/utils/multitrim/README.md +0 -67
  305. data/utils/multitrim/multitrim.py +0 -1555
  306. data/utils/multitrim/multitrim.yml +0 -13
@@ -1,1631 +0,0 @@
1
- #==============> Define S4 classes
2
-
3
- #' Enveomics: Recruitment Plot (2) - S4 Class
4
- #'
5
- #' Enve-omics representation of Recruitment plots. This object can
6
- #' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
7
- #'
8
- #' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
9
- #' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
10
- #' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
11
- #' @slot id.counts \code{(numeric)} Counts per ID bin.
12
- #' @slot id.breaks \code{(numeric)} Breaks of identity bins.
13
- #' @slot pos.breaks \code{(numeric)} Breaks of position bins.
14
- #' @slot pos.names \code{(character)} Names of the position bins.
15
- #' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
16
- #' @slot peaks \code{(list)} Peaks identified in the recplot.
17
- #' Limits of the subject sequences after concatenation.
18
- #' @slot seq.names \code{(character}) Names of the subject sequences.
19
- #' @slot id.metric \code{(character}) Metric used as 'identity'.
20
- #' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
21
- #' @slot call \code{(call)} Call producing this object.
22
- #'
23
- #' @author Luis M. Rodriguez-R [aut, cre]
24
- #'
25
- #' @exportClass
26
-
27
- enve.RecPlot2 <- setClass("enve.RecPlot2",
28
- representation(
29
- # slots = list(
30
- counts='matrix',
31
- pos.counts.in='numeric',
32
- pos.counts.out='numeric',
33
- id.counts='numeric',
34
- id.breaks='numeric',
35
- pos.breaks='numeric',
36
- pos.names='character',
37
- seq.breaks='numeric',
38
- peaks='list',
39
- seq.names='character',
40
- id.metric='character',
41
- id.ingroup='logical',
42
- call='call')
43
- ,package='enveomics.R'
44
- );
45
-
46
- #' Enveomics: Recruitment Plot (2) Peak - S4 Class
47
- #'
48
- #' Enve-omics representation of a peak in the sequencing depth histogram
49
- #' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
50
- #'
51
- #' @slot dist \code{(character)}
52
- #' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
53
- #' (skew-normal).
54
- #' @slot values \code{(numeric)}
55
- #' Sequencing depth values predicted to conform the peak.
56
- #' @slot values.res \code{(numeric)}
57
- #' Sequencing depth values not explained by this or previously identified
58
- #' peaks.
59
- #' @slot mode \code{(numeric)}
60
- #' Seed-value of mode anchoring the peak.
61
- #' @slot param.hat \code{(list)}
62
- #' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
63
- #' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
64
- #' xi=location). Note that the "dispersion" parameter is always first and
65
- #' the "location" parameter is always last.
66
- #' @slot n.hat \code{(numeric)}
67
- #' Number of bins estimated to be explained by this peak. This should
68
- #' ideally be equal to the length of \code{values}, but it's not an integer.
69
- #' @slot n.total \code{(numeric)}
70
- #' Total number of bins from which the peak was extracted. I.e., total
71
- #' number of position bins with non-zero sequencing depth in the recruitment
72
- #' plot (regardless of peak count).
73
- #' @slot err.res \code{(numeric)}
74
- #' Error left after adding the peak (mower) or log-likelihood (em or emauto).
75
- #' @slot merge.logdist \code{(numeric)}
76
- #' Attempted \code{merge.logdist} parameter.
77
- #' @slot seq.depth \code{(numeric)}
78
- #' Best estimate available for the sequencing depth of the peak (centrality).
79
- #' @slot log \code{(logical)}
80
- #' Indicates if the estimation was performed in natural logarithm space.
81
- #'
82
- #' @author Luis M. Rodriguez-R [aut, cre]
83
- #'
84
- #' @exportClass
85
-
86
- enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
87
- representation(
88
- # slots = list(
89
- dist='character',
90
- values='numeric',
91
- values.res='numeric',
92
- mode='numeric',
93
- param.hat='list',
94
- n.hat='numeric',
95
- n.total='numeric',
96
- err.res='numeric',
97
- merge.logdist='numeric',
98
- seq.depth='numeric',
99
- log='logical'
100
- ));
101
-
102
- #' Attribute accessor
103
- #'
104
- #'
105
- #' @param x Object
106
- #' @param name Attribute name
107
- setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
108
-
109
- #' Attribute accessor
110
- #'
111
- #'
112
- #' @param x Object
113
- #' @param name Attribute name
114
- setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
115
-
116
- #==============> Define S4 methods
117
-
118
- #' Enveomics: Recruitment Plot (2)
119
- #'
120
- #' Plots an \code{\link{enve.RecPlot2}} object.
121
- #'
122
- #' @param x
123
- #' \code{\link{enve.RecPlot2}} object to plot.
124
- #' @param layout
125
- #' Matrix indicating the position of the different panels in the layout,
126
- #' where:
127
- #' \itemize{
128
- #' \item 0: Empty space
129
- #' \item 1: Counts matrix
130
- #' \item 2: position histogram (sequencing depth)
131
- #' \item 3: identity histogram
132
- #' \item 4: Populations histogram (histogram of sequencing depths)
133
- #' \item 5: Color scale for the counts matrix (vertical)
134
- #' \item 6: Color scale of the counts matrix (horizontal)
135
- #' }
136
- #' Only panels indicated here will be plotted. To plot only one panel
137
- #' simply set this to the number of the panel you want to plot.
138
- #' @param panel.fun
139
- #' List of functions to be executed after drawing each panel. Use the
140
- #' indices in \code{layout} (as characters) as keys. Functions for indices
141
- #' missing in \code{layout} are ignored. For example, to add a vertical line
142
- #' at the 3Mbp mark in both the position histogram and the counts matrix:
143
- #' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
144
- #' Note that the X-axis in both panels is in Mbp by default. To change
145
- #' this behavior, set \code{pos.units} accordingly.
146
- #' @param widths
147
- #' Relative widths of the columns of \code{layout}.
148
- #' @param heights
149
- #' Relative heights of the rows of \code{layout}.
150
- #' @param palette
151
- #' Colors to be used to represent the counts matrix, sorted from no hits
152
- #' to the maximum sequencing depth.
153
- #' @param underlay.group
154
- #' If TRUE, it indicates the in-group and out-group areas couloured based
155
- #' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
156
- #' @param peaks.col
157
- #' If not \code{NA}, it attempts to represent peaks in the population histogram
158
- #' in the specified color. Set to \code{NA} to avoid peak-finding.
159
- #' @param use.peaks
160
- #' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
161
- #' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
162
- #' @param id.lim
163
- #' Limits of identities to represent.
164
- #' @param pos.lim
165
- #' Limits of positions to represent (in bp, regardless of \code{pos.units}).
166
- #' @param pos.units
167
- #' Units in which the positions should be represented (powers of 1,000
168
- #' base pairs).
169
- #' @param mar
170
- #' Margins of the panels as a list, with the character representation of
171
- #' the number of the panel as index (see \code{layout}).
172
- #' @param pos.splines
173
- #' Smoothing parameter for the splines in the position histogram. Zero
174
- #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
175
- #' cross-validation.
176
- #' @param id.splines
177
- #' Smoothing parameter for the splines in the identity histogram. Zero
178
- #' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
179
- #' cross-validation.
180
- #' @param in.lwd
181
- #' Line width for the sequencing depth of in-group matches.
182
- #' @param out.lwd
183
- #' Line width for the sequencing depth of out-group matches.
184
- #' @param id.lwd
185
- #' Line width for the identity histogram.
186
- #' @param in.col
187
- #' Color associated to in-group matches.
188
- #' @param out.col
189
- #' Color associated to out-group matches.
190
- #' @param id.col
191
- #' Color for the identity histogram.
192
- #' @param breaks.col
193
- #' Color of the vertical lines indicating sequence breaks.
194
- #' @param peaks.opts
195
- #' Options passed to \code{\link{enve.recplot2.findPeaks}},
196
- #' if \code{peaks.col} is not \code{NA}.
197
- #' @param ...
198
- #' Any other graphic parameters (currently ignored).
199
- #'
200
- #' @return
201
- #' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
202
- #' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
203
- #' \code{layout} doesn't include 4, returns \code{NA}.
204
- #'
205
- #' @author Luis M. Rodriguez-R [aut, cre]
206
- #'
207
- #' @method plot enve.RecPlot2
208
- #' @export
209
-
210
- plot.enve.RecPlot2 <- function
211
- (x,
212
- layout=matrix(c(5,5,2,1,4,3), nrow=2),
213
- panel.fun=list(),
214
- widths=c(1,7,2),
215
- heights=c(1,2),
216
- palette=grey((100:0)/100),
217
- underlay.group=TRUE,
218
- peaks.col='darkred',
219
- use.peaks,
220
- id.lim=range(x$id.breaks),
221
- pos.lim=range(x$pos.breaks),
222
- pos.units=c('Mbp','Kbp','bp'),
223
- mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
224
- '3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
225
- '4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
226
- '5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
227
- pos.splines=0,
228
- id.splines=1/2,
229
- in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
230
- out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
231
- id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
232
- in.col='darkblue',
233
- out.col='lightblue',
234
- id.col='black',
235
- breaks.col='#AAAAAA40',
236
- peaks.opts=list(),
237
- ...
238
- ){
239
- pos.units <- match.arg(pos.units);
240
- pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
241
- pos.lim <- pos.lim/pos.factor;
242
- lmat <- layout;
243
- for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
244
-
245
- layout(lmat, widths=widths, heights=heights);
246
- ori.mar <- par('mar');
247
-
248
- # Essential vars
249
- counts <- x$counts
250
-
251
- id.ingroup <- x$id.ingroup
252
- id.counts <- x$id.counts
253
- id.breaks <- x$id.breaks
254
- id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
255
- id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
256
-
257
- pos.counts.in <- x$pos.counts.in
258
- pos.counts.out <- x$pos.counts.out
259
- pos.breaks <- x$pos.breaks/pos.factor
260
- pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
261
- pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
262
-
263
- seqdepth.in <- pos.counts.in/pos.binsize
264
- seqdepth.out <- pos.counts.out/pos.binsize
265
- seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
266
- seqdepth.out[seqdepth.out>0]))*c(1/2,2)
267
-
268
- if(underlay.group){
269
- in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
270
- list(maxColorValue=256, alpha=62)));
271
- out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
272
- list(maxColorValue=256, alpha=52)));
273
- }
274
-
275
- # [1] Counts matrix
276
- if(any(layout==1)){
277
- par(mar=mar[['1']]);
278
- plot(1, t='n', bty='l',
279
- xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
280
- xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
281
- if(underlay.group){
282
- rect(pos.lim[1], id.lim[1], pos.lim[2],
283
- min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
284
- rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
285
- id.lim[2], col=in.bg, border=NA);
286
- }
287
- abline(v=x$seq.breaks/pos.factor, col=breaks.col);
288
- image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
289
- bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
290
- length.out=1+length(palette)), add=TRUE);
291
- if(exists('1',panel.fun)) panel.fun[['1']]();
292
- }
293
-
294
- # [2] Position histogram
295
- if(any(layout==2)){
296
- par(mar=mar[['2']]);
297
- if(any(layout==1)){
298
- xlab=''
299
- xaxt='n'
300
- }else{
301
- xlab=paste('Position in genome (',pos.units,')',sep='')
302
- xaxt='s'
303
- }
304
- plot(1,t='n', bty='l', log='y',
305
- xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
306
- ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
307
- abline(v=x$seq.breaks/pos.factor, col=breaks.col)
308
- pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
309
- pos.f <- rep(seqdepth.in,each=2)
310
- lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
311
- lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
312
- if(is.null(pos.splines) || pos.splines > 0){
313
- pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
314
- spar=pos.splines)
315
- lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
316
- }
317
- if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
318
- seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
319
- seqdepth.lim[1]*3/2, col=out.col, border=NA);
320
- if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
321
- seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
322
- seqdepth.lim[1]*3/2, col=in.col, border=NA);
323
- if(exists('2',panel.fun)) panel.fun[['2']]();
324
- }
325
-
326
- # [3] Identity histogram
327
- if(any(layout==3)){
328
- par(mar=mar[['3']]);
329
- if(any(layout==1)){
330
- ylab=''
331
- yaxt='n'
332
- }else{
333
- ylab=x$id.metric
334
- yaxt='s'
335
- }
336
- if(sum(id.counts>0) >= 4){
337
- id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
338
- plot(1,t='n', bty='l', log='x',
339
- xlim=id.counts.range, xlab='bps per bin', xaxs='i',
340
- ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
341
- if(underlay.group){
342
- rect(id.counts.range[1], id.lim[1], id.counts.range[2],
343
- min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
344
- rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
345
- id.counts.range[2], id.lim[2], col=in.bg, border=NA);
346
- }
347
- id.f <- rep(id.counts,each=2)
348
- id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
349
- lines(id.f, id.x, lwd=id.lwd, col=id.col);
350
- if(is.null(id.splines) || id.splines > 0){
351
- id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
352
- spar=id.splines)
353
- lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
354
- }
355
- }else{
356
- plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
357
- text(1,1,labels='Insufficient data', srt=90)
358
- }
359
- if(exists('3',panel.fun)) panel.fun[['3']]();
360
- }
361
-
362
- # [4] Populations histogram
363
- peaks <- NA;
364
- if(any(layout==4)){
365
- par(mar=mar[['4']]);
366
- if(any(layout==2)){
367
- ylab=''
368
- yaxt='n'
369
- }else{
370
- ylab='Sequencing depth (X)'
371
- yaxt='s'
372
- }
373
- h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
374
- length.out=200);
375
- h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
376
- h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
377
- plot(1, t='n', log='y',
378
- xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
379
- xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
380
- yaxt=yaxt)
381
- y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
382
- y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
383
- lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
384
- y.tmp.out, col=out.col)
385
- polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
386
- y.tmp.in, border=NA, col=in.col)
387
- if(!is.na(peaks.col)){
388
- o <- peaks.opts; o$x = x;
389
- if(missing(use.peaks)){
390
- peaks <- do.call(enve.recplot2.findPeaks, o)
391
- }else{
392
- peaks <- use.peaks
393
- }
394
- h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
395
- if(!is.null(peaks) & length(peaks)>0){
396
- pf <- h.mids*0;
397
- for(i in 1:length(peaks)){
398
- cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
399
- lines(cnt, h.mids, col='red');
400
- pf <- pf+cnt;
401
- axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
402
- }
403
- lines(pf, h.mids, col='red',lwd=1.5);
404
- dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
405
- frx <- signif(100*as.numeric(
406
- lapply(peaks,
407
- function(x) ifelse(length(x$values)==0, x$n.hat,
408
- length(x$values))/x$n.total)), 2)
409
- if(peaks[[1]]$err.res < 0){
410
- err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
411
- }else{
412
- err <- paste(', err:',
413
- signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
414
- }
415
- legend('topright', bty='n', cex=1/2,
416
- legend=paste(letters[1:length(peaks)],'. ',
417
- dpt,'X (', frx, '%', err, ')', sep=''))
418
- }
419
- }
420
- if(exists('4',panel.fun)) panel.fun[['4']]();
421
- }
422
-
423
- # [5] Color scale of the counts matrix (vertical)
424
- count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
425
- length.out=1+length(palette))
426
- if(any(layout==5)){
427
- par(mar=mar[['5']]);
428
- plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
429
- ylim=range(count.bins), yaxs='i', ylab='')
430
- rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
431
- border=NA)
432
- if(exists('5',panel.fun)) panel.fun[['5']]();
433
- }
434
-
435
- # [6] Color scale of the coutnts matrix (horizontal)
436
- if(any(layout==6)){
437
- par(mar=mar[['6']]);
438
- plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
439
- xlim=range(count.bins), xaxs='i',xlab='');
440
- rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
441
- border=NA);
442
- if(exists('6',panel.fun)) panel.fun[['6']]();
443
- }
444
-
445
- par(mar=ori.mar);
446
- return(peaks);
447
- }
448
-
449
- #==============> Define core functions
450
-
451
- #' Enveomics: Recruitment Plot (2)
452
- #'
453
- #' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
454
- #' been previously executed.
455
- #'
456
- #' @param prefix
457
- #' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
458
- #' least the files .rec and .lim must exist with this prefix.
459
- #' @param plot
460
- #' Should the object be plotted?
461
- #' @param pos.breaks
462
- #' Breaks in the positions histogram. It can also be a vector of break
463
- #' points, and values outside the range are ignored. If zero (0), it
464
- #' uses the sequence breaks as defined in the .lim file, which means
465
- #' one bin per contig (or gene, if the mapping is agains genes). Ignored
466
- #' if `pos.breaks.tsv` is passed.
467
- #' @param pos.breaks.tsv
468
- #' Path to a list of (absolute) coordinates to use as position breaks.
469
- #' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
470
- #' must contain at least one column: coordinates of the break positions of
471
- #' each position bin. If it has a second column, this is used as the name
472
- #' of the position bin that ends at the given coordinate (the first row is
473
- #' ignored). Any additional columns are currently ignored. If \code{NA},
474
- #' position bins are determined by \code{pos.breaks}.
475
- #' @param id.breaks
476
- #' Breaks in the identity histogram. It can also be a vector of break
477
- #' points, and values outside the range are ignored.
478
- #' @param id.free.range
479
- #' Indicates that the range should be freely set from the observed
480
- #' values. Otherwise, 70-100\% is included in the identity histogram
481
- #' (default).
482
- #' @param id.metric
483
- #' Metric of identity to be used (Y-axis). Corrected identity is only
484
- #' supported if the original BLAST file included sequence lengths.
485
- #' @param id.summary
486
- #' Function summarizing the identity bins. Other recommended options
487
- #' include: \code{median} to estimate the median instead of total bins, and
488
- #' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
489
- #' @param id.cutoff
490
- #' Cutoff of identity metric above which the hits are considered
491
- #' \code{in-group}. The 95\% identity corresponds to the expectation of
492
- #' ANI<95\% within species.
493
- #' @param threads
494
- #' Number of threads to use.
495
- #' @param verbose
496
- #' Indicates if the function should report the advance.
497
- #' @param ...
498
- #' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
499
- #'
500
- #' @return Returns an object of class \code{\link{enve.RecPlot2}}.
501
- #'
502
- #' @author Luis M. Rodriguez-R [aut, cre]
503
- #' @author Kenji Gerhardt [aut]
504
- #'
505
- #' @export
506
-
507
- enve.recplot2 <- function(
508
- prefix,
509
- plot = TRUE,
510
- pos.breaks = 1e3,
511
- pos.breaks.tsv = NA,
512
- id.breaks = 60,
513
- id.free.range = FALSE,
514
- id.metric = c('identity', 'corrected identity', 'bit score'),
515
- id.summary = sum,
516
- id.cutoff = 95,
517
- threads = 2,
518
- verbose = TRUE,
519
- ...
520
- ){
521
- # Settings
522
- id.metric <- match.arg(id.metric);
523
-
524
- #Read files
525
- if (verbose) cat("Reading files.\n")
526
- rec <- read.table(paste(prefix, ".rec", sep = ""),
527
- sep = "\t", comment.char = "", quote = "");
528
- lim <- read.table(paste(prefix, ".lim", sep = ""),
529
- sep = "\t", comment.char = "", quote = "", as.is = TRUE);
530
-
531
- # Build matrix
532
- if (verbose) cat("Building counts matrix.\n")
533
- if (id.metric == "corrected identity" & ncol(rec) < 6) {
534
- stop("Requesting corr. identity, but .rec file doesn't have 6th column")
535
- }
536
- rec.idcol <- ifelse(id.metric == "identity", 3,
537
- ifelse(id.metric == "corrected identity", 6, 4))
538
- pos.names <- as.character(NULL)
539
- if (!is.na(pos.breaks.tsv)){
540
- tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
541
- pos.breaks <- as.numeric(tmp[, 1])
542
- if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
543
- } else if (length(pos.breaks) == 1) {
544
- if (pos.breaks > 0){
545
- pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
546
- } else {
547
- pos.breaks <- c(lim[1, 2], lim[, 3])
548
- pos.names <- lim[, 1]
549
- }
550
- }
551
- if (length(id.breaks) == 1) {
552
- id.range.v <- rec[, rec.idcol]
553
- if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
554
- id.range.v <- range(id.range.v)
555
- id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
556
- }
557
-
558
- # Run in parallel
559
- # If they already set threads to 1 manually, there's no point in launching
560
- # clusters, it's just slower. Ditto for small files.
561
- if (nrow(rec) < 75000 | threads == 1) {
562
- # Coerces rec into a form that __counts is happy about
563
- rec.l <- list()
564
- rec.l[[1]] <- list(rec = rec, verbose = FALSE)
565
-
566
- # No need to make a temporary variable, there's only one return for sure
567
- # and it's not a list because it isn't coming back from an apply
568
- counts <- enve.recplot2.__counts(
569
- rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
570
- rec.idcol = rec.idcol)
571
- } else {
572
- cl <- makeCluster(threads)
573
- rec.l <- list()
574
- thl <- ceiling(nrow(rec)/threads)
575
- for (i in 0:(threads - 1)) {
576
- rec.l[[i + 1]] <- list(
577
- rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
578
- verbose = ifelse(i == 0, verbose, FALSE))
579
- }
580
- counts.l <- clusterApply(
581
- cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
582
- id.breaks = id.breaks, rec.idcol = rec.idcol)
583
- stopCluster(cl) # No spooky ghost clusters
584
-
585
- counts <- counts.l[[1]]
586
- for (i in 2:threads) counts <- counts + counts.l[[i]]
587
- }
588
-
589
- # Estimate 1D histograms
590
- if (verbose) cat("Building histograms.\n")
591
- id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
592
- id.ingroup <- (id.mids > id.cutoff);
593
- id.counts <- apply(counts, 2, id.summary);
594
- pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
595
- pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
596
-
597
- # Plot and return
598
- recplot <- new('enve.RecPlot2',
599
- counts = counts, id.counts = id.counts,
600
- pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
601
- id.breaks = id.breaks, pos.breaks = pos.breaks,
602
- pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
603
- seq.names = lim[, 1], id.ingroup = id.ingroup,
604
- id.metric = id.metric, call = match.call());
605
- if (plot) {
606
- if (verbose) cat("Plotting.\n")
607
- peaks <- plot(recplot, ...);
608
- attr(recplot, "peaks") <- peaks
609
- }
610
- return(recplot);
611
- }
612
-
613
- #' Enveomics: Recruitment Plot (2) Peak Finder
614
- #'
615
- #' Identifies peaks in the population histogram potentially indicating
616
- #' sub-population mixtures.
617
- #'
618
- #' @param x
619
- #' An \code{\link{enve.RecPlot2}} object.
620
- #' @param method
621
- #' Peak-finder method. This should be one of:
622
- #' \itemize{
623
- #' \item \strong{emauto}
624
- #' (Expectation-Maximization with auto-selection of components)
625
- #' \item \strong{em}
626
- #' (Expectation-Maximization)
627
- #' \item \strong{mower}
628
- #' (Custom distribution-mowing method)
629
- #' }
630
- #' @param ...
631
- #' Any additional parameters supported by
632
- #' \code{\link{enve.recplot2.findPeaks}}.
633
- #'
634
- #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
635
- #'
636
- #' @author Luis M. Rodriguez-R [aut, cre]
637
- #'
638
- #' export
639
-
640
- enve.recplot2.findPeaks <- function(
641
- x,
642
- method="emauto",
643
- ...
644
- ){
645
- if(method == "emauto"){
646
- peaks <- enve.recplot2.findPeaks.emauto(x, ...)
647
- }else if(method == "em"){
648
- peaks <- enve.recplot2.findPeaks.em(x, ...)
649
- }else if(method == "mower"){
650
- peaks <- enve.recplot2.findPeaks.mower(x, ...)
651
- }else{
652
- stop("Invalid peak-finder method ", method)
653
- }
654
- return(peaks)
655
- }
656
-
657
- #' Enveomics: Recruitment Plot (2) Emauto Peak Finder
658
- #'
659
- #' Identifies peaks in the population histogram using a Gaussian Mixture
660
- #' Model Expectation Maximization (GMM-EM) method with number of components
661
- #' automatically detected.
662
- #'
663
- #' @param x
664
- #' An \code{\link{enve.RecPlot2}} object.
665
- #' @param components
666
- #' A vector of number of components to evaluate.
667
- #' @param criterion
668
- #' Criterion to use for components selection. Must be one of:
669
- #' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
670
- #' (Bayesian Information Criterion or Schwarz Criterion).
671
- #' @param merge.tol
672
- #' When attempting to merge peaks with very similar sequencing depth, use
673
- #' this number of significant digits (in log-scale).
674
- #' @param verbose
675
- #' Display (mostly debugging) information.
676
- #' @param ...
677
- #' Any additional parameters supported by
678
- #' \code{\link{enve.recplot2.findPeaks.em}}.
679
- #'
680
- #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
681
- #'
682
- #' @author Luis M. Rodriguez-R [aut, cre]
683
- #'
684
- #' @export
685
-
686
- enve.recplot2.findPeaks.emauto <- function(
687
- x,
688
- components = seq(1, 5),
689
- criterion = 'aic',
690
- merge.tol = 2L,
691
- verbose = FALSE,
692
- ...
693
- ){
694
- best <- list(crit=0, pstore=list())
695
- if(criterion == 'aic'){
696
- do_crit <- function(ll, k, n) 2*k - 2*ll
697
- }else if(criterion %in% c('bic', 'sbc')){
698
- do_crit <- function(ll, k, n) log(n)*k - 2*ll
699
- }else{
700
- stop('Invalid criterion ', criterion)
701
- }
702
- for(comp in components){
703
- if(verbose) cat('Testing:',comp,'\n')
704
- best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
705
- verbose, ...)
706
- }
707
- if(length(best[['peaks']])==0) return(list())
708
-
709
- seqdepths.r <- signif(log(sapply(best[['peaks']],
710
- function(x) x$seq.depth)), merge.tol)
711
- distinct <- length(unique(seqdepths.r))
712
- if(distinct < length(best[['peaks']])){
713
- if(verbose) cat('Attempting merge to', distinct, 'components\n')
714
- init <- apply(sapply(best[['peaks']],
715
- function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
716
- init <- init[!duplicated(seqdepths.r),]
717
- init <- list(mu=init[,'mean'], sd=init[,'sd'],
718
- alpha=init[,'alpha']/sum(init[,'alpha']))
719
- best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
720
- verbose, ...)
721
- }
722
- return(best[['peaks']])
723
- }
724
-
725
- #' Enveomics: Recruitment Plot (2) Em Peak Finder
726
- #'
727
- #' Identifies peaks in the population histogram using a Gaussian Mixture
728
- #' Model Expectation Maximization (GMM-EM) method.
729
- #'
730
- #' @param x
731
- #' An \code{\link{enve.RecPlot2}} object.
732
- #' @param max.iter
733
- #' Maximum number of EM iterations.
734
- #' @param ll.diff.res
735
- #' Maximum Log-Likelihood difference to be considered as convergent.
736
- #' @param components
737
- #' Number of distributions assumed in the mixture.
738
- #' @param rm.top
739
- #' Top-values to remove before finding peaks, as a quantile probability.
740
- #' This step is useful to remove highly conserved regions, but can be
741
- #' turned off by setting \code{rm.top=0}. The quantile is determined
742
- #' \strong{after} removing zero-coverage windows.
743
- #' @param verbose
744
- #' Display (mostly debugging) information.
745
- #' @param init
746
- #' Initialization parameters. By default, these are derived from k-means
747
- #' clustering. A named list with vectors for \code{mu}, \code{sd}, and
748
- #' \code{alpha}, each of length \code{components}.
749
- #' @param log
750
- #' Logical value indicating if the estimations should be performed in
751
- #' natural logarithm units. Do not change unless you know what you're
752
- #' doing.
753
- #'
754
- #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
755
- #'
756
- #' @author Luis M. Rodriguez-R [aut, cre]
757
- #'
758
- #' @export
759
-
760
- enve.recplot2.findPeaks.em <- function(
761
- x,
762
- max.iter = 1000,
763
- ll.diff.res = 1e-8,
764
- components = 2,
765
- rm.top = 0.05,
766
- verbose = FALSE,
767
- init,
768
- log = TRUE
769
- ){
770
-
771
- # Essential vars
772
- pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
773
- lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
774
- lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
775
- if(log) lsd1 <- log(lsd1)
776
-
777
- # 1. Initialize
778
- if(missing(init)){
779
- km.clust <- kmeans(lsd1, components)$cluster
780
- init <- list(
781
- mu = tapply(lsd1, km.clust, mean),
782
- sd = tapply(lsd1, km.clust, sd),
783
- alpha = table(km.clust) / length(km.clust)
784
- )
785
- }
786
- m.step <- init
787
- ll <- c()
788
- cur.ll <- -Inf
789
-
790
- for(i in 1:max.iter){
791
- # 2/3. EM
792
- e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
793
- m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
794
- # 4. Convergence
795
- ll <- c(ll, e.step[["ll"]])
796
- ll.diff <- abs(cur.ll - e.step[["ll"]])
797
- cur.ll <- e.step[["ll"]]
798
- if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
799
- if(is.na(ll.diff) || ll.diff == Inf) break
800
- if(ll.diff <= ll.diff.res) break
801
- }
802
-
803
- # Return
804
- peaks <- list()
805
- for(i in 1:components){
806
- n.hat <- m.step[['alpha']][i]*length(lsd1)
807
- peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
808
- values.res=0, mode=m.step[['mu']][i],
809
- param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
810
- n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
811
- merge.logdist=as.numeric(), log=log,
812
- seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
813
- }
814
- return(peaks)
815
- }
816
-
817
- #' Enveomics: Recruitment Plot (2) Mowing Peak Finder
818
- #'
819
- #' Identifies peaks in the population histogram potentially indicating
820
- #' sub-population mixtures, using a custom distribution-mowing method.
821
- #'
822
- #' @param x
823
- #' An \code{\link{enve.RecPlot2}} object.
824
- #' @param min.points
825
- #' Minimum number of points in the quantile-estimation-range
826
- #' \code{(quant.est)} to estimate a peak.
827
- #' @param quant.est
828
- #' Range of quantiles to be used in the estimation of a peak's
829
- #' parameters.
830
- #' @param mlv.opts
831
- #' Ignored. For backwards compatibility.
832
- #' @param fitdist.opts.sn
833
- #' Options passed to \code{fitdist} to estimate the standard deviation if
834
- #' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
835
- #' ammended with \code{xi=estimated} mode for each peak.
836
- #' @param fitdist.opts.norm
837
- #' Options passed to \code{fitdist} to estimate the standard deviation if
838
- #' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
839
- #' ammended with \code{mean=estimated} mode for each peak.
840
- #' @param rm.top
841
- #' Top-values to remove before finding peaks, as a quantile probability.
842
- #' This step is useful to remove highly conserved regions, but can be
843
- #' turned off by setting \code{rm.top=0}. The quantile is determined
844
- #' \strong{after} removing zero-coverage windows.
845
- #' @param with.skewness
846
- #' Allow skewness correction of the peaks. Typically, the
847
- #' sequencing-depth distribution for a single peak is left-skewed, due
848
- #' partly (but not exclusively) to fragmentation and mapping sensitivity.
849
- #' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
850
- #' alternative solution for the first problem (fragmentation) called
851
- #' "tail distribution".
852
- #' @param optim.rounds
853
- #' Maximum rounds of peak optimization.
854
- #' @param optim.epsilon
855
- #' Trace change at which optimization stops (unless \code{optim.rounds} is
856
- #' reached first). The trace change is estimated as the sum of square
857
- #' differences between parameters in one round and those from two rounds
858
- #' earlier (to avoid infinite loops from approximation).
859
- #' @param merge.logdist
860
- #' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
861
- #' to attempt merging. The default of ~0.22 corresponds to a maximum
862
- #' difference of 25\%.
863
- #' @param verbose
864
- #' Display (mostly debugging) information.
865
- #' @param log
866
- #' Logical value indicating if the estimations should be performed in
867
- #' natural logarithm units. Do not change unless you know what you're
868
- #' doing.
869
- #'
870
- #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
871
- #'
872
- #' @author Luis M. Rodriguez-R [aut, cre]
873
- #'
874
- #' @export
875
-
876
- enve.recplot2.findPeaks.mower <- function(
877
- x,
878
- min.points=10,
879
- quant.est=c(0.002, 0.998),
880
- mlv.opts=list(method='parzen'),
881
- fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
882
- start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
883
- fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
884
- start=list(sd=1), lower=c(0, -Inf)),
885
- rm.top=0.05,
886
- with.skewness=TRUE,
887
- optim.rounds=200,
888
- optim.epsilon=1e-4,
889
- merge.logdist=log(1.75),
890
- verbose=FALSE,
891
- log=TRUE
892
- ){
893
-
894
- # Essential vars
895
- pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
896
- seqdepth.in <- x$pos.counts.in/pos.binsize;
897
- lsd1 <- seqdepth.in[seqdepth.in>0];
898
- lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
899
- if(log) lsd1 <- log(lsd1)
900
- if(with.skewness){
901
- fitdist.opts <- fitdist.opts.sn
902
- }else{
903
- fitdist.opts <- fitdist.opts.norm
904
- }
905
- peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
906
- mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
907
- optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
908
- n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
909
-
910
- # Find seed peaks
911
- if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
912
- peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
913
-
914
- # Merge overlapping peaks
915
- if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
916
- merged <- (length(peaks)>1)
917
- while(merged){
918
- merged <- FALSE
919
- ignore <- c()
920
- peaks2 <- list();
921
- for(i in 1:length(peaks)){
922
- if(i %in% ignore) next
923
- p <- peaks[[ i ]]
924
- j <- enve.recplot2.__whichClosestPeak(p, peaks)
925
- p2 <- peaks[[ j ]]
926
- dst.a <- p$param.hat[[ length(p$param.hat) ]]
927
- dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
928
- if( abs(log(dst.a/dst.b)) < merge.logdist ){
929
- if(verbose) cat('==> Attempting a merge at',
930
- p$param.hat[[ length(p$param.hat) ]],'&',
931
- p2$param.hat[[ length(p2$param.hat) ]],'X\n');
932
- peaks.opts$lsd1 <- c(p$values, p2$values)
933
- p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
934
- if(length(p.new)==1){
935
- peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
936
- ignore <- c(ignore, j)
937
- merged <- TRUE
938
- }
939
- }
940
- if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
941
- }
942
- peaks <- peaks2
943
- if(length(peaks)==1) break
944
- }
945
-
946
- if(verbose) cat('Found',length(peaks),'peak(s)\n')
947
- return(peaks);
948
- }
949
-
950
- #==============> Define utils
951
-
952
- #' Enveomics: Recruitment Plot (2) Core Peak Finder
953
- #'
954
- #' Finds the peak in a list of peaks that is most likely to represent the
955
- #' "core genome" of a population.
956
- #'
957
- #' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
958
- #'
959
- #' @author Luis M. Rodriguez-R [aut, cre]
960
- #'
961
- #' @export
962
-
963
- enve.recplot2.corePeak <- function
964
- (x
965
- ){
966
- # Find the peak with maximum depth (centrality)
967
- maxPeak <- x[[
968
- which.max(as.numeric(lapply(x,
969
- function(y) y$param.hat[[ length(y$param.hat) ]])))
970
- ]]
971
- # If a "larger" peak (a peak explaining more bins of the genome) is within
972
- # the default "merge.logdist" distance, take that one instead.
973
- corePeak <- maxPeak
974
- for(p in x){
975
- p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
976
- corePeak.len <- ifelse(
977
- length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
978
- sz.d <- log(p.len/corePeak.len)
979
- if(is.nan(sz.d) || sz.d < 0) next
980
- sq.d.a <- as.numeric(tail(p$param.hat, n=1))
981
- sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
982
- if(p$log) sq.d.a <- exp(sq.d.a)
983
- if(corePeak$log) sq.d.b <- exp(sq.d.b)
984
- if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
985
- }
986
- return(corePeak)
987
- }
988
-
989
- #' Enveomics: Recruitment Plot (2) Change Cutoff
990
- #'
991
- #' Change the intra-species cutoff of an existing recruitment plot.
992
- #'
993
- #' @param rp
994
- #' \code{\link{enve.RecPlot2}} object.
995
- #' @param new.cutoff
996
- #' New cutoff to use.
997
- #'
998
- #' @author Luis M. Rodriguez-R [aut, cre]
999
- #'
1000
- #' @export
1001
-
1002
- enve.recplot2.changeCutoff <- function
1003
- (rp,
1004
- new.cutoff=98
1005
- ){
1006
- # Re-calculate vectors
1007
- id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
1008
- id.ingroup <- (id.mids > new.cutoff)
1009
- pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
1010
- pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
1011
- # Update object
1012
- attr(rp, "id.ingroup") <- id.ingroup
1013
- attr(rp, "pos.counts.in") <- pos.counts.in
1014
- attr(rp, "pos.counts.out") <- pos.counts.out
1015
- attr(rp, "call") <- match.call()
1016
- return(rp)
1017
- }
1018
-
1019
- #' Enveomics: Recruitment Plot (2) Window Depth Threshold
1020
- #'
1021
- #' Identifies the threshold below which windows should be identified as
1022
- #' variable or absent.
1023
- #'
1024
- #' @param rp
1025
- #' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
1026
- #' @param peak
1027
- #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1028
- #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1029
- #' used (see \code{\link{enve.recplot2.corePeak}}).
1030
- #' @param lower.tail
1031
- #' If \code{FALSE}, it returns windows significantly above the peak in
1032
- #' sequencing depth.
1033
- #' @param significance
1034
- #' Significance threshold (alpha) to select windows.
1035
- #'
1036
- #' @return
1037
- #' Returns a float. The units are depth if the peaks were estimated in
1038
- #' linear scale, or log-depth otherwise (\code{peak$log}).
1039
- #'
1040
- #' @author Luis M. Rodriguez-R [aut, cre]
1041
- #'
1042
- #' @export
1043
-
1044
- enve.recplot2.windowDepthThreshold <- function
1045
- (rp,
1046
- peak,
1047
- lower.tail=TRUE,
1048
- significance=0.05
1049
- ){
1050
- if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
1051
- par <- peak$param.hat
1052
- par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
1053
- thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
1054
- if(peak$log) thr <- exp(thr)
1055
-
1056
- return(thr)
1057
- }
1058
-
1059
- #' Enveomics: Recruitment Plot (2) Extract Windows
1060
- #'
1061
- #' Extract windows significantly below (or above) the peak in sequencing
1062
- #' depth.
1063
- #'
1064
- #' @param rp
1065
- #' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
1066
- #' @param peak
1067
- #' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
1068
- #' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
1069
- #' used (see \code{\link{enve.recplot2.corePeak}}).
1070
- #' @param lower.tail
1071
- #' If \code{FALSE}, it returns windows significantly above the peak in
1072
- #' sequencing depth.
1073
- #' @param significance
1074
- #' Significance threshold (alpha) to select windows.
1075
- #' @param seq.names
1076
- #' Returns subject sequence names instead of a vector of Booleans. If
1077
- #' the recruitment plot was generated with named position bins (e.g, using
1078
- #' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
1079
- #' vector of characters (the sequence identifiers), otherwise it returns a
1080
- #' data.frame with a name column and two columns of coordinates.
1081
- #'
1082
- #' @return
1083
- #' Returns a vector of logicals if \code{seq.names = FALSE}.
1084
- #' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
1085
- #' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
1086
- #' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
1087
- #'
1088
- #' @author Luis M. Rodriguez-R [aut, cre]
1089
- #'
1090
- #' @export
1091
-
1092
- enve.recplot2.extractWindows <- function
1093
- (rp,
1094
- peak,
1095
- lower.tail = TRUE,
1096
- significance = 0.05,
1097
- seq.names = FALSE
1098
- ){
1099
- # Determine the threshold
1100
- thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
1101
-
1102
- # Select windows past the threshold
1103
- seqdepth.in <- enve.recplot2.seqdepth(rp)
1104
- if(lower.tail){
1105
- sel <- seqdepth.in < thr
1106
- }else{
1107
- sel <- seqdepth.in > thr
1108
- }
1109
-
1110
- # seq.names = FALSE
1111
- if(!seq.names) return(sel)
1112
- # seq.names = TRUE
1113
- return(enve.recplot2.coordinates(rp, sel))
1114
- }
1115
-
1116
- #' Enveomics: Recruitment Plot (2) Compare Identities
1117
- #'
1118
- #' Compare the distribution of identities between two
1119
- #' \code{\link{enve.RecPlot2}} objects.
1120
- #'
1121
- #' @param x
1122
- #' First \code{\link{enve.RecPlot2}} object.
1123
- #' @param y
1124
- #' Second \code{\link{enve.RecPlot2}} object.
1125
- #' @param method
1126
- #' Distance method to use. This should be (an unambiguous abbreviation of)
1127
- #' one of:
1128
- #' \itemize{
1129
- #' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
1130
- #' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
1131
- #' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
1132
- #' doi:10.1214/aoms/1177729694}), or}
1133
- #' \item{"euclidean"}
1134
- #' }
1135
- #' @param smooth.par
1136
- #' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
1137
- #' Use \code{NULL} to automatically determine this value using leave-one-out
1138
- #' cross-validation (see \code{smooth.spline} parameter \code{spar}).
1139
- #' @param pseudocounts
1140
- #' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
1141
- #' 1 for add-one smoothing.
1142
- #' @param max.deviation
1143
- #' Maximum mean deviation between identity breaks tolerated (as percent
1144
- #' identity). Difference in number of \code{id.breaks} is never tolerated.
1145
- #'
1146
- #' @author Luis M. Rodriguez-R [aut, cre]
1147
- #'
1148
- #' @export
1149
-
1150
- enve.recplot2.compareIdentities <- function
1151
- (x,
1152
- y,
1153
- method="hellinger",
1154
- smooth.par=NULL,
1155
- pseudocounts=0,
1156
- max.deviation=0.75
1157
- ){
1158
- METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
1159
- i.meth <- pmatch(method, METHODS)
1160
- if (is.na(i.meth)) stop("Invalid distance ", method)
1161
- if(!inherits(x, "enve.RecPlot2"))
1162
- stop("'x' must inherit from class `enve.RecPlot2`")
1163
- if(!inherits(y, "enve.RecPlot2"))
1164
- stop("'y' must inherit from class `enve.RecPlot2`")
1165
- if(length(x$id.breaks) != length(y$id.breaks))
1166
- stop("'x' and 'y' must have the same number of `id.breaks`")
1167
- dev <- mean(abs(x$id.breaks - y$id.breaks))
1168
- if(dev > max.deviation)
1169
- stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
1170
- dev)
1171
- x.cnt <- x$id.counts
1172
- y.cnt <- y$id.counts
1173
- if(is.null(smooth.par) || smooth.par > 0){
1174
- x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
1175
- y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
1176
- p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
1177
- q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
1178
- x.cnt <- pmax(p.spline$y, 0)
1179
- y.cnt <- pmax(q.spline$y, 0)
1180
- }
1181
- a <- as.numeric(pseudocounts)
1182
- p <- (x.cnt + a) / sum(x.cnt + a)
1183
- q <- (y.cnt + a) / sum(y.cnt + a)
1184
- d <- NA
1185
- if(i.meth %in% c(1L, 2L)){
1186
- d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
1187
- if(i.meth==2L) d <- 1 - d**2
1188
- }else if(i.meth %in% c(3L, 4L)){
1189
- sel <- p>0
1190
- if(any(q[sel]==0))
1191
- stop("Undefined distance without absolute continuity, use pseudocounts")
1192
- d <- -sum(p[sel]*log(q[sel]/p[sel]))
1193
- }else if(i.meth == 5L){
1194
- d <- sqrt(sum((q-p)**2))
1195
- }
1196
- return(d)
1197
- }
1198
-
1199
- #' Enveomics: Recruitment Plot (2) Coordinates
1200
- #'
1201
- #' Returns the sequence name and coordinates of the requested position bins.
1202
- #'
1203
- #' @param x
1204
- #' \code{\link{enve.RecPlot2}} object.
1205
- #' @param bins
1206
- #' Vector of selected bins to return. It can be a vector of logical values
1207
- #' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
1208
- #' missing, returns the coordinates of all windows.
1209
- #'
1210
- #' @return
1211
- #' Returns a data.frame with five columns: \code{name.from} (character),
1212
- #' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
1213
- #' (numeric), and \code{seq.name} (character).
1214
- #' The first two correspond to sequence and position of the start point of the
1215
- #' bin. The next two correspond to the sequence and position of the end point of
1216
- #' the bin. The last one indicates the name of the sequence (if defined).
1217
- #'
1218
- #' @author Luis M. Rodriguez-R [aut, cre]
1219
- #'
1220
- #' @export
1221
-
1222
- enve.recplot2.coordinates <- function
1223
- (x,
1224
- bins
1225
- ){
1226
- if(!inherits(x, "enve.RecPlot2"))
1227
- stop("'x' must inherit from class `enve.RecPlot2`")
1228
- if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
1229
- if(!is.vector(bins)) stop("'bins' must be a vector")
1230
- if(inherits(bins, "logical")) bins <- which(bins)
1231
-
1232
- y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
1233
-
1234
- for(i in 1:length(bins)){
1235
- j <- bins[i]
1236
- # Concatenated coordinates
1237
- cc <- x$pos.breaks[c(j, j+1)]
1238
- # Find the corresponding `seq.breaks`
1239
- sb.from <- which(
1240
- cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
1241
- cc[1] < x$seq.breaks[-1])
1242
- sb.to <- which(
1243
- cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
1244
- cc[2] <= x$seq.breaks[-1])
1245
- # Translate coordinates
1246
- if(length(sb.from)==1 & length(sb.to)==1){
1247
- y[i, 'name.from'] <- x$seq.names[sb.from]
1248
- y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
1249
- y[i, 'name.to'] <- x$seq.names[sb.to]
1250
- y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
1251
- y[i, 'seq.name'] <- x$pos.names[i]
1252
- }
1253
- }
1254
-
1255
- return(y)
1256
- }
1257
-
1258
- #' Enveomics: Recruitment Plot (2) Sequencing Depth
1259
- #'
1260
- #' Calculate the sequencing depth of the given window(s).
1261
- #'
1262
- #' @param x
1263
- #' \code{\link{enve.RecPlot2}} object.
1264
- #' @param sel
1265
- #' Window(s) for which the sequencing depth is to be calculated. If not
1266
- #' passed, it returns the sequencing depth of all windows.
1267
- #' @param low.identity
1268
- #' A logical indicating if the sequencing depth is to be estimated only
1269
- #' with low-identity matches. By default, only high-identity matches are
1270
- #' used.
1271
- #'
1272
- #' @return
1273
- #' Returns a numeric vector of sequencing depths (in bp/bp).
1274
- #'
1275
- #' @author Luis M. Rodriguez-R [aut, cre]
1276
- #'
1277
- #' @export
1278
-
1279
- enve.recplot2.seqdepth <- function
1280
-
1281
- (x,
1282
- sel,
1283
- low.identity=FALSE
1284
- ){
1285
- if(!inherits(x, "enve.RecPlot2"))
1286
- stop("'x' must inherit from class `enve.RecPlot2`")
1287
- if(low.identity){
1288
- pos.cnts.in <- x$pos.counts.out
1289
- }else{
1290
- pos.cnts.in <- x$pos.counts.in
1291
- }
1292
- pos.breaks <- x$pos.breaks
1293
- pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
1294
- seqdepth.in <- pos.cnts.in/pos.binsize
1295
- if(missing(sel)) return(seqdepth.in)
1296
- return(seqdepth.in[sel])
1297
- }
1298
-
1299
- #' Enveomics: Recruitment Plot (2) ANI Estimate
1300
- #'
1301
- #' Estimate the Average Nucleotide Identity from reads (ANIr) from a
1302
- #' recruitment plot.
1303
- #'
1304
- #' @param x
1305
- #' \code{\link{enve.RecPlot2}} object.
1306
- #' @param range
1307
- #' Range of identities to be considered. By default, the full range
1308
- #' is used (note that the upper boundary is \code{Inf} and not 100 because
1309
- #' recruitment plots can also be built with bit-scores). To use only
1310
- #' intra-population matches (with identities), use c(95,100). To use only
1311
- #' inter-population values, use c(0,95).
1312
- #'
1313
- #' @author Luis M. Rodriguez-R [aut, cre]
1314
- #'
1315
- #' @export
1316
-
1317
- enve.recplot2.ANIr <- function
1318
- (x,
1319
- range=c(0,Inf)
1320
- ){
1321
- if(!inherits(x, "enve.RecPlot2"))
1322
- stop("'x' must inherit from class `enve.RecPlot2`")
1323
- id.b <- x$id.breaks
1324
- id <- (id.b[-1]+id.b[-length(id.b)])/2
1325
- cnt <- x$id.counts
1326
- cnt[id < range[1]] <- 0
1327
- cnt[id > range[2]] <- 0
1328
- return(sum(id*cnt/sum(cnt)))
1329
- }
1330
-
1331
- #==============> Define internal functions
1332
-
1333
- #' Enveomics: Recruitment Plot (2) Internal Ancillary Function
1334
- #'
1335
- #' Internal ancillary function (see \code{\link{enve.recplot2}}).
1336
- #'
1337
- #' @param x \code{\link{enve.RecPlot2}} object
1338
- #' @param pos.breaks Position breaks
1339
- #' @param id.breaks Identity breaks
1340
- #' @param rec.idcol Identity column to use
1341
- #'
1342
- #' @author Luis M. Rodriguez-R [aut, cre]
1343
- #' @author Kenji Gerhardt [aut]
1344
- #'
1345
- #' @export
1346
-
1347
- enve.recplot2.__counts <- function
1348
- (x, pos.breaks, id.breaks, rec.idcol) {
1349
- rec2 <- x$rec
1350
- verbose <- x$verbose
1351
-
1352
- # get counts of how many occurrences of each genome pos.bin there are per read
1353
- x.bins <- mapply(
1354
- function(start, end) {
1355
- list(rle(findInterval(start:end, pos.breaks, left.open = T)))
1356
- }, rec2[, 1], rec2[, 2])
1357
-
1358
- # find the single y bin for each row, replicates it at the correct places to
1359
- # the number of distinct bins found in its row
1360
- y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
1361
- times = unlist(lapply(x.bins, function(a) length(a$lengths))))
1362
-
1363
- # x.bins_counts is the number of occurrences of each bin a row contains,
1364
- # per row, then unlisted
1365
- x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
1366
-
1367
- # these are the pos. in. genome bins that each count in x.bins_counts falls into
1368
- x.bins <- unlist(lapply(x.bins, function(a) a$values))
1369
-
1370
- # much more efficient counts implementation in R using lists instead of a matrix:
1371
- counts <- lapply(
1372
- 1:(length(pos.breaks) - 1),
1373
- function(col_len) rep(0, length(id.breaks) - 1))
1374
-
1375
- # accesses the correct list in counts by x.bin, then
1376
- # accesses the position in that row by y.bins and adds the new count
1377
- for(i in 1:length(x.bins)){
1378
- counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
1379
- }
1380
-
1381
- counts <- do.call(rbind, counts)
1382
- return(counts)
1383
- }
1384
-
1385
- #' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
1386
- #'
1387
- #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
1388
- #'
1389
- #' @param x \code{\link{enve.RecPlot2}} object
1390
- #' @param comp Components
1391
- #' @param do_crit Function estimating the criterion
1392
- #' @param best Best solution thus far
1393
- #' @param verbose If verbose
1394
- #' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
1395
- #'
1396
- #' @author Luis M. Rodriguez-R [aut, cre]
1397
- #'
1398
- #' @export
1399
-
1400
- enve.recplot2.findPeaks.__emauto_one <- function
1401
- (x, comp, do_crit, best, verbose, ...){
1402
- peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
1403
- if(length(peaks)==0) return(best)
1404
- k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
1405
- crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
1406
- if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
1407
- ifelse(crit > best[['crit']], '*', ''), '\n')
1408
- if(crit > best[['crit']]){
1409
- best[['crit']] <- crit
1410
- best[['peaks']] <- peaks
1411
- }
1412
- best[['pstore']][[comp]] <- peaks
1413
- return(best)
1414
- }
1415
-
1416
- #' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
1417
- #'
1418
- #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1419
- #'
1420
- #' @param x Vector of log-transformed sequencing depths
1421
- #' @param theta Parameters list
1422
- #'
1423
- #' @author Luis M. Rodriguez-R [aut, cre]
1424
- #'
1425
- #' @export
1426
-
1427
- enve.recplot2.findPeaks.__em_e <- function
1428
- (x, theta){
1429
- components <- length(theta[['mu']])
1430
- product <- do.call(cbind,
1431
- lapply(1:components,
1432
- function(i) dnorm(x, theta[['mu']][i],
1433
- theta[['sd']][i])*theta[['alpha']][i]))
1434
- sum.of.components <- rowSums(product)
1435
- posterior <- product / sum.of.components
1436
- for(i in which(sum.of.components == Inf)) {
1437
- cat(i,'/',nrow(product), ':', product[i,], '\n')
1438
- }
1439
-
1440
- return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1441
- }
1442
-
1443
- #' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
1444
- #'
1445
- #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
1446
- #'
1447
- #' @param x Vector of log-transformed sequencing depths
1448
- #' @param posterior Posterior probability
1449
- #'
1450
- #' @author Luis M. Rodriguez-R [aut, cre]
1451
- #'
1452
- #' @export
1453
-
1454
- enve.recplot2.findPeaks.__em_m <- function
1455
- (x, posterior){
1456
- components <- ncol(posterior)
1457
- n <- colSums(posterior)
1458
- mu <- colSums(posterior * x) / n
1459
- sd <- sqrt( colSums(
1460
- posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1461
- alpha <- n/length(x)
1462
- return(list(mu=mu, sd=sd, alpha=alpha))
1463
- }
1464
-
1465
- #' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
1466
- #'
1467
- #' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
1468
- #'
1469
- #' @param x \code{\link{enve.RecPlot2.Peak}} object
1470
- #' @param mids Midpoints
1471
- #' @param counts Counts
1472
- #'
1473
- #' @author Luis M. Rodriguez-R [aut, cre]
1474
- #'
1475
- #' @export
1476
-
1477
- enve.recplot2.__peakHist <- function
1478
- (x, mids, counts=TRUE){
1479
- d.o <- x$param.hat
1480
- if(length(x$log)==0) x$log <- FALSE
1481
- if(x$log){
1482
- d.o$x <- log(mids)
1483
- }else{
1484
- d.o$x <- mids
1485
- }
1486
- prob <- do.call(paste('d', x$dist, sep=''), d.o)
1487
- if(!counts) return(prob)
1488
- if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
1489
- return(prob*x$n.hat/sum(prob))
1490
- }
1491
-
1492
- #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
1493
- #'
1494
- #' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1495
- #'
1496
- #' @param lsd1 Vector of log-transformed sequencing depths
1497
- #' @param min.points Minimum number of points
1498
- #' @param quant.est Quantile estimate
1499
- #' @param mlv.opts List of options for \code{mlv}
1500
- #' @param fitdist.opts List of options for \code{fitdist}
1501
- #' @param with.skewness If skewed-normal should be used
1502
- #' @param optim.rounds Maximum number of optimization rounds
1503
- #' @param optim.epsilon Minimum difference considered negligible
1504
- #' @param n.total Global number of windows
1505
- #' @param merge.logdist Attempted \code{merge.logdist} parameter
1506
- #' @param verbose If verbose
1507
- #' @param log If log-transformed depths
1508
- #'
1509
- #' @author Luis M. Rodriguez-R [aut, cre]
1510
- #'
1511
- #' @export
1512
-
1513
- enve.recplot2.findPeaks.__mow_one <- function
1514
- (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
1515
- optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
1516
- ){
1517
- dist <- ifelse(with.skewness, 'sn', 'norm');
1518
-
1519
- # Find peak
1520
- o <- mlv.opts; o$x = lsd1;
1521
- mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
1522
- if(verbose) cat('Anchoring at mode =',mode1,'\n')
1523
- param.hat <- fitdist.opts$start; last.hat <- param.hat;
1524
- lim <- NA;
1525
- if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
1526
-
1527
- # Refine peak parameters
1528
- for(round in 1:optim.rounds){
1529
- param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
1530
- lim.o <- param.hat
1531
- lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
1532
- lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
1533
- if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
1534
- 'and params =',as.numeric(param.hat),' \r')
1535
- if(length(lsd1.pop) < min.points) break;
1536
- o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
1537
- last.last.hat <- last.hat
1538
- last.hat <- param.hat
1539
- param.hat <- as.list(do.call(fitdist, o)$estimate);
1540
- if(any(is.na(param.hat))){
1541
- if(round>1) param.hat <- last.hat;
1542
- break;
1543
- }
1544
- if(round > 1){
1545
- epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1546
- if(epsilon1 < optim.epsilon) break;
1547
- if(round > 2){
1548
- epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1549
- if(epsilon2 < optim.epsilon) break;
1550
- }
1551
- }
1552
- }
1553
- if(verbose) cat('\n')
1554
- if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
1555
-
1556
- # Mow distribution
1557
- lsd2 <- c();
1558
- lsd.pop <- c();
1559
- n.hat <- length(lsd1.pop)/diff(quant.est)
1560
- peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
1561
- param.hat=param.hat, n.hat=n.hat, n.total=n.total,
1562
- merge.logdist=merge.logdist, log=log)
1563
- peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
1564
- peak.cnt <- enve.recplot2.__peakHist(peak,
1565
- (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
1566
- for(i in 2:length(peak.breaks)){
1567
- values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
1568
- n.exp <- peak.cnt[i-1]
1569
- if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
1570
- if(length(values)==0) next
1571
- in.peak <- runif(length(values)) <= n.exp/length(values)
1572
- lsd2 <- c(lsd2, values[!in.peak])
1573
- lsd.pop <- c(lsd.pop, values[in.peak])
1574
- }
1575
- if(length(lsd.pop) < min.points) return(NULL)
1576
-
1577
- # Return peak
1578
- attr(peak, 'values') <- lsd.pop
1579
- attr(peak, 'values.res') <- lsd2
1580
- attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
1581
- plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
1582
- plot=FALSE)$counts)+1)/2
1583
- mu <- tail(param.hat, n=1)
1584
- attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
1585
- if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
1586
- 'with expected n =',n.hat,'\n')
1587
- return(peak)
1588
- }
1589
-
1590
- #' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
1591
- #'
1592
- #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
1593
- #'
1594
- #' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
1595
- #'
1596
- #' @author Luis M. Rodriguez-R [aut, cre]
1597
- #'
1598
- #' @export
1599
-
1600
- enve.recplot2.findPeaks.__mower <- function
1601
- (peaks.opts){
1602
- peaks <- list()
1603
- while(length(peaks.opts$lsd1) > peaks.opts$min.points){
1604
- peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
1605
- if(is.null(peak)) break
1606
- peaks[[ length(peaks)+1 ]] <- peak
1607
- peaks.opts$lsd1 <- peak$values.res
1608
- }
1609
- return(peaks)
1610
- }
1611
-
1612
- #' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
1613
- #'
1614
- #' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
1615
- #'
1616
- #' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
1617
- #' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
1618
- #'
1619
- #' @author Luis M. Rodriguez-R [aut, cre]
1620
- #'
1621
- #' @export
1622
-
1623
- enve.recplot2.__whichClosestPeak <- function
1624
- (peak, peaks){
1625
- dist <- as.numeric(lapply(peaks,
1626
- function(x)
1627
- abs(log(x$param.hat[[ length(x$param.hat) ]] /
1628
- peak$param.hat[[ length(peak$param.hat) ]] ))))
1629
- dist[ dist==0 ] <- Inf
1630
- return(which.min(dist))
1631
- }