miga-base 1.2.17.0 → 1.2.17.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI +3659 -0
- data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
- data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +906 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +165 -0
- data/utils/enveomics/Manifest/examples.json +162 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +55 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +421 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +108 -0
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +80 -0
- data/utils/enveomics/enveomics.R/README.md +81 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- metadata +301 -5
@@ -0,0 +1,1631 @@
|
|
1
|
+
#==============> Define S4 classes
|
2
|
+
|
3
|
+
#' Enveomics: Recruitment Plot (2) - S4 Class
|
4
|
+
#'
|
5
|
+
#' Enve-omics representation of Recruitment plots. This object can
|
6
|
+
#' be produced by \code{\link{enve.recplot2}} and supports S4 method plot.
|
7
|
+
#'
|
8
|
+
#' @slot counts \code{(matrix)} Counts as a two-dimensional histogram.
|
9
|
+
#' @slot pos.counts.in \code{(numeric)} Counts of in-group hits per position bin.
|
10
|
+
#' @slot pos.counts.out \code{(numeric)} Counts of out-group hits per position bin.
|
11
|
+
#' @slot id.counts \code{(numeric)} Counts per ID bin.
|
12
|
+
#' @slot id.breaks \code{(numeric)} Breaks of identity bins.
|
13
|
+
#' @slot pos.breaks \code{(numeric)} Breaks of position bins.
|
14
|
+
#' @slot pos.names \code{(character)} Names of the position bins.
|
15
|
+
#' @slot seq.breaks \code{(numeric)} Breaks of input sequences.
|
16
|
+
#' @slot peaks \code{(list)} Peaks identified in the recplot.
|
17
|
+
#' Limits of the subject sequences after concatenation.
|
18
|
+
#' @slot seq.names \code{(character}) Names of the subject sequences.
|
19
|
+
#' @slot id.metric \code{(character}) Metric used as 'identity'.
|
20
|
+
#' @slot id.ingroup \code{(logical}) Identity bins considered in-group.
|
21
|
+
#' @slot call \code{(call)} Call producing this object.
|
22
|
+
#'
|
23
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
24
|
+
#'
|
25
|
+
#' @exportClass
|
26
|
+
|
27
|
+
enve.RecPlot2 <- setClass("enve.RecPlot2",
|
28
|
+
representation(
|
29
|
+
# slots = list(
|
30
|
+
counts='matrix',
|
31
|
+
pos.counts.in='numeric',
|
32
|
+
pos.counts.out='numeric',
|
33
|
+
id.counts='numeric',
|
34
|
+
id.breaks='numeric',
|
35
|
+
pos.breaks='numeric',
|
36
|
+
pos.names='character',
|
37
|
+
seq.breaks='numeric',
|
38
|
+
peaks='list',
|
39
|
+
seq.names='character',
|
40
|
+
id.metric='character',
|
41
|
+
id.ingroup='logical',
|
42
|
+
call='call')
|
43
|
+
,package='enveomics.R'
|
44
|
+
);
|
45
|
+
|
46
|
+
#' Enveomics: Recruitment Plot (2) Peak - S4 Class
|
47
|
+
#'
|
48
|
+
#' Enve-omics representation of a peak in the sequencing depth histogram
|
49
|
+
#' of a Recruitment plot (see \code{\link{enve.recplot2.findPeaks}}).
|
50
|
+
#'
|
51
|
+
#' @slot dist \code{(character)}
|
52
|
+
#' Distribution of the peak. Currently supported: \code{norm} (normal) and \code{sn}
|
53
|
+
#' (skew-normal).
|
54
|
+
#' @slot values \code{(numeric)}
|
55
|
+
#' Sequencing depth values predicted to conform the peak.
|
56
|
+
#' @slot values.res \code{(numeric)}
|
57
|
+
#' Sequencing depth values not explained by this or previously identified
|
58
|
+
#' peaks.
|
59
|
+
#' @slot mode \code{(numeric)}
|
60
|
+
#' Seed-value of mode anchoring the peak.
|
61
|
+
#' @slot param.hat \code{(list)}
|
62
|
+
#' Parameters of the distribution. A list of two values if dist=\code{norm} (sd
|
63
|
+
#' and mean), or three values if dist=\code{sn}(omega=scale, alpha=shape, and
|
64
|
+
#' xi=location). Note that the "dispersion" parameter is always first and
|
65
|
+
#' the "location" parameter is always last.
|
66
|
+
#' @slot n.hat \code{(numeric)}
|
67
|
+
#' Number of bins estimated to be explained by this peak. This should
|
68
|
+
#' ideally be equal to the length of \code{values}, but it's not an integer.
|
69
|
+
#' @slot n.total \code{(numeric)}
|
70
|
+
#' Total number of bins from which the peak was extracted. I.e., total
|
71
|
+
#' number of position bins with non-zero sequencing depth in the recruitment
|
72
|
+
#' plot (regardless of peak count).
|
73
|
+
#' @slot err.res \code{(numeric)}
|
74
|
+
#' Error left after adding the peak (mower) or log-likelihood (em or emauto).
|
75
|
+
#' @slot merge.logdist \code{(numeric)}
|
76
|
+
#' Attempted \code{merge.logdist} parameter.
|
77
|
+
#' @slot seq.depth \code{(numeric)}
|
78
|
+
#' Best estimate available for the sequencing depth of the peak (centrality).
|
79
|
+
#' @slot log \code{(logical)}
|
80
|
+
#' Indicates if the estimation was performed in natural logarithm space.
|
81
|
+
#'
|
82
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
83
|
+
#'
|
84
|
+
#' @exportClass
|
85
|
+
|
86
|
+
enve.RecPlot2.Peak <- setClass("enve.RecPlot2.Peak",
|
87
|
+
representation(
|
88
|
+
# slots = list(
|
89
|
+
dist='character',
|
90
|
+
values='numeric',
|
91
|
+
values.res='numeric',
|
92
|
+
mode='numeric',
|
93
|
+
param.hat='list',
|
94
|
+
n.hat='numeric',
|
95
|
+
n.total='numeric',
|
96
|
+
err.res='numeric',
|
97
|
+
merge.logdist='numeric',
|
98
|
+
seq.depth='numeric',
|
99
|
+
log='logical'
|
100
|
+
));
|
101
|
+
|
102
|
+
#' Attribute accessor
|
103
|
+
#'
|
104
|
+
#'
|
105
|
+
#' @param x Object
|
106
|
+
#' @param name Attribute name
|
107
|
+
setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
|
108
|
+
|
109
|
+
#' Attribute accessor
|
110
|
+
#'
|
111
|
+
#'
|
112
|
+
#' @param x Object
|
113
|
+
#' @param name Attribute name
|
114
|
+
setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
|
115
|
+
|
116
|
+
#==============> Define S4 methods
|
117
|
+
|
118
|
+
#' Enveomics: Recruitment Plot (2)
|
119
|
+
#'
|
120
|
+
#' Plots an \code{\link{enve.RecPlot2}} object.
|
121
|
+
#'
|
122
|
+
#' @param x
|
123
|
+
#' \code{\link{enve.RecPlot2}} object to plot.
|
124
|
+
#' @param layout
|
125
|
+
#' Matrix indicating the position of the different panels in the layout,
|
126
|
+
#' where:
|
127
|
+
#' \itemize{
|
128
|
+
#' \item 0: Empty space
|
129
|
+
#' \item 1: Counts matrix
|
130
|
+
#' \item 2: position histogram (sequencing depth)
|
131
|
+
#' \item 3: identity histogram
|
132
|
+
#' \item 4: Populations histogram (histogram of sequencing depths)
|
133
|
+
#' \item 5: Color scale for the counts matrix (vertical)
|
134
|
+
#' \item 6: Color scale of the counts matrix (horizontal)
|
135
|
+
#' }
|
136
|
+
#' Only panels indicated here will be plotted. To plot only one panel
|
137
|
+
#' simply set this to the number of the panel you want to plot.
|
138
|
+
#' @param panel.fun
|
139
|
+
#' List of functions to be executed after drawing each panel. Use the
|
140
|
+
#' indices in \code{layout} (as characters) as keys. Functions for indices
|
141
|
+
#' missing in \code{layout} are ignored. For example, to add a vertical line
|
142
|
+
#' at the 3Mbp mark in both the position histogram and the counts matrix:
|
143
|
+
#' \code{list('1'=function() abline(v=3), '2'=function() abline(v=3))}.
|
144
|
+
#' Note that the X-axis in both panels is in Mbp by default. To change
|
145
|
+
#' this behavior, set \code{pos.units} accordingly.
|
146
|
+
#' @param widths
|
147
|
+
#' Relative widths of the columns of \code{layout}.
|
148
|
+
#' @param heights
|
149
|
+
#' Relative heights of the rows of \code{layout}.
|
150
|
+
#' @param palette
|
151
|
+
#' Colors to be used to represent the counts matrix, sorted from no hits
|
152
|
+
#' to the maximum sequencing depth.
|
153
|
+
#' @param underlay.group
|
154
|
+
#' If TRUE, it indicates the in-group and out-group areas couloured based
|
155
|
+
#' on \code{in.col} and \code{out.col}. Requires support for semi-transparency.
|
156
|
+
#' @param peaks.col
|
157
|
+
#' If not \code{NA}, it attempts to represent peaks in the population histogram
|
158
|
+
#' in the specified color. Set to \code{NA} to avoid peak-finding.
|
159
|
+
#' @param use.peaks
|
160
|
+
#' A list of \code{\link{enve.RecPlot2.Peak}} objects, as returned by
|
161
|
+
#' \code{\link{enve.recplot2.findPeaks}}. If passed, \code{peaks.opts} is ignored.
|
162
|
+
#' @param id.lim
|
163
|
+
#' Limits of identities to represent.
|
164
|
+
#' @param pos.lim
|
165
|
+
#' Limits of positions to represent (in bp, regardless of \code{pos.units}).
|
166
|
+
#' @param pos.units
|
167
|
+
#' Units in which the positions should be represented (powers of 1,000
|
168
|
+
#' base pairs).
|
169
|
+
#' @param mar
|
170
|
+
#' Margins of the panels as a list, with the character representation of
|
171
|
+
#' the number of the panel as index (see \code{layout}).
|
172
|
+
#' @param pos.splines
|
173
|
+
#' Smoothing parameter for the splines in the position histogram. Zero
|
174
|
+
#' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
|
175
|
+
#' cross-validation.
|
176
|
+
#' @param id.splines
|
177
|
+
#' Smoothing parameter for the splines in the identity histogram. Zero
|
178
|
+
#' (0) for no splines. Use \code{NULL} to automatically detect by leave-one-out
|
179
|
+
#' cross-validation.
|
180
|
+
#' @param in.lwd
|
181
|
+
#' Line width for the sequencing depth of in-group matches.
|
182
|
+
#' @param out.lwd
|
183
|
+
#' Line width for the sequencing depth of out-group matches.
|
184
|
+
#' @param id.lwd
|
185
|
+
#' Line width for the identity histogram.
|
186
|
+
#' @param in.col
|
187
|
+
#' Color associated to in-group matches.
|
188
|
+
#' @param out.col
|
189
|
+
#' Color associated to out-group matches.
|
190
|
+
#' @param id.col
|
191
|
+
#' Color for the identity histogram.
|
192
|
+
#' @param breaks.col
|
193
|
+
#' Color of the vertical lines indicating sequence breaks.
|
194
|
+
#' @param peaks.opts
|
195
|
+
#' Options passed to \code{\link{enve.recplot2.findPeaks}},
|
196
|
+
#' if \code{peaks.col} is not \code{NA}.
|
197
|
+
#' @param ...
|
198
|
+
#' Any other graphic parameters (currently ignored).
|
199
|
+
#'
|
200
|
+
#' @return
|
201
|
+
#' Returns a list of \code{\link{enve.RecPlot2.Peak}} objects (see
|
202
|
+
#' \code{\link{enve.recplot2.findPeaks}}). If \code{peaks.col=NA} or
|
203
|
+
#' \code{layout} doesn't include 4, returns \code{NA}.
|
204
|
+
#'
|
205
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
206
|
+
#'
|
207
|
+
#' @method plot enve.RecPlot2
|
208
|
+
#' @export
|
209
|
+
|
210
|
+
plot.enve.RecPlot2 <- function
|
211
|
+
(x,
|
212
|
+
layout=matrix(c(5,5,2,1,4,3), nrow=2),
|
213
|
+
panel.fun=list(),
|
214
|
+
widths=c(1,7,2),
|
215
|
+
heights=c(1,2),
|
216
|
+
palette=grey((100:0)/100),
|
217
|
+
underlay.group=TRUE,
|
218
|
+
peaks.col='darkred',
|
219
|
+
use.peaks,
|
220
|
+
id.lim=range(x$id.breaks),
|
221
|
+
pos.lim=range(x$pos.breaks),
|
222
|
+
pos.units=c('Mbp','Kbp','bp'),
|
223
|
+
mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
|
224
|
+
'3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
|
225
|
+
'4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
|
226
|
+
'5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
|
227
|
+
pos.splines=0,
|
228
|
+
id.splines=1/2,
|
229
|
+
in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
230
|
+
out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
|
231
|
+
id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
|
232
|
+
in.col='darkblue',
|
233
|
+
out.col='lightblue',
|
234
|
+
id.col='black',
|
235
|
+
breaks.col='#AAAAAA40',
|
236
|
+
peaks.opts=list(),
|
237
|
+
...
|
238
|
+
){
|
239
|
+
pos.units <- match.arg(pos.units);
|
240
|
+
pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
|
241
|
+
pos.lim <- pos.lim/pos.factor;
|
242
|
+
lmat <- layout;
|
243
|
+
for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
|
244
|
+
|
245
|
+
layout(lmat, widths=widths, heights=heights);
|
246
|
+
ori.mar <- par('mar');
|
247
|
+
|
248
|
+
# Essential vars
|
249
|
+
counts <- x$counts
|
250
|
+
|
251
|
+
id.ingroup <- x$id.ingroup
|
252
|
+
id.counts <- x$id.counts
|
253
|
+
id.breaks <- x$id.breaks
|
254
|
+
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
|
255
|
+
id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
|
256
|
+
|
257
|
+
pos.counts.in <- x$pos.counts.in
|
258
|
+
pos.counts.out <- x$pos.counts.out
|
259
|
+
pos.breaks <- x$pos.breaks/pos.factor
|
260
|
+
pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
|
261
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
|
262
|
+
|
263
|
+
seqdepth.in <- pos.counts.in/pos.binsize
|
264
|
+
seqdepth.out <- pos.counts.out/pos.binsize
|
265
|
+
seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
|
266
|
+
seqdepth.out[seqdepth.out>0]))*c(1/2,2)
|
267
|
+
|
268
|
+
if(underlay.group){
|
269
|
+
in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
|
270
|
+
list(maxColorValue=256, alpha=62)));
|
271
|
+
out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
|
272
|
+
list(maxColorValue=256, alpha=52)));
|
273
|
+
}
|
274
|
+
|
275
|
+
# [1] Counts matrix
|
276
|
+
if(any(layout==1)){
|
277
|
+
par(mar=mar[['1']]);
|
278
|
+
plot(1, t='n', bty='l',
|
279
|
+
xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
|
280
|
+
xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
|
281
|
+
if(underlay.group){
|
282
|
+
rect(pos.lim[1], id.lim[1], pos.lim[2],
|
283
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
284
|
+
rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
|
285
|
+
id.lim[2], col=in.bg, border=NA);
|
286
|
+
}
|
287
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col);
|
288
|
+
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
289
|
+
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
290
|
+
length.out=1+length(palette)), add=TRUE);
|
291
|
+
if(exists('1',panel.fun)) panel.fun[['1']]();
|
292
|
+
}
|
293
|
+
|
294
|
+
# [2] Position histogram
|
295
|
+
if(any(layout==2)){
|
296
|
+
par(mar=mar[['2']]);
|
297
|
+
if(any(layout==1)){
|
298
|
+
xlab=''
|
299
|
+
xaxt='n'
|
300
|
+
}else{
|
301
|
+
xlab=paste('Position in genome (',pos.units,')',sep='')
|
302
|
+
xaxt='s'
|
303
|
+
}
|
304
|
+
plot(1,t='n', bty='l', log='y',
|
305
|
+
xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
|
306
|
+
ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
|
307
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col)
|
308
|
+
pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
|
309
|
+
pos.f <- rep(seqdepth.in,each=2)
|
310
|
+
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
311
|
+
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
312
|
+
if(is.null(pos.splines) || pos.splines > 0){
|
313
|
+
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
314
|
+
spar=pos.splines)
|
315
|
+
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
316
|
+
}
|
317
|
+
if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
|
318
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
|
319
|
+
seqdepth.lim[1]*3/2, col=out.col, border=NA);
|
320
|
+
if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
|
321
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
|
322
|
+
seqdepth.lim[1]*3/2, col=in.col, border=NA);
|
323
|
+
if(exists('2',panel.fun)) panel.fun[['2']]();
|
324
|
+
}
|
325
|
+
|
326
|
+
# [3] Identity histogram
|
327
|
+
if(any(layout==3)){
|
328
|
+
par(mar=mar[['3']]);
|
329
|
+
if(any(layout==1)){
|
330
|
+
ylab=''
|
331
|
+
yaxt='n'
|
332
|
+
}else{
|
333
|
+
ylab=x$id.metric
|
334
|
+
yaxt='s'
|
335
|
+
}
|
336
|
+
if(sum(id.counts>0) >= 4){
|
337
|
+
id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
|
338
|
+
plot(1,t='n', bty='l', log='x',
|
339
|
+
xlim=id.counts.range, xlab='bps per bin', xaxs='i',
|
340
|
+
ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
|
341
|
+
if(underlay.group){
|
342
|
+
rect(id.counts.range[1], id.lim[1], id.counts.range[2],
|
343
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
344
|
+
rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
|
345
|
+
id.counts.range[2], id.lim[2], col=in.bg, border=NA);
|
346
|
+
}
|
347
|
+
id.f <- rep(id.counts,each=2)
|
348
|
+
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
349
|
+
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
350
|
+
if(is.null(id.splines) || id.splines > 0){
|
351
|
+
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
352
|
+
spar=id.splines)
|
353
|
+
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
354
|
+
}
|
355
|
+
}else{
|
356
|
+
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
357
|
+
text(1,1,labels='Insufficient data', srt=90)
|
358
|
+
}
|
359
|
+
if(exists('3',panel.fun)) panel.fun[['3']]();
|
360
|
+
}
|
361
|
+
|
362
|
+
# [4] Populations histogram
|
363
|
+
peaks <- NA;
|
364
|
+
if(any(layout==4)){
|
365
|
+
par(mar=mar[['4']]);
|
366
|
+
if(any(layout==2)){
|
367
|
+
ylab=''
|
368
|
+
yaxt='n'
|
369
|
+
}else{
|
370
|
+
ylab='Sequencing depth (X)'
|
371
|
+
yaxt='s'
|
372
|
+
}
|
373
|
+
h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
|
374
|
+
length.out=200);
|
375
|
+
h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
|
376
|
+
h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
|
377
|
+
plot(1, t='n', log='y',
|
378
|
+
xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
|
379
|
+
xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
|
380
|
+
yaxt=yaxt)
|
381
|
+
y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
382
|
+
y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
383
|
+
lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
|
384
|
+
y.tmp.out, col=out.col)
|
385
|
+
polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
|
386
|
+
y.tmp.in, border=NA, col=in.col)
|
387
|
+
if(!is.na(peaks.col)){
|
388
|
+
o <- peaks.opts; o$x = x;
|
389
|
+
if(missing(use.peaks)){
|
390
|
+
peaks <- do.call(enve.recplot2.findPeaks, o)
|
391
|
+
}else{
|
392
|
+
peaks <- use.peaks
|
393
|
+
}
|
394
|
+
h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
|
395
|
+
if(!is.null(peaks) & length(peaks)>0){
|
396
|
+
pf <- h.mids*0;
|
397
|
+
for(i in 1:length(peaks)){
|
398
|
+
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
399
|
+
lines(cnt, h.mids, col='red');
|
400
|
+
pf <- pf+cnt;
|
401
|
+
axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
|
402
|
+
}
|
403
|
+
lines(pf, h.mids, col='red',lwd=1.5);
|
404
|
+
dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
|
405
|
+
frx <- signif(100*as.numeric(
|
406
|
+
lapply(peaks,
|
407
|
+
function(x) ifelse(length(x$values)==0, x$n.hat,
|
408
|
+
length(x$values))/x$n.total)), 2)
|
409
|
+
if(peaks[[1]]$err.res < 0){
|
410
|
+
err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
|
411
|
+
}else{
|
412
|
+
err <- paste(', err:',
|
413
|
+
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
|
414
|
+
}
|
415
|
+
legend('topright', bty='n', cex=1/2,
|
416
|
+
legend=paste(letters[1:length(peaks)],'. ',
|
417
|
+
dpt,'X (', frx, '%', err, ')', sep=''))
|
418
|
+
}
|
419
|
+
}
|
420
|
+
if(exists('4',panel.fun)) panel.fun[['4']]();
|
421
|
+
}
|
422
|
+
|
423
|
+
# [5] Color scale of the counts matrix (vertical)
|
424
|
+
count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
|
425
|
+
length.out=1+length(palette))
|
426
|
+
if(any(layout==5)){
|
427
|
+
par(mar=mar[['5']]);
|
428
|
+
plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
|
429
|
+
ylim=range(count.bins), yaxs='i', ylab='')
|
430
|
+
rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
|
431
|
+
border=NA)
|
432
|
+
if(exists('5',panel.fun)) panel.fun[['5']]();
|
433
|
+
}
|
434
|
+
|
435
|
+
# [6] Color scale of the coutnts matrix (horizontal)
|
436
|
+
if(any(layout==6)){
|
437
|
+
par(mar=mar[['6']]);
|
438
|
+
plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
|
439
|
+
xlim=range(count.bins), xaxs='i',xlab='');
|
440
|
+
rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
|
441
|
+
border=NA);
|
442
|
+
if(exists('6',panel.fun)) panel.fun[['6']]();
|
443
|
+
}
|
444
|
+
|
445
|
+
par(mar=ori.mar);
|
446
|
+
return(peaks);
|
447
|
+
}
|
448
|
+
|
449
|
+
#==============> Define core functions
|
450
|
+
|
451
|
+
#' Enveomics: Recruitment Plot (2)
|
452
|
+
#'
|
453
|
+
#' Produces recruitment plots provided that \code{BlastTab.catsbj.pl} has
|
454
|
+
#' been previously executed.
|
455
|
+
#'
|
456
|
+
#' @param prefix
|
457
|
+
#' Path to the prefix of the \code{BlastTab.catsbj.pl} output files. At
|
458
|
+
#' least the files .rec and .lim must exist with this prefix.
|
459
|
+
#' @param plot
|
460
|
+
#' Should the object be plotted?
|
461
|
+
#' @param pos.breaks
|
462
|
+
#' Breaks in the positions histogram. It can also be a vector of break
|
463
|
+
#' points, and values outside the range are ignored. If zero (0), it
|
464
|
+
#' uses the sequence breaks as defined in the .lim file, which means
|
465
|
+
#' one bin per contig (or gene, if the mapping is agains genes). Ignored
|
466
|
+
#' if `pos.breaks.tsv` is passed.
|
467
|
+
#' @param pos.breaks.tsv
|
468
|
+
#' Path to a list of (absolute) coordinates to use as position breaks.
|
469
|
+
#' This tab-delimited file can be produced by \code{GFF.catsbj.pl}, and it
|
470
|
+
#' must contain at least one column: coordinates of the break positions of
|
471
|
+
#' each position bin. If it has a second column, this is used as the name
|
472
|
+
#' of the position bin that ends at the given coordinate (the first row is
|
473
|
+
#' ignored). Any additional columns are currently ignored. If \code{NA},
|
474
|
+
#' position bins are determined by \code{pos.breaks}.
|
475
|
+
#' @param id.breaks
|
476
|
+
#' Breaks in the identity histogram. It can also be a vector of break
|
477
|
+
#' points, and values outside the range are ignored.
|
478
|
+
#' @param id.free.range
|
479
|
+
#' Indicates that the range should be freely set from the observed
|
480
|
+
#' values. Otherwise, 70-100\% is included in the identity histogram
|
481
|
+
#' (default).
|
482
|
+
#' @param id.metric
|
483
|
+
#' Metric of identity to be used (Y-axis). Corrected identity is only
|
484
|
+
#' supported if the original BLAST file included sequence lengths.
|
485
|
+
#' @param id.summary
|
486
|
+
#' Function summarizing the identity bins. Other recommended options
|
487
|
+
#' include: \code{median} to estimate the median instead of total bins, and
|
488
|
+
#' \code{function(x) mlv(x,method='parzen')$M} to estimate the mode.
|
489
|
+
#' @param id.cutoff
|
490
|
+
#' Cutoff of identity metric above which the hits are considered
|
491
|
+
#' \code{in-group}. The 95\% identity corresponds to the expectation of
|
492
|
+
#' ANI<95\% within species.
|
493
|
+
#' @param threads
|
494
|
+
#' Number of threads to use.
|
495
|
+
#' @param verbose
|
496
|
+
#' Indicates if the function should report the advance.
|
497
|
+
#' @param ...
|
498
|
+
#' Any additional parameters supported by \code{\link{plot.enve.RecPlot2}}.
|
499
|
+
#'
|
500
|
+
#' @return Returns an object of class \code{\link{enve.RecPlot2}}.
|
501
|
+
#'
|
502
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
503
|
+
#' @author Kenji Gerhardt [aut]
|
504
|
+
#'
|
505
|
+
#' @export
|
506
|
+
|
507
|
+
enve.recplot2 <- function(
|
508
|
+
prefix,
|
509
|
+
plot = TRUE,
|
510
|
+
pos.breaks = 1e3,
|
511
|
+
pos.breaks.tsv = NA,
|
512
|
+
id.breaks = 60,
|
513
|
+
id.free.range = FALSE,
|
514
|
+
id.metric = c('identity', 'corrected identity', 'bit score'),
|
515
|
+
id.summary = sum,
|
516
|
+
id.cutoff = 95,
|
517
|
+
threads = 2,
|
518
|
+
verbose = TRUE,
|
519
|
+
...
|
520
|
+
){
|
521
|
+
# Settings
|
522
|
+
id.metric <- match.arg(id.metric);
|
523
|
+
|
524
|
+
#Read files
|
525
|
+
if (verbose) cat("Reading files.\n")
|
526
|
+
rec <- read.table(paste(prefix, ".rec", sep = ""),
|
527
|
+
sep = "\t", comment.char = "", quote = "");
|
528
|
+
lim <- read.table(paste(prefix, ".lim", sep = ""),
|
529
|
+
sep = "\t", comment.char = "", quote = "", as.is = TRUE);
|
530
|
+
|
531
|
+
# Build matrix
|
532
|
+
if (verbose) cat("Building counts matrix.\n")
|
533
|
+
if (id.metric == "corrected identity" & ncol(rec) < 6) {
|
534
|
+
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
535
|
+
}
|
536
|
+
rec.idcol <- ifelse(id.metric == "identity", 3,
|
537
|
+
ifelse(id.metric == "corrected identity", 6, 4))
|
538
|
+
pos.names <- as.character(NULL)
|
539
|
+
if (!is.na(pos.breaks.tsv)){
|
540
|
+
tmp <- read.table(pos.breaks.tsv, sep = "\t", header = FALSE, as.is = TRUE)
|
541
|
+
pos.breaks <- as.numeric(tmp[, 1])
|
542
|
+
if (ncol(tmp) > 1) pos.names <- as.character(tmp[-1, 2])
|
543
|
+
} else if (length(pos.breaks) == 1) {
|
544
|
+
if (pos.breaks > 0){
|
545
|
+
pos.breaks <- seq(min(lim[, 2]), max(lim[, 3]), length.out = pos.breaks + 1)
|
546
|
+
} else {
|
547
|
+
pos.breaks <- c(lim[1, 2], lim[, 3])
|
548
|
+
pos.names <- lim[, 1]
|
549
|
+
}
|
550
|
+
}
|
551
|
+
if (length(id.breaks) == 1) {
|
552
|
+
id.range.v <- rec[, rec.idcol]
|
553
|
+
if (!id.free.range) id.range.v <- c(id.range.v, 70, 100)
|
554
|
+
id.range.v <- range(id.range.v)
|
555
|
+
id.breaks <- seq(id.range.v[1], id.range.v[2], length.out = id.breaks + 1)
|
556
|
+
}
|
557
|
+
|
558
|
+
# Run in parallel
|
559
|
+
# If they already set threads to 1 manually, there's no point in launching
|
560
|
+
# clusters, it's just slower. Ditto for small files.
|
561
|
+
if (nrow(rec) < 75000 | threads == 1) {
|
562
|
+
# Coerces rec into a form that __counts is happy about
|
563
|
+
rec.l <- list()
|
564
|
+
rec.l[[1]] <- list(rec = rec, verbose = FALSE)
|
565
|
+
|
566
|
+
# No need to make a temporary variable, there's only one return for sure
|
567
|
+
# and it's not a list because it isn't coming back from an apply
|
568
|
+
counts <- enve.recplot2.__counts(
|
569
|
+
rec.l[[1]], pos.breaks = pos.breaks, id.breaks = id.breaks,
|
570
|
+
rec.idcol = rec.idcol)
|
571
|
+
} else {
|
572
|
+
cl <- makeCluster(threads)
|
573
|
+
rec.l <- list()
|
574
|
+
thl <- ceiling(nrow(rec)/threads)
|
575
|
+
for (i in 0:(threads - 1)) {
|
576
|
+
rec.l[[i + 1]] <- list(
|
577
|
+
rec = rec[(i * thl + 1):min(((i + 1) * thl), nrow(rec)), ],
|
578
|
+
verbose = ifelse(i == 0, verbose, FALSE))
|
579
|
+
}
|
580
|
+
counts.l <- clusterApply(
|
581
|
+
cl, rec.l, enve.recplot2.__counts, pos.breaks = pos.breaks,
|
582
|
+
id.breaks = id.breaks, rec.idcol = rec.idcol)
|
583
|
+
stopCluster(cl) # No spooky ghost clusters
|
584
|
+
|
585
|
+
counts <- counts.l[[1]]
|
586
|
+
for (i in 2:threads) counts <- counts + counts.l[[i]]
|
587
|
+
}
|
588
|
+
|
589
|
+
# Estimate 1D histograms
|
590
|
+
if (verbose) cat("Building histograms.\n")
|
591
|
+
id.mids <- (id.breaks[-length(id.breaks)] + id.breaks[-1])/2;
|
592
|
+
id.ingroup <- (id.mids > id.cutoff);
|
593
|
+
id.counts <- apply(counts, 2, id.summary);
|
594
|
+
pos.counts.in <- apply(counts[, id.ingroup], 1, sum);
|
595
|
+
pos.counts.out <- apply(counts[, !id.ingroup], 1, sum);
|
596
|
+
|
597
|
+
# Plot and return
|
598
|
+
recplot <- new('enve.RecPlot2',
|
599
|
+
counts = counts, id.counts = id.counts,
|
600
|
+
pos.counts.in = pos.counts.in, pos.counts.out = pos.counts.out,
|
601
|
+
id.breaks = id.breaks, pos.breaks = pos.breaks,
|
602
|
+
pos.names = pos.names, seq.breaks = c(lim[1, 2], lim[, 3]),
|
603
|
+
seq.names = lim[, 1], id.ingroup = id.ingroup,
|
604
|
+
id.metric = id.metric, call = match.call());
|
605
|
+
if (plot) {
|
606
|
+
if (verbose) cat("Plotting.\n")
|
607
|
+
peaks <- plot(recplot, ...);
|
608
|
+
attr(recplot, "peaks") <- peaks
|
609
|
+
}
|
610
|
+
return(recplot);
|
611
|
+
}
|
612
|
+
|
613
|
+
#' Enveomics: Recruitment Plot (2) Peak Finder
|
614
|
+
#'
|
615
|
+
#' Identifies peaks in the population histogram potentially indicating
|
616
|
+
#' sub-population mixtures.
|
617
|
+
#'
|
618
|
+
#' @param x
|
619
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
620
|
+
#' @param method
|
621
|
+
#' Peak-finder method. This should be one of:
|
622
|
+
#' \itemize{
|
623
|
+
#' \item \strong{emauto}
|
624
|
+
#' (Expectation-Maximization with auto-selection of components)
|
625
|
+
#' \item \strong{em}
|
626
|
+
#' (Expectation-Maximization)
|
627
|
+
#' \item \strong{mower}
|
628
|
+
#' (Custom distribution-mowing method)
|
629
|
+
#' }
|
630
|
+
#' @param ...
|
631
|
+
#' Any additional parameters supported by
|
632
|
+
#' \code{\link{enve.recplot2.findPeaks}}.
|
633
|
+
#'
|
634
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
635
|
+
#'
|
636
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
637
|
+
#'
|
638
|
+
#' export
|
639
|
+
|
640
|
+
enve.recplot2.findPeaks <- function(
|
641
|
+
x,
|
642
|
+
method="emauto",
|
643
|
+
...
|
644
|
+
){
|
645
|
+
if(method == "emauto"){
|
646
|
+
peaks <- enve.recplot2.findPeaks.emauto(x, ...)
|
647
|
+
}else if(method == "em"){
|
648
|
+
peaks <- enve.recplot2.findPeaks.em(x, ...)
|
649
|
+
}else if(method == "mower"){
|
650
|
+
peaks <- enve.recplot2.findPeaks.mower(x, ...)
|
651
|
+
}else{
|
652
|
+
stop("Invalid peak-finder method ", method)
|
653
|
+
}
|
654
|
+
return(peaks)
|
655
|
+
}
|
656
|
+
|
657
|
+
#' Enveomics: Recruitment Plot (2) Emauto Peak Finder
|
658
|
+
#'
|
659
|
+
#' Identifies peaks in the population histogram using a Gaussian Mixture
|
660
|
+
#' Model Expectation Maximization (GMM-EM) method with number of components
|
661
|
+
#' automatically detected.
|
662
|
+
#'
|
663
|
+
#' @param x
|
664
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
665
|
+
#' @param components
|
666
|
+
#' A vector of number of components to evaluate.
|
667
|
+
#' @param criterion
|
668
|
+
#' Criterion to use for components selection. Must be one of:
|
669
|
+
#' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
|
670
|
+
#' (Bayesian Information Criterion or Schwarz Criterion).
|
671
|
+
#' @param merge.tol
|
672
|
+
#' When attempting to merge peaks with very similar sequencing depth, use
|
673
|
+
#' this number of significant digits (in log-scale).
|
674
|
+
#' @param verbose
|
675
|
+
#' Display (mostly debugging) information.
|
676
|
+
#' @param ...
|
677
|
+
#' Any additional parameters supported by
|
678
|
+
#' \code{\link{enve.recplot2.findPeaks.em}}.
|
679
|
+
#'
|
680
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
681
|
+
#'
|
682
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
683
|
+
#'
|
684
|
+
#' @export
|
685
|
+
|
686
|
+
enve.recplot2.findPeaks.emauto <- function(
|
687
|
+
x,
|
688
|
+
components = seq(1, 5),
|
689
|
+
criterion = 'aic',
|
690
|
+
merge.tol = 2L,
|
691
|
+
verbose = FALSE,
|
692
|
+
...
|
693
|
+
){
|
694
|
+
best <- list(crit=0, pstore=list())
|
695
|
+
if(criterion == 'aic'){
|
696
|
+
do_crit <- function(ll, k, n) 2*k - 2*ll
|
697
|
+
}else if(criterion %in% c('bic', 'sbc')){
|
698
|
+
do_crit <- function(ll, k, n) log(n)*k - 2*ll
|
699
|
+
}else{
|
700
|
+
stop('Invalid criterion ', criterion)
|
701
|
+
}
|
702
|
+
for(comp in components){
|
703
|
+
if(verbose) cat('Testing:',comp,'\n')
|
704
|
+
best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
|
705
|
+
verbose, ...)
|
706
|
+
}
|
707
|
+
if(length(best[['peaks']])==0) return(list())
|
708
|
+
|
709
|
+
seqdepths.r <- signif(log(sapply(best[['peaks']],
|
710
|
+
function(x) x$seq.depth)), merge.tol)
|
711
|
+
distinct <- length(unique(seqdepths.r))
|
712
|
+
if(distinct < length(best[['peaks']])){
|
713
|
+
if(verbose) cat('Attempting merge to', distinct, 'components\n')
|
714
|
+
init <- apply(sapply(best[['peaks']],
|
715
|
+
function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
|
716
|
+
init <- init[!duplicated(seqdepths.r),]
|
717
|
+
init <- list(mu=init[,'mean'], sd=init[,'sd'],
|
718
|
+
alpha=init[,'alpha']/sum(init[,'alpha']))
|
719
|
+
best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
|
720
|
+
verbose, ...)
|
721
|
+
}
|
722
|
+
return(best[['peaks']])
|
723
|
+
}
|
724
|
+
|
725
|
+
#' Enveomics: Recruitment Plot (2) Em Peak Finder
|
726
|
+
#'
|
727
|
+
#' Identifies peaks in the population histogram using a Gaussian Mixture
|
728
|
+
#' Model Expectation Maximization (GMM-EM) method.
|
729
|
+
#'
|
730
|
+
#' @param x
|
731
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
732
|
+
#' @param max.iter
|
733
|
+
#' Maximum number of EM iterations.
|
734
|
+
#' @param ll.diff.res
|
735
|
+
#' Maximum Log-Likelihood difference to be considered as convergent.
|
736
|
+
#' @param components
|
737
|
+
#' Number of distributions assumed in the mixture.
|
738
|
+
#' @param rm.top
|
739
|
+
#' Top-values to remove before finding peaks, as a quantile probability.
|
740
|
+
#' This step is useful to remove highly conserved regions, but can be
|
741
|
+
#' turned off by setting \code{rm.top=0}. The quantile is determined
|
742
|
+
#' \strong{after} removing zero-coverage windows.
|
743
|
+
#' @param verbose
|
744
|
+
#' Display (mostly debugging) information.
|
745
|
+
#' @param init
|
746
|
+
#' Initialization parameters. By default, these are derived from k-means
|
747
|
+
#' clustering. A named list with vectors for \code{mu}, \code{sd}, and
|
748
|
+
#' \code{alpha}, each of length \code{components}.
|
749
|
+
#' @param log
|
750
|
+
#' Logical value indicating if the estimations should be performed in
|
751
|
+
#' natural logarithm units. Do not change unless you know what you're
|
752
|
+
#' doing.
|
753
|
+
#'
|
754
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
755
|
+
#'
|
756
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
757
|
+
#'
|
758
|
+
#' @export
|
759
|
+
|
760
|
+
enve.recplot2.findPeaks.em <- function(
|
761
|
+
x,
|
762
|
+
max.iter = 1000,
|
763
|
+
ll.diff.res = 1e-8,
|
764
|
+
components = 2,
|
765
|
+
rm.top = 0.05,
|
766
|
+
verbose = FALSE,
|
767
|
+
init,
|
768
|
+
log = TRUE
|
769
|
+
){
|
770
|
+
|
771
|
+
# Essential vars
|
772
|
+
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
|
773
|
+
lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
|
774
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
|
775
|
+
if(log) lsd1 <- log(lsd1)
|
776
|
+
|
777
|
+
# 1. Initialize
|
778
|
+
if(missing(init)){
|
779
|
+
km.clust <- kmeans(lsd1, components)$cluster
|
780
|
+
init <- list(
|
781
|
+
mu = tapply(lsd1, km.clust, mean),
|
782
|
+
sd = tapply(lsd1, km.clust, sd),
|
783
|
+
alpha = table(km.clust) / length(km.clust)
|
784
|
+
)
|
785
|
+
}
|
786
|
+
m.step <- init
|
787
|
+
ll <- c()
|
788
|
+
cur.ll <- -Inf
|
789
|
+
|
790
|
+
for(i in 1:max.iter){
|
791
|
+
# 2/3. EM
|
792
|
+
e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
|
793
|
+
m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
|
794
|
+
# 4. Convergence
|
795
|
+
ll <- c(ll, e.step[["ll"]])
|
796
|
+
ll.diff <- abs(cur.ll - e.step[["ll"]])
|
797
|
+
cur.ll <- e.step[["ll"]]
|
798
|
+
if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
|
799
|
+
if(is.na(ll.diff) || ll.diff == Inf) break
|
800
|
+
if(ll.diff <= ll.diff.res) break
|
801
|
+
}
|
802
|
+
|
803
|
+
# Return
|
804
|
+
peaks <- list()
|
805
|
+
for(i in 1:components){
|
806
|
+
n.hat <- m.step[['alpha']][i]*length(lsd1)
|
807
|
+
peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
|
808
|
+
values.res=0, mode=m.step[['mu']][i],
|
809
|
+
param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
|
810
|
+
n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
|
811
|
+
merge.logdist=as.numeric(), log=log,
|
812
|
+
seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
|
813
|
+
}
|
814
|
+
return(peaks)
|
815
|
+
}
|
816
|
+
|
817
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder
|
818
|
+
#'
|
819
|
+
#' Identifies peaks in the population histogram potentially indicating
|
820
|
+
#' sub-population mixtures, using a custom distribution-mowing method.
|
821
|
+
#'
|
822
|
+
#' @param x
|
823
|
+
#' An \code{\link{enve.RecPlot2}} object.
|
824
|
+
#' @param min.points
|
825
|
+
#' Minimum number of points in the quantile-estimation-range
|
826
|
+
#' \code{(quant.est)} to estimate a peak.
|
827
|
+
#' @param quant.est
|
828
|
+
#' Range of quantiles to be used in the estimation of a peak's
|
829
|
+
#' parameters.
|
830
|
+
#' @param mlv.opts
|
831
|
+
#' Ignored. For backwards compatibility.
|
832
|
+
#' @param fitdist.opts.sn
|
833
|
+
#' Options passed to \code{fitdist} to estimate the standard deviation if
|
834
|
+
#' \code{with.skewness=TRUE}. Note that the \code{start} parameter will be
|
835
|
+
#' ammended with \code{xi=estimated} mode for each peak.
|
836
|
+
#' @param fitdist.opts.norm
|
837
|
+
#' Options passed to \code{fitdist} to estimate the standard deviation if
|
838
|
+
#' \code{with.skewness=FALSE}. Note that the \code{start} parameter will be
|
839
|
+
#' ammended with \code{mean=estimated} mode for each peak.
|
840
|
+
#' @param rm.top
|
841
|
+
#' Top-values to remove before finding peaks, as a quantile probability.
|
842
|
+
#' This step is useful to remove highly conserved regions, but can be
|
843
|
+
#' turned off by setting \code{rm.top=0}. The quantile is determined
|
844
|
+
#' \strong{after} removing zero-coverage windows.
|
845
|
+
#' @param with.skewness
|
846
|
+
#' Allow skewness correction of the peaks. Typically, the
|
847
|
+
#' sequencing-depth distribution for a single peak is left-skewed, due
|
848
|
+
#' partly (but not exclusively) to fragmentation and mapping sensitivity.
|
849
|
+
#' See \emph{Lindner et al 2013, Bioinformatics 29(10):1260-7} for an
|
850
|
+
#' alternative solution for the first problem (fragmentation) called
|
851
|
+
#' "tail distribution".
|
852
|
+
#' @param optim.rounds
|
853
|
+
#' Maximum rounds of peak optimization.
|
854
|
+
#' @param optim.epsilon
|
855
|
+
#' Trace change at which optimization stops (unless \code{optim.rounds} is
|
856
|
+
#' reached first). The trace change is estimated as the sum of square
|
857
|
+
#' differences between parameters in one round and those from two rounds
|
858
|
+
#' earlier (to avoid infinite loops from approximation).
|
859
|
+
#' @param merge.logdist
|
860
|
+
#' Maximum value of \code{|log-ratio|} between centrality parameters in peaks
|
861
|
+
#' to attempt merging. The default of ~0.22 corresponds to a maximum
|
862
|
+
#' difference of 25\%.
|
863
|
+
#' @param verbose
|
864
|
+
#' Display (mostly debugging) information.
|
865
|
+
#' @param log
|
866
|
+
#' Logical value indicating if the estimations should be performed in
|
867
|
+
#' natural logarithm units. Do not change unless you know what you're
|
868
|
+
#' doing.
|
869
|
+
#'
|
870
|
+
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
871
|
+
#'
|
872
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
873
|
+
#'
|
874
|
+
#' @export
|
875
|
+
|
876
|
+
enve.recplot2.findPeaks.mower <- function(
|
877
|
+
x,
|
878
|
+
min.points=10,
|
879
|
+
quant.est=c(0.002, 0.998),
|
880
|
+
mlv.opts=list(method='parzen'),
|
881
|
+
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
882
|
+
start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
|
883
|
+
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
|
884
|
+
start=list(sd=1), lower=c(0, -Inf)),
|
885
|
+
rm.top=0.05,
|
886
|
+
with.skewness=TRUE,
|
887
|
+
optim.rounds=200,
|
888
|
+
optim.epsilon=1e-4,
|
889
|
+
merge.logdist=log(1.75),
|
890
|
+
verbose=FALSE,
|
891
|
+
log=TRUE
|
892
|
+
){
|
893
|
+
|
894
|
+
# Essential vars
|
895
|
+
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
|
896
|
+
seqdepth.in <- x$pos.counts.in/pos.binsize;
|
897
|
+
lsd1 <- seqdepth.in[seqdepth.in>0];
|
898
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
899
|
+
if(log) lsd1 <- log(lsd1)
|
900
|
+
if(with.skewness){
|
901
|
+
fitdist.opts <- fitdist.opts.sn
|
902
|
+
}else{
|
903
|
+
fitdist.opts <- fitdist.opts.norm
|
904
|
+
}
|
905
|
+
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
906
|
+
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
907
|
+
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
908
|
+
n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
|
909
|
+
|
910
|
+
# Find seed peaks
|
911
|
+
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
912
|
+
peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
|
913
|
+
|
914
|
+
# Merge overlapping peaks
|
915
|
+
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
916
|
+
merged <- (length(peaks)>1)
|
917
|
+
while(merged){
|
918
|
+
merged <- FALSE
|
919
|
+
ignore <- c()
|
920
|
+
peaks2 <- list();
|
921
|
+
for(i in 1:length(peaks)){
|
922
|
+
if(i %in% ignore) next
|
923
|
+
p <- peaks[[ i ]]
|
924
|
+
j <- enve.recplot2.__whichClosestPeak(p, peaks)
|
925
|
+
p2 <- peaks[[ j ]]
|
926
|
+
dst.a <- p$param.hat[[ length(p$param.hat) ]]
|
927
|
+
dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
|
928
|
+
if( abs(log(dst.a/dst.b)) < merge.logdist ){
|
929
|
+
if(verbose) cat('==> Attempting a merge at',
|
930
|
+
p$param.hat[[ length(p$param.hat) ]],'&',
|
931
|
+
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
932
|
+
peaks.opts$lsd1 <- c(p$values, p2$values)
|
933
|
+
p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
|
934
|
+
if(length(p.new)==1){
|
935
|
+
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
936
|
+
ignore <- c(ignore, j)
|
937
|
+
merged <- TRUE
|
938
|
+
}
|
939
|
+
}
|
940
|
+
if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
|
941
|
+
}
|
942
|
+
peaks <- peaks2
|
943
|
+
if(length(peaks)==1) break
|
944
|
+
}
|
945
|
+
|
946
|
+
if(verbose) cat('Found',length(peaks),'peak(s)\n')
|
947
|
+
return(peaks);
|
948
|
+
}
|
949
|
+
|
950
|
+
#==============> Define utils
|
951
|
+
|
952
|
+
#' Enveomics: Recruitment Plot (2) Core Peak Finder
|
953
|
+
#'
|
954
|
+
#' Finds the peak in a list of peaks that is most likely to represent the
|
955
|
+
#' "core genome" of a population.
|
956
|
+
#'
|
957
|
+
#' @param x \code{list} of \code{\link{enve.RecPlot2.Peak}} objects.
|
958
|
+
#'
|
959
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
960
|
+
#'
|
961
|
+
#' @export
|
962
|
+
|
963
|
+
enve.recplot2.corePeak <- function
|
964
|
+
(x
|
965
|
+
){
|
966
|
+
# Find the peak with maximum depth (centrality)
|
967
|
+
maxPeak <- x[[
|
968
|
+
which.max(as.numeric(lapply(x,
|
969
|
+
function(y) y$param.hat[[ length(y$param.hat) ]])))
|
970
|
+
]]
|
971
|
+
# If a "larger" peak (a peak explaining more bins of the genome) is within
|
972
|
+
# the default "merge.logdist" distance, take that one instead.
|
973
|
+
corePeak <- maxPeak
|
974
|
+
for(p in x){
|
975
|
+
p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
|
976
|
+
corePeak.len <- ifelse(
|
977
|
+
length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
|
978
|
+
sz.d <- log(p.len/corePeak.len)
|
979
|
+
if(is.nan(sz.d) || sz.d < 0) next
|
980
|
+
sq.d.a <- as.numeric(tail(p$param.hat, n=1))
|
981
|
+
sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
|
982
|
+
if(p$log) sq.d.a <- exp(sq.d.a)
|
983
|
+
if(corePeak$log) sq.d.b <- exp(sq.d.b)
|
984
|
+
if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
|
985
|
+
}
|
986
|
+
return(corePeak)
|
987
|
+
}
|
988
|
+
|
989
|
+
#' Enveomics: Recruitment Plot (2) Change Cutoff
|
990
|
+
#'
|
991
|
+
#' Change the intra-species cutoff of an existing recruitment plot.
|
992
|
+
#'
|
993
|
+
#' @param rp
|
994
|
+
#' \code{\link{enve.RecPlot2}} object.
|
995
|
+
#' @param new.cutoff
|
996
|
+
#' New cutoff to use.
|
997
|
+
#'
|
998
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
999
|
+
#'
|
1000
|
+
#' @export
|
1001
|
+
|
1002
|
+
enve.recplot2.changeCutoff <- function
|
1003
|
+
(rp,
|
1004
|
+
new.cutoff=98
|
1005
|
+
){
|
1006
|
+
# Re-calculate vectors
|
1007
|
+
id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
|
1008
|
+
id.ingroup <- (id.mids > new.cutoff)
|
1009
|
+
pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
|
1010
|
+
pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
|
1011
|
+
# Update object
|
1012
|
+
attr(rp, "id.ingroup") <- id.ingroup
|
1013
|
+
attr(rp, "pos.counts.in") <- pos.counts.in
|
1014
|
+
attr(rp, "pos.counts.out") <- pos.counts.out
|
1015
|
+
attr(rp, "call") <- match.call()
|
1016
|
+
return(rp)
|
1017
|
+
}
|
1018
|
+
|
1019
|
+
#' Enveomics: Recruitment Plot (2) Window Depth Threshold
|
1020
|
+
#'
|
1021
|
+
#' Identifies the threshold below which windows should be identified as
|
1022
|
+
#' variable or absent.
|
1023
|
+
#'
|
1024
|
+
#' @param rp
|
1025
|
+
#' Recruitment plot, an \code{\link{enve.RecPlot2}} object.
|
1026
|
+
#' @param peak
|
1027
|
+
#' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
|
1028
|
+
#' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
|
1029
|
+
#' used (see \code{\link{enve.recplot2.corePeak}}).
|
1030
|
+
#' @param lower.tail
|
1031
|
+
#' If \code{FALSE}, it returns windows significantly above the peak in
|
1032
|
+
#' sequencing depth.
|
1033
|
+
#' @param significance
|
1034
|
+
#' Significance threshold (alpha) to select windows.
|
1035
|
+
#'
|
1036
|
+
#' @return
|
1037
|
+
#' Returns a float. The units are depth if the peaks were estimated in
|
1038
|
+
#' linear scale, or log-depth otherwise (\code{peak$log}).
|
1039
|
+
#'
|
1040
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1041
|
+
#'
|
1042
|
+
#' @export
|
1043
|
+
|
1044
|
+
enve.recplot2.windowDepthThreshold <- function
|
1045
|
+
(rp,
|
1046
|
+
peak,
|
1047
|
+
lower.tail=TRUE,
|
1048
|
+
significance=0.05
|
1049
|
+
){
|
1050
|
+
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
1051
|
+
par <- peak$param.hat
|
1052
|
+
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
1053
|
+
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
1054
|
+
if(peak$log) thr <- exp(thr)
|
1055
|
+
|
1056
|
+
return(thr)
|
1057
|
+
}
|
1058
|
+
|
1059
|
+
#' Enveomics: Recruitment Plot (2) Extract Windows
|
1060
|
+
#'
|
1061
|
+
#' Extract windows significantly below (or above) the peak in sequencing
|
1062
|
+
#' depth.
|
1063
|
+
#'
|
1064
|
+
#' @param rp
|
1065
|
+
#' Recruitment plot, a \code{\link{enve.RecPlot2}} object.
|
1066
|
+
#' @param peak
|
1067
|
+
#' Peak, an \code{\link{enve.RecPlot2.Peak}} object. If list, it is assumed to be a
|
1068
|
+
#' list of \code{\link{enve.RecPlot2.Peak}} objects, in which case the core peak is
|
1069
|
+
#' used (see \code{\link{enve.recplot2.corePeak}}).
|
1070
|
+
#' @param lower.tail
|
1071
|
+
#' If \code{FALSE}, it returns windows significantly above the peak in
|
1072
|
+
#' sequencing depth.
|
1073
|
+
#' @param significance
|
1074
|
+
#' Significance threshold (alpha) to select windows.
|
1075
|
+
#' @param seq.names
|
1076
|
+
#' Returns subject sequence names instead of a vector of Booleans. If
|
1077
|
+
#' the recruitment plot was generated with named position bins (e.g, using
|
1078
|
+
#' \code{pos.breaks=0} or a two-column \code{pos.breaks.tsv}), it returns a
|
1079
|
+
#' vector of characters (the sequence identifiers), otherwise it returns a
|
1080
|
+
#' data.frame with a name column and two columns of coordinates.
|
1081
|
+
#'
|
1082
|
+
#' @return
|
1083
|
+
#' Returns a vector of logicals if \code{seq.names = FALSE}.
|
1084
|
+
#' If \code{seq.names = TRUE}, it returns a data.frame with five columns:
|
1085
|
+
#' \code{name.from}, \code{name.to}, \code{pos.from}, \code{pos.to}, and
|
1086
|
+
#' \code{seq.name} (see \code{\link{enve.recplot2.coordinates}}).
|
1087
|
+
#'
|
1088
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1089
|
+
#'
|
1090
|
+
#' @export
|
1091
|
+
|
1092
|
+
enve.recplot2.extractWindows <- function
|
1093
|
+
(rp,
|
1094
|
+
peak,
|
1095
|
+
lower.tail = TRUE,
|
1096
|
+
significance = 0.05,
|
1097
|
+
seq.names = FALSE
|
1098
|
+
){
|
1099
|
+
# Determine the threshold
|
1100
|
+
thr <- enve.recplot2.windowDepthThreshold(rp, peak, lower.tail, significance)
|
1101
|
+
|
1102
|
+
# Select windows past the threshold
|
1103
|
+
seqdepth.in <- enve.recplot2.seqdepth(rp)
|
1104
|
+
if(lower.tail){
|
1105
|
+
sel <- seqdepth.in < thr
|
1106
|
+
}else{
|
1107
|
+
sel <- seqdepth.in > thr
|
1108
|
+
}
|
1109
|
+
|
1110
|
+
# seq.names = FALSE
|
1111
|
+
if(!seq.names) return(sel)
|
1112
|
+
# seq.names = TRUE
|
1113
|
+
return(enve.recplot2.coordinates(rp, sel))
|
1114
|
+
}
|
1115
|
+
|
1116
|
+
#' Enveomics: Recruitment Plot (2) Compare Identities
|
1117
|
+
#'
|
1118
|
+
#' Compare the distribution of identities between two
|
1119
|
+
#' \code{\link{enve.RecPlot2}} objects.
|
1120
|
+
#'
|
1121
|
+
#' @param x
|
1122
|
+
#' First \code{\link{enve.RecPlot2}} object.
|
1123
|
+
#' @param y
|
1124
|
+
#' Second \code{\link{enve.RecPlot2}} object.
|
1125
|
+
#' @param method
|
1126
|
+
#' Distance method to use. This should be (an unambiguous abbreviation of)
|
1127
|
+
#' one of:
|
1128
|
+
#' \itemize{
|
1129
|
+
#' \item{"hellinger" (\emph{Hellinger, 1090, doi:10.1515/crll.1909.136.210}),}
|
1130
|
+
#' \item{"bhattacharyya" (\emph{Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35}),}
|
1131
|
+
#' \item{"kl" or "kullback-leibler" (\emph{Kullback & Leibler, 1951,
|
1132
|
+
#' doi:10.1214/aoms/1177729694}), or}
|
1133
|
+
#' \item{"euclidean"}
|
1134
|
+
#' }
|
1135
|
+
#' @param smooth.par
|
1136
|
+
#' Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
|
1137
|
+
#' Use \code{NULL} to automatically determine this value using leave-one-out
|
1138
|
+
#' cross-validation (see \code{smooth.spline} parameter \code{spar}).
|
1139
|
+
#' @param pseudocounts
|
1140
|
+
#' Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
|
1141
|
+
#' 1 for add-one smoothing.
|
1142
|
+
#' @param max.deviation
|
1143
|
+
#' Maximum mean deviation between identity breaks tolerated (as percent
|
1144
|
+
#' identity). Difference in number of \code{id.breaks} is never tolerated.
|
1145
|
+
#'
|
1146
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1147
|
+
#'
|
1148
|
+
#' @export
|
1149
|
+
|
1150
|
+
enve.recplot2.compareIdentities <- function
|
1151
|
+
(x,
|
1152
|
+
y,
|
1153
|
+
method="hellinger",
|
1154
|
+
smooth.par=NULL,
|
1155
|
+
pseudocounts=0,
|
1156
|
+
max.deviation=0.75
|
1157
|
+
){
|
1158
|
+
METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
|
1159
|
+
i.meth <- pmatch(method, METHODS)
|
1160
|
+
if (is.na(i.meth)) stop("Invalid distance ", method)
|
1161
|
+
if(!inherits(x, "enve.RecPlot2"))
|
1162
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
1163
|
+
if(!inherits(y, "enve.RecPlot2"))
|
1164
|
+
stop("'y' must inherit from class `enve.RecPlot2`")
|
1165
|
+
if(length(x$id.breaks) != length(y$id.breaks))
|
1166
|
+
stop("'x' and 'y' must have the same number of `id.breaks`")
|
1167
|
+
dev <- mean(abs(x$id.breaks - y$id.breaks))
|
1168
|
+
if(dev > max.deviation)
|
1169
|
+
stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
|
1170
|
+
dev)
|
1171
|
+
x.cnt <- x$id.counts
|
1172
|
+
y.cnt <- y$id.counts
|
1173
|
+
if(is.null(smooth.par) || smooth.par > 0){
|
1174
|
+
x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
|
1175
|
+
y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
|
1176
|
+
p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
|
1177
|
+
q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
|
1178
|
+
x.cnt <- pmax(p.spline$y, 0)
|
1179
|
+
y.cnt <- pmax(q.spline$y, 0)
|
1180
|
+
}
|
1181
|
+
a <- as.numeric(pseudocounts)
|
1182
|
+
p <- (x.cnt + a) / sum(x.cnt + a)
|
1183
|
+
q <- (y.cnt + a) / sum(y.cnt + a)
|
1184
|
+
d <- NA
|
1185
|
+
if(i.meth %in% c(1L, 2L)){
|
1186
|
+
d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
|
1187
|
+
if(i.meth==2L) d <- 1 - d**2
|
1188
|
+
}else if(i.meth %in% c(3L, 4L)){
|
1189
|
+
sel <- p>0
|
1190
|
+
if(any(q[sel]==0))
|
1191
|
+
stop("Undefined distance without absolute continuity, use pseudocounts")
|
1192
|
+
d <- -sum(p[sel]*log(q[sel]/p[sel]))
|
1193
|
+
}else if(i.meth == 5L){
|
1194
|
+
d <- sqrt(sum((q-p)**2))
|
1195
|
+
}
|
1196
|
+
return(d)
|
1197
|
+
}
|
1198
|
+
|
1199
|
+
#' Enveomics: Recruitment Plot (2) Coordinates
|
1200
|
+
#'
|
1201
|
+
#' Returns the sequence name and coordinates of the requested position bins.
|
1202
|
+
#'
|
1203
|
+
#' @param x
|
1204
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1205
|
+
#' @param bins
|
1206
|
+
#' Vector of selected bins to return. It can be a vector of logical values
|
1207
|
+
#' with the same length as \code{x$pos.breaks-1} or a vector of integers. If
|
1208
|
+
#' missing, returns the coordinates of all windows.
|
1209
|
+
#'
|
1210
|
+
#' @return
|
1211
|
+
#' Returns a data.frame with five columns: \code{name.from} (character),
|
1212
|
+
#' \code{pos.from} (numeric), \code{name.to} (character), \code{pos.to}
|
1213
|
+
#' (numeric), and \code{seq.name} (character).
|
1214
|
+
#' The first two correspond to sequence and position of the start point of the
|
1215
|
+
#' bin. The next two correspond to the sequence and position of the end point of
|
1216
|
+
#' the bin. The last one indicates the name of the sequence (if defined).
|
1217
|
+
#'
|
1218
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1219
|
+
#'
|
1220
|
+
#' @export
|
1221
|
+
|
1222
|
+
enve.recplot2.coordinates <- function
|
1223
|
+
(x,
|
1224
|
+
bins
|
1225
|
+
){
|
1226
|
+
if(!inherits(x, "enve.RecPlot2"))
|
1227
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
1228
|
+
if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
|
1229
|
+
if(!is.vector(bins)) stop("'bins' must be a vector")
|
1230
|
+
if(inherits(bins, "logical")) bins <- which(bins)
|
1231
|
+
|
1232
|
+
y <- data.frame(stringsAsFactors = FALSE, row.names = bins)
|
1233
|
+
|
1234
|
+
for(i in 1:length(bins)){
|
1235
|
+
j <- bins[i]
|
1236
|
+
# Concatenated coordinates
|
1237
|
+
cc <- x$pos.breaks[c(j, j+1)]
|
1238
|
+
# Find the corresponding `seq.breaks`
|
1239
|
+
sb.from <- which(
|
1240
|
+
cc[1] >= x$seq.breaks[-length(x$seq.breaks)] &
|
1241
|
+
cc[1] < x$seq.breaks[-1])
|
1242
|
+
sb.to <- which(
|
1243
|
+
cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
|
1244
|
+
cc[2] <= x$seq.breaks[-1])
|
1245
|
+
# Translate coordinates
|
1246
|
+
if(length(sb.from)==1 & length(sb.to)==1){
|
1247
|
+
y[i, 'name.from'] <- x$seq.names[sb.from]
|
1248
|
+
y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
|
1249
|
+
y[i, 'name.to'] <- x$seq.names[sb.to]
|
1250
|
+
y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
|
1251
|
+
y[i, 'seq.name'] <- x$pos.names[i]
|
1252
|
+
}
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
return(y)
|
1256
|
+
}
|
1257
|
+
|
1258
|
+
#' Enveomics: Recruitment Plot (2) Sequencing Depth
|
1259
|
+
#'
|
1260
|
+
#' Calculate the sequencing depth of the given window(s).
|
1261
|
+
#'
|
1262
|
+
#' @param x
|
1263
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1264
|
+
#' @param sel
|
1265
|
+
#' Window(s) for which the sequencing depth is to be calculated. If not
|
1266
|
+
#' passed, it returns the sequencing depth of all windows.
|
1267
|
+
#' @param low.identity
|
1268
|
+
#' A logical indicating if the sequencing depth is to be estimated only
|
1269
|
+
#' with low-identity matches. By default, only high-identity matches are
|
1270
|
+
#' used.
|
1271
|
+
#'
|
1272
|
+
#' @return
|
1273
|
+
#' Returns a numeric vector of sequencing depths (in bp/bp).
|
1274
|
+
#'
|
1275
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1276
|
+
#'
|
1277
|
+
#' @export
|
1278
|
+
|
1279
|
+
enve.recplot2.seqdepth <- function
|
1280
|
+
|
1281
|
+
(x,
|
1282
|
+
sel,
|
1283
|
+
low.identity=FALSE
|
1284
|
+
){
|
1285
|
+
if(!inherits(x, "enve.RecPlot2"))
|
1286
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
1287
|
+
if(low.identity){
|
1288
|
+
pos.cnts.in <- x$pos.counts.out
|
1289
|
+
}else{
|
1290
|
+
pos.cnts.in <- x$pos.counts.in
|
1291
|
+
}
|
1292
|
+
pos.breaks <- x$pos.breaks
|
1293
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
1294
|
+
seqdepth.in <- pos.cnts.in/pos.binsize
|
1295
|
+
if(missing(sel)) return(seqdepth.in)
|
1296
|
+
return(seqdepth.in[sel])
|
1297
|
+
}
|
1298
|
+
|
1299
|
+
#' Enveomics: Recruitment Plot (2) ANI Estimate
|
1300
|
+
#'
|
1301
|
+
#' Estimate the Average Nucleotide Identity from reads (ANIr) from a
|
1302
|
+
#' recruitment plot.
|
1303
|
+
#'
|
1304
|
+
#' @param x
|
1305
|
+
#' \code{\link{enve.RecPlot2}} object.
|
1306
|
+
#' @param range
|
1307
|
+
#' Range of identities to be considered. By default, the full range
|
1308
|
+
#' is used (note that the upper boundary is \code{Inf} and not 100 because
|
1309
|
+
#' recruitment plots can also be built with bit-scores). To use only
|
1310
|
+
#' intra-population matches (with identities), use c(95,100). To use only
|
1311
|
+
#' inter-population values, use c(0,95).
|
1312
|
+
#'
|
1313
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1314
|
+
#'
|
1315
|
+
#' @export
|
1316
|
+
|
1317
|
+
enve.recplot2.ANIr <- function
|
1318
|
+
(x,
|
1319
|
+
range=c(0,Inf)
|
1320
|
+
){
|
1321
|
+
if(!inherits(x, "enve.RecPlot2"))
|
1322
|
+
stop("'x' must inherit from class `enve.RecPlot2`")
|
1323
|
+
id.b <- x$id.breaks
|
1324
|
+
id <- (id.b[-1]+id.b[-length(id.b)])/2
|
1325
|
+
cnt <- x$id.counts
|
1326
|
+
cnt[id < range[1]] <- 0
|
1327
|
+
cnt[id > range[2]] <- 0
|
1328
|
+
return(sum(id*cnt/sum(cnt)))
|
1329
|
+
}
|
1330
|
+
|
1331
|
+
#==============> Define internal functions
|
1332
|
+
|
1333
|
+
#' Enveomics: Recruitment Plot (2) Internal Ancillary Function
|
1334
|
+
#'
|
1335
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2}}).
|
1336
|
+
#'
|
1337
|
+
#' @param x \code{\link{enve.RecPlot2}} object
|
1338
|
+
#' @param pos.breaks Position breaks
|
1339
|
+
#' @param id.breaks Identity breaks
|
1340
|
+
#' @param rec.idcol Identity column to use
|
1341
|
+
#'
|
1342
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1343
|
+
#' @author Kenji Gerhardt [aut]
|
1344
|
+
#'
|
1345
|
+
#' @export
|
1346
|
+
|
1347
|
+
enve.recplot2.__counts <- function
|
1348
|
+
(x, pos.breaks, id.breaks, rec.idcol) {
|
1349
|
+
rec2 <- x$rec
|
1350
|
+
verbose <- x$verbose
|
1351
|
+
|
1352
|
+
# get counts of how many occurrences of each genome pos.bin there are per read
|
1353
|
+
x.bins <- mapply(
|
1354
|
+
function(start, end) {
|
1355
|
+
list(rle(findInterval(start:end, pos.breaks, left.open = T)))
|
1356
|
+
}, rec2[, 1], rec2[, 2])
|
1357
|
+
|
1358
|
+
# find the single y bin for each row, replicates it at the correct places to
|
1359
|
+
# the number of distinct bins found in its row
|
1360
|
+
y.bins <- rep(findInterval(rec2[, rec.idcol], id.breaks, left.open = T),
|
1361
|
+
times = unlist(lapply(x.bins, function(a) length(a$lengths))))
|
1362
|
+
|
1363
|
+
# x.bins_counts is the number of occurrences of each bin a row contains,
|
1364
|
+
# per row, then unlisted
|
1365
|
+
x.bins_counts <- unlist(lapply(x.bins, function(a) a$lengths))
|
1366
|
+
|
1367
|
+
# these are the pos. in. genome bins that each count in x.bins_counts falls into
|
1368
|
+
x.bins <- unlist(lapply(x.bins, function(a) a$values))
|
1369
|
+
|
1370
|
+
# much more efficient counts implementation in R using lists instead of a matrix:
|
1371
|
+
counts <- lapply(
|
1372
|
+
1:(length(pos.breaks) - 1),
|
1373
|
+
function(col_len) rep(0, length(id.breaks) - 1))
|
1374
|
+
|
1375
|
+
# accesses the correct list in counts by x.bin, then
|
1376
|
+
# accesses the position in that row by y.bins and adds the new count
|
1377
|
+
for(i in 1:length(x.bins)){
|
1378
|
+
counts[[x.bins[i]]][y.bins[i]] <- counts[[x.bins[i]]][y.bins[i]] + x.bins_counts[i]
|
1379
|
+
}
|
1380
|
+
|
1381
|
+
counts <- do.call(rbind, counts)
|
1382
|
+
return(counts)
|
1383
|
+
}
|
1384
|
+
|
1385
|
+
#' Enveomics: Recruitment Plot (2) EMauto Peak Finder - Internal Ancillary Function
|
1386
|
+
#'
|
1387
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.emauto}}).
|
1388
|
+
#'
|
1389
|
+
#' @param x \code{\link{enve.RecPlot2}} object
|
1390
|
+
#' @param comp Components
|
1391
|
+
#' @param do_crit Function estimating the criterion
|
1392
|
+
#' @param best Best solution thus far
|
1393
|
+
#' @param verbose If verbose
|
1394
|
+
#' @param ... Additional parameters for \code{\link{enve.recplot2.findPeaks.em}}
|
1395
|
+
#'
|
1396
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1397
|
+
#'
|
1398
|
+
#' @export
|
1399
|
+
|
1400
|
+
enve.recplot2.findPeaks.__emauto_one <- function
|
1401
|
+
(x, comp, do_crit, best, verbose, ...){
|
1402
|
+
peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
|
1403
|
+
if(length(peaks)==0) return(best)
|
1404
|
+
k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
|
1405
|
+
crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
|
1406
|
+
if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
|
1407
|
+
ifelse(crit > best[['crit']], '*', ''), '\n')
|
1408
|
+
if(crit > best[['crit']]){
|
1409
|
+
best[['crit']] <- crit
|
1410
|
+
best[['peaks']] <- peaks
|
1411
|
+
}
|
1412
|
+
best[['pstore']][[comp]] <- peaks
|
1413
|
+
return(best)
|
1414
|
+
}
|
1415
|
+
|
1416
|
+
#' Enveomics: Recruitment Plot (2) EM Peak Finder - Internal Ancillary Function Expectation
|
1417
|
+
#'
|
1418
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
|
1419
|
+
#'
|
1420
|
+
#' @param x Vector of log-transformed sequencing depths
|
1421
|
+
#' @param theta Parameters list
|
1422
|
+
#'
|
1423
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1424
|
+
#'
|
1425
|
+
#' @export
|
1426
|
+
|
1427
|
+
enve.recplot2.findPeaks.__em_e <- function
|
1428
|
+
(x, theta){
|
1429
|
+
components <- length(theta[['mu']])
|
1430
|
+
product <- do.call(cbind,
|
1431
|
+
lapply(1:components,
|
1432
|
+
function(i) dnorm(x, theta[['mu']][i],
|
1433
|
+
theta[['sd']][i])*theta[['alpha']][i]))
|
1434
|
+
sum.of.components <- rowSums(product)
|
1435
|
+
posterior <- product / sum.of.components
|
1436
|
+
for(i in which(sum.of.components == Inf)) {
|
1437
|
+
cat(i,'/',nrow(product), ':', product[i,], '\n')
|
1438
|
+
}
|
1439
|
+
|
1440
|
+
return(list(ll=sum(log(sum.of.components)), posterior=posterior))
|
1441
|
+
}
|
1442
|
+
|
1443
|
+
#' Enveomics: Recruitment Plot (2) Em Peak Finder - Internal Ancillary Function Maximization
|
1444
|
+
#'
|
1445
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.em}}).
|
1446
|
+
#'
|
1447
|
+
#' @param x Vector of log-transformed sequencing depths
|
1448
|
+
#' @param posterior Posterior probability
|
1449
|
+
#'
|
1450
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1451
|
+
#'
|
1452
|
+
#' @export
|
1453
|
+
|
1454
|
+
enve.recplot2.findPeaks.__em_m <- function
|
1455
|
+
(x, posterior){
|
1456
|
+
components <- ncol(posterior)
|
1457
|
+
n <- colSums(posterior)
|
1458
|
+
mu <- colSums(posterior * x) / n
|
1459
|
+
sd <- sqrt( colSums(
|
1460
|
+
posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
|
1461
|
+
alpha <- n/length(x)
|
1462
|
+
return(list(mu=mu, sd=sd, alpha=alpha))
|
1463
|
+
}
|
1464
|
+
|
1465
|
+
#' Enveomics: Recruitment Plot (2) Peak S4 Class - Internal Ancillary Function
|
1466
|
+
#'
|
1467
|
+
#' Internal ancillary function (see \code{\link{enve.RecPlot2.Peak}}).
|
1468
|
+
#'
|
1469
|
+
#' @param x \code{\link{enve.RecPlot2.Peak}} object
|
1470
|
+
#' @param mids Midpoints
|
1471
|
+
#' @param counts Counts
|
1472
|
+
#'
|
1473
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1474
|
+
#'
|
1475
|
+
#' @export
|
1476
|
+
|
1477
|
+
enve.recplot2.__peakHist <- function
|
1478
|
+
(x, mids, counts=TRUE){
|
1479
|
+
d.o <- x$param.hat
|
1480
|
+
if(length(x$log)==0) x$log <- FALSE
|
1481
|
+
if(x$log){
|
1482
|
+
d.o$x <- log(mids)
|
1483
|
+
}else{
|
1484
|
+
d.o$x <- mids
|
1485
|
+
}
|
1486
|
+
prob <- do.call(paste('d', x$dist, sep=''), d.o)
|
1487
|
+
if(!counts) return(prob)
|
1488
|
+
if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
|
1489
|
+
return(prob*x$n.hat/sum(prob))
|
1490
|
+
}
|
1491
|
+
|
1492
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 1
|
1493
|
+
#'
|
1494
|
+
#' Internall ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
|
1495
|
+
#'
|
1496
|
+
#' @param lsd1 Vector of log-transformed sequencing depths
|
1497
|
+
#' @param min.points Minimum number of points
|
1498
|
+
#' @param quant.est Quantile estimate
|
1499
|
+
#' @param mlv.opts List of options for \code{mlv}
|
1500
|
+
#' @param fitdist.opts List of options for \code{fitdist}
|
1501
|
+
#' @param with.skewness If skewed-normal should be used
|
1502
|
+
#' @param optim.rounds Maximum number of optimization rounds
|
1503
|
+
#' @param optim.epsilon Minimum difference considered negligible
|
1504
|
+
#' @param n.total Global number of windows
|
1505
|
+
#' @param merge.logdist Attempted \code{merge.logdist} parameter
|
1506
|
+
#' @param verbose If verbose
|
1507
|
+
#' @param log If log-transformed depths
|
1508
|
+
#'
|
1509
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1510
|
+
#'
|
1511
|
+
#' @export
|
1512
|
+
|
1513
|
+
enve.recplot2.findPeaks.__mow_one <- function
|
1514
|
+
(lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
|
1515
|
+
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
|
1516
|
+
){
|
1517
|
+
dist <- ifelse(with.skewness, 'sn', 'norm');
|
1518
|
+
|
1519
|
+
# Find peak
|
1520
|
+
o <- mlv.opts; o$x = lsd1;
|
1521
|
+
mode1 <- median(lsd1); # mode1 <- do.call(mlv, o)$M;
|
1522
|
+
if(verbose) cat('Anchoring at mode =',mode1,'\n')
|
1523
|
+
param.hat <- fitdist.opts$start; last.hat <- param.hat;
|
1524
|
+
lim <- NA;
|
1525
|
+
if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
|
1526
|
+
|
1527
|
+
# Refine peak parameters
|
1528
|
+
for(round in 1:optim.rounds){
|
1529
|
+
param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
|
1530
|
+
lim.o <- param.hat
|
1531
|
+
lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
|
1532
|
+
lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
|
1533
|
+
if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
|
1534
|
+
'and params =',as.numeric(param.hat),' \r')
|
1535
|
+
if(length(lsd1.pop) < min.points) break;
|
1536
|
+
o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
|
1537
|
+
last.last.hat <- last.hat
|
1538
|
+
last.hat <- param.hat
|
1539
|
+
param.hat <- as.list(do.call(fitdist, o)$estimate);
|
1540
|
+
if(any(is.na(param.hat))){
|
1541
|
+
if(round>1) param.hat <- last.hat;
|
1542
|
+
break;
|
1543
|
+
}
|
1544
|
+
if(round > 1){
|
1545
|
+
epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
|
1546
|
+
if(epsilon1 < optim.epsilon) break;
|
1547
|
+
if(round > 2){
|
1548
|
+
epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
|
1549
|
+
if(epsilon2 < optim.epsilon) break;
|
1550
|
+
}
|
1551
|
+
}
|
1552
|
+
}
|
1553
|
+
if(verbose) cat('\n')
|
1554
|
+
if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
|
1555
|
+
|
1556
|
+
# Mow distribution
|
1557
|
+
lsd2 <- c();
|
1558
|
+
lsd.pop <- c();
|
1559
|
+
n.hat <- length(lsd1.pop)/diff(quant.est)
|
1560
|
+
peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
|
1561
|
+
param.hat=param.hat, n.hat=n.hat, n.total=n.total,
|
1562
|
+
merge.logdist=merge.logdist, log=log)
|
1563
|
+
peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
|
1564
|
+
peak.cnt <- enve.recplot2.__peakHist(peak,
|
1565
|
+
(peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
|
1566
|
+
for(i in 2:length(peak.breaks)){
|
1567
|
+
values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
|
1568
|
+
n.exp <- peak.cnt[i-1]
|
1569
|
+
if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
|
1570
|
+
if(length(values)==0) next
|
1571
|
+
in.peak <- runif(length(values)) <= n.exp/length(values)
|
1572
|
+
lsd2 <- c(lsd2, values[!in.peak])
|
1573
|
+
lsd.pop <- c(lsd.pop, values[in.peak])
|
1574
|
+
}
|
1575
|
+
if(length(lsd.pop) < min.points) return(NULL)
|
1576
|
+
|
1577
|
+
# Return peak
|
1578
|
+
attr(peak, 'values') <- lsd.pop
|
1579
|
+
attr(peak, 'values.res') <- lsd2
|
1580
|
+
attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
|
1581
|
+
plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
|
1582
|
+
plot=FALSE)$counts)+1)/2
|
1583
|
+
mu <- tail(param.hat, n=1)
|
1584
|
+
attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
|
1585
|
+
if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
|
1586
|
+
'with expected n =',n.hat,'\n')
|
1587
|
+
return(peak)
|
1588
|
+
}
|
1589
|
+
|
1590
|
+
#' Enveomics: Recruitment Plot (2) Mowing Peak Finder - Internal Ancillary Function 2
|
1591
|
+
#'
|
1592
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks.mower}}).
|
1593
|
+
#'
|
1594
|
+
#' @param peaks.opts List of options for \code{\link{enve.recplot2.findPeaks.__mow_one}}
|
1595
|
+
#'
|
1596
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1597
|
+
#'
|
1598
|
+
#' @export
|
1599
|
+
|
1600
|
+
enve.recplot2.findPeaks.__mower <- function
|
1601
|
+
(peaks.opts){
|
1602
|
+
peaks <- list()
|
1603
|
+
while(length(peaks.opts$lsd1) > peaks.opts$min.points){
|
1604
|
+
peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
|
1605
|
+
if(is.null(peak)) break
|
1606
|
+
peaks[[ length(peaks)+1 ]] <- peak
|
1607
|
+
peaks.opts$lsd1 <- peak$values.res
|
1608
|
+
}
|
1609
|
+
return(peaks)
|
1610
|
+
}
|
1611
|
+
|
1612
|
+
#' Enveomics: Recruitment Plot (2) Peak Finder - Internal Ancillary Function
|
1613
|
+
#'
|
1614
|
+
#' Internal ancillary function (see \code{\link{enve.recplot2.findPeaks}}).
|
1615
|
+
#'
|
1616
|
+
#' @param peak Query \code{\link{enve.RecPlot2.Peak}} object
|
1617
|
+
#' @param peaks list of \code{\link{enve.RecPlot2.Peak}} objects
|
1618
|
+
#'
|
1619
|
+
#' @author Luis M. Rodriguez-R [aut, cre]
|
1620
|
+
#'
|
1621
|
+
#' @export
|
1622
|
+
|
1623
|
+
enve.recplot2.__whichClosestPeak <- function
|
1624
|
+
(peak, peaks){
|
1625
|
+
dist <- as.numeric(lapply(peaks,
|
1626
|
+
function(x)
|
1627
|
+
abs(log(x$param.hat[[ length(x$param.hat) ]] /
|
1628
|
+
peak$param.hat[[ length(peak$param.hat) ]] ))))
|
1629
|
+
dist[ dist==0 ] <- Inf
|
1630
|
+
return(which.min(dist))
|
1631
|
+
}
|