miga-base 0.3.0.0 → 0.3.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +21 -4
- data/actions/init.rb +258 -0
- data/actions/run_local.rb +1 -2
- data/actions/test_taxonomy.rb +4 -1
- data/bin/miga +8 -1
- data/lib/miga/dataset.rb +4 -4
- data/lib/miga/dataset_result.rb +7 -4
- data/lib/miga/version.rb +2 -2
- data/scripts/_distances_noref_nomulti.bash +3 -1
- data/scripts/clade_finding.bash +1 -1
- data/scripts/init.bash +1 -1
- data/scripts/miga.bash +1 -1
- data/scripts/mytaxa.bash +78 -72
- data/scripts/mytaxa_scan.bash +67 -62
- data/scripts/ogs.bash +1 -1
- data/scripts/trimmed_fasta.bash +4 -3
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +703 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +571 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +208 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +339 -0
- data/utils/enveomics/Manifest/Tasks/other.json +746 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +454 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +132 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +40 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +162 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +61 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +106 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +40 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +87 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +64 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +254 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +306 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +50 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +30 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +373 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +26 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm.rb +137 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +44 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +35 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +121 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +165 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +119 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +117 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +263 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +320 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +745 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +423 -0
- data/utils/enveomics/enveomics.R/R/utils.R +16 -0
- data/utils/enveomics/enveomics.R/README.md +52 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +30 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +33 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +64 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +49 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +97 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +53 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +15 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +30 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +71 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +27 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/index_metadata.rb +0 -0
- data/utils/plot-taxdist.R +0 -0
- data/utils/requirements.txt +19 -19
- metadata +242 -2
@@ -0,0 +1,320 @@
|
|
1
|
+
enve.recplot <- structure(function(
|
2
|
+
### Produces recruitment plots provided that BlastTab.catsbj.pl has
|
3
|
+
### been previously executed. Requires the gplots library.
|
4
|
+
prefix,
|
5
|
+
### Path to the prefix of the BlastTab.catsbj.pl output files. At
|
6
|
+
### least the files .rec and .lim must exist with this prefix.
|
7
|
+
|
8
|
+
# Id. hist.
|
9
|
+
id.min=NULL,
|
10
|
+
### Minimum identity to be considered. By default, the minimum detected
|
11
|
+
### identity. This value is a percentage.
|
12
|
+
id.max=NULL,
|
13
|
+
### Maximum identity to be considered. By default, 100.
|
14
|
+
id.binsize=NULL,
|
15
|
+
### Size of the identity bins (vertical histograms). By default, 0.1 for
|
16
|
+
### identity metrics and 5 for bit score.
|
17
|
+
id.splines=0,
|
18
|
+
### Smoothing parameter for the splines in the identity histogram. Zero (0) for no
|
19
|
+
### splines. A generally good value is 1/2. If non-zero, requires the stats package.
|
20
|
+
id.metric='id',
|
21
|
+
### Metric of identity to be used (Y-axis). It can be any unambiguous prefix
|
22
|
+
### of "identity", "corrected identity", or "bit score".
|
23
|
+
id.summary='sum',
|
24
|
+
### Method used to build the identity histogram (Horizontal axis of the right panel).
|
25
|
+
### It can be any unambiguous prefix of "sum", "average", "median", "90% lower bound",
|
26
|
+
### "90% upper bound", "95% lower bound", and "95% upper bound". The last four options
|
27
|
+
### correspond to the upper and lower boundaries of the 90% and 95% empirical confidence
|
28
|
+
### intervals.
|
29
|
+
|
30
|
+
# Pos. hist.
|
31
|
+
pos.min=1,
|
32
|
+
### Minimum (leftmost) position in the reference (concatenated) genome (in bp).
|
33
|
+
pos.max=NULL,
|
34
|
+
### Maximum (rightmost) position in the reference (concatenated) genome (in bp).
|
35
|
+
### By default: Length of the genome.
|
36
|
+
pos.binsize=1e3,
|
37
|
+
### Size of the position bins (horizontal histograms) in bp.
|
38
|
+
pos.splines=0,
|
39
|
+
### Smoothing parameter for the splines in the position histogram. Zero (0) for no splines.
|
40
|
+
### If non-zero, requires the stats package.
|
41
|
+
|
42
|
+
# Rec. plot
|
43
|
+
rec.col1='white',
|
44
|
+
### Lightest color in the recruitment plot.
|
45
|
+
rec.col2='black',
|
46
|
+
### Darkest color in the recruitment plot.
|
47
|
+
|
48
|
+
# General
|
49
|
+
main=NULL,
|
50
|
+
### Title of the plot.
|
51
|
+
contig.col=grey(0.85),
|
52
|
+
### Color of the Contig boundaries. Set to NA to ignore Contig boundaries.
|
53
|
+
|
54
|
+
# Return
|
55
|
+
ret.recplot=FALSE,
|
56
|
+
### Indicates if the matrix of the recruitment plot is to be returned.
|
57
|
+
ret.hist=FALSE,
|
58
|
+
### Indicates if the vectors of the identity and position histograms are to be returned.
|
59
|
+
ret.mode=FALSE,
|
60
|
+
### Indicates if the mode of the identity is to be computed. It requires the modeest
|
61
|
+
### package.
|
62
|
+
|
63
|
+
# General
|
64
|
+
id.cutoff=NULL,
|
65
|
+
### Minimum identity to consider an alignment as "top". By default, it is 0.95 for the
|
66
|
+
### identity metrics and 95% of the best scoring alignment for bit score.
|
67
|
+
verbose=TRUE,
|
68
|
+
### Indicates if the function should report the advance.
|
69
|
+
...
|
70
|
+
### Any additional graphic parameters to be passed to plot for all panels except the
|
71
|
+
### recruitment plot (lower-left).
|
72
|
+
){
|
73
|
+
|
74
|
+
# Settings
|
75
|
+
METRICS <- c('identity', 'corrected identity', 'bit score');
|
76
|
+
SUMMARY <- c('sum', 'average', 'median', '');
|
77
|
+
if(is.null(prefix)) stop('Parameter prefix is mandatory.');
|
78
|
+
if(!requireNamespace("gplots", quietly=TRUE)) stop('Unavailable gplots library.');
|
79
|
+
|
80
|
+
# Read files
|
81
|
+
if(verbose) cat("Reading files.\n")
|
82
|
+
rec <- read.table(paste(prefix, '.rec', sep=''), sep="\t", comment.char='', quote='');
|
83
|
+
lim <- read.table(paste(prefix, '.lim', sep=''), sep="\t", comment.char='', quote='');
|
84
|
+
|
85
|
+
# Configure ID summary
|
86
|
+
id.summary <- pmatch(id.summary, SUMMARY);
|
87
|
+
if(is.na(id.summary)) stop('Invalid identity summary.');
|
88
|
+
if(id.summary == -1) stop('Ambiguous identity summary.');
|
89
|
+
if(id.summary==1){
|
90
|
+
id.summary.func <- function(x) colSums(x);
|
91
|
+
id.summary.name <- 'sum'
|
92
|
+
}else if(id.summary==2){
|
93
|
+
id.summary.func <- function(x) colMeans(x);
|
94
|
+
id.summary.name <- 'mean'
|
95
|
+
}else if(id.summary==3){
|
96
|
+
id.summary.func <- function(x) apply(x,2,median);
|
97
|
+
id.summary.name <- 'median'
|
98
|
+
}else if(id.summary==4){
|
99
|
+
id.summary.func <- function(x) apply(x,2,quantile,probs=0.05,names=FALSE);
|
100
|
+
id.summary.name <- '90% LB'
|
101
|
+
}else if(id.summary==5){
|
102
|
+
id.summary.func <- function(x) apply(x,2,quantile,probs=0.95,names=FALSE);
|
103
|
+
id.summary.name <- '90% UB'
|
104
|
+
}else if(id.summary==6){
|
105
|
+
id.summary.func <- function(x) apply(x,2,quantile,probs=0.025,names=FALSE);
|
106
|
+
id.summary.name <- '95% LB'
|
107
|
+
}else if(id.summary==7){
|
108
|
+
id.summary.func <- function(x) apply(x,2,quantile,probs=0.975,names=FALSE);
|
109
|
+
id.summary.name <- '95% UB'
|
110
|
+
}
|
111
|
+
|
112
|
+
# Configure metrics
|
113
|
+
id.metric <- pmatch(id.metric, METRICS);
|
114
|
+
if(is.na(id.metric)) stop('Invalid identity metric.');
|
115
|
+
if(id.metric == -1) stop('Ambiguous identity metric.');
|
116
|
+
if(id.metric==1){
|
117
|
+
id.reccol <- 3
|
118
|
+
id.shortname <- 'Id.'
|
119
|
+
id.fullname <- 'Identity'
|
120
|
+
id.units <- '%'
|
121
|
+
id.hallmarks <- seq(0, 100, by=5)
|
122
|
+
if(is.null(id.max)) id.max <- 100
|
123
|
+
if(is.null(id.cutoff)) id.cutoff <- 95
|
124
|
+
if(is.null(id.binsize)) id.binsize <- 0.1
|
125
|
+
}else if(id.metric==2){
|
126
|
+
if(ncol(rec)<6) stop("Requesting corrected identity, but .rec file doesn't have 6th column")
|
127
|
+
id.reccol <- 6
|
128
|
+
id.shortname <- 'cId.'
|
129
|
+
id.fullname <- 'Corrected identity'
|
130
|
+
id.units <- '%'
|
131
|
+
id.hallmarks <- seq(0, 100, by=5)
|
132
|
+
if(is.null(id.max)) id.max <- 100
|
133
|
+
if(is.null(id.cutoff)) id.cutoff <- 95
|
134
|
+
if(is.null(id.binsize)) id.binsize <- 0.1
|
135
|
+
}else if(id.metric==3){
|
136
|
+
id.reccol <- 4
|
137
|
+
id.shortname <- 'BSc.'
|
138
|
+
id.fullname <- 'Bit score'
|
139
|
+
id.units <- 'bits'
|
140
|
+
max.bs <- max(rec[, id.reccol])
|
141
|
+
id.hallmarks <- seq(0, max.bs*1.2, by=50)
|
142
|
+
if(is.null(id.max)) id.max <- max.bs
|
143
|
+
if(is.null(id.cutoff)) id.cutoff <- 0.95 * max.bs
|
144
|
+
if(is.null(id.binsize)) id.binsize <- 5
|
145
|
+
}
|
146
|
+
if(is.null(id.min)) id.min <- min(rec[, id.reccol]);
|
147
|
+
if(is.null(pos.max)) pos.max <- max(lim[, 3]);
|
148
|
+
id.lim <- c(id.min, id.max);
|
149
|
+
pos.lim <- c(pos.min, pos.max)/1e6;
|
150
|
+
id.breaks <- round((id.max-id.min)/id.binsize);
|
151
|
+
pos.breaks <- round((pos.max-pos.min)/pos.binsize);
|
152
|
+
if(is.null(main)) main <- paste('Recruitment plot of ', prefix, sep='');
|
153
|
+
pos.marks=seq(pos.min, pos.max, length.out=pos.breaks+1)/1e6;
|
154
|
+
id.marks=seq(id.min, id.max, length.out=id.breaks+1);
|
155
|
+
id.topclasses <- 0;
|
156
|
+
for(i in length(id.marks):1) if(id.marks[i]>id.cutoff) id.topclasses <- id.topclasses + 1;
|
157
|
+
|
158
|
+
# Set-up image
|
159
|
+
layout(matrix(c(3,4,1,2), nrow=2, byrow=TRUE), widths=c(2,1), heights=c(1,2));
|
160
|
+
out <- list();
|
161
|
+
|
162
|
+
# Recruitment plot
|
163
|
+
if(verbose) cat("Rec. plot.\n")
|
164
|
+
par(mar=c(5,4,0,0)+0.1);
|
165
|
+
rec.hist <- matrix(0, nrow=pos.breaks, ncol=id.breaks);
|
166
|
+
for(i in 1:nrow(rec)){
|
167
|
+
id.class <- ceiling((id.breaks)*((rec[i, id.reccol]-id.min)/(id.max-id.min)));
|
168
|
+
if(id.class<=id.breaks & id.class>0){
|
169
|
+
for(pos in rec[i, 1]:rec[i, 2]){
|
170
|
+
pos.class <- ceiling((pos.breaks)*((pos-pos.min)/(pos.max-pos.min)));
|
171
|
+
if(pos.class<=pos.breaks & pos.class>0) rec.hist[pos.class, id.class] <- rec.hist[pos.class, id.class]+1;
|
172
|
+
}
|
173
|
+
}
|
174
|
+
}
|
175
|
+
id.top <- c((1-id.topclasses):0) + id.breaks;
|
176
|
+
rec.col=gplots::colorpanel(256, rec.col1, rec.col2);
|
177
|
+
image(x=pos.marks, y=id.marks, z=log10(rec.hist),
|
178
|
+
breaks=seq(0, log10(max(rec.hist)), length.out=1+length(rec.col)), col=rec.col,
|
179
|
+
xlim=pos.lim, ylim=id.lim, xlab='Position in genome (Mbp)',
|
180
|
+
ylab=paste(id.fullname, ' (',id.units,')', sep=''), xaxs='i', yaxs='r');
|
181
|
+
if(!is.na(contig.col)) abline(v=c(lim$V2, lim$V3)/1e6, lty=1, col=contig.col);
|
182
|
+
abline(h=id.hallmarks, lty=2, col=grey(0.7));
|
183
|
+
abline(h=id.marks[id.top[1]], lty=3, col=grey(0.5))
|
184
|
+
legend('bottomleft', 'Rec. plot', bg=rgb(1,1,1,2/3));
|
185
|
+
out <- c(out, list(pos.marks=pos.marks, id.marks=id.marks));
|
186
|
+
if(ret.recplot) out <- c(out, list(recplot=rec.hist));
|
187
|
+
|
188
|
+
# Identity histogram
|
189
|
+
if(verbose) cat(id.shortname, " hist.\n", sep='')
|
190
|
+
par(mar=c(5,0,0,2)+0.1);
|
191
|
+
id.hist <- id.summary.func(rec.hist);
|
192
|
+
plot(1, t='n', xlim=c(1, max(id.hist)), ylim=id.lim, ylab='', yaxt='n', xlab=paste('Sequences (bp),', id.summary.name), log='x', ...);
|
193
|
+
id.x <- rep(id.marks, each=2)[2:(id.breaks*2+1)]
|
194
|
+
id.f <- rep(id.hist, each=2)[1:(id.breaks*2)]
|
195
|
+
if(sum(id.f)>0){
|
196
|
+
lines(id.f, id.x, lwd=ifelse(id.splines>0, 1/2, 2), type='o', pch='.');
|
197
|
+
if(id.splines>0){
|
198
|
+
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]), spar=id.splines)
|
199
|
+
lines(exp(id.spline$y), id.spline$x, lwd=2)
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
abline(h=id.hallmarks, lty=2, col=grey(0.7));
|
204
|
+
abline(h=id.marks[id.top[1]], lty=3, col=grey(0.5))
|
205
|
+
legend('bottomright', paste(id.shortname, 'histogram'), bg=rgb(1,1,1,2/3));
|
206
|
+
out <- c(out, list(id.mean=mean(rec[, id.reccol])));
|
207
|
+
out <- c(out, list(id.median=median(rec[, id.reccol])));
|
208
|
+
if(ret.mode) out <- c(out, list(id.mode=modeest::mlv(rec[, id.reccol], method='mfv')$M));
|
209
|
+
if(ret.hist) out <- c(out, list(id.hist=id.hist));
|
210
|
+
|
211
|
+
# Position histogram
|
212
|
+
if(verbose) cat("Pos. hist.\n")
|
213
|
+
par(mar=c(0,4,4,0)+0.1);
|
214
|
+
h1<-rep(0,nrow(rec.hist)) ;
|
215
|
+
h2<-rep(0,nrow(rec.hist)) ;
|
216
|
+
pos.winsize <- (pos.max-pos.min+1)/pos.breaks;
|
217
|
+
if(sum(rec.hist[, id.top])>0) h1 <- rowSums(matrix(rec.hist[, id.top], nrow=nrow(rec.hist)))/pos.winsize;
|
218
|
+
if(sum(rec.hist[,-id.top])>0) h2 <- rowSums(matrix(rec.hist[,-id.top], nrow=nrow(rec.hist)))/pos.winsize;
|
219
|
+
|
220
|
+
ymin <- min(1, h1[h1>0], h2[h2>0]);
|
221
|
+
ymax <- max(10, h1, h2);
|
222
|
+
if(is.na(ymin) || ymin<=0) ymin <- 1e-10;
|
223
|
+
if(is.na(ymax) || ymax<=0) ymax <- 1;
|
224
|
+
plot(1, t='n', xlab='', xaxt='n', ylab='Sequencing depth (X)', log='y', xlim=pos.lim,
|
225
|
+
ylim=c(ymin, ymax), xaxs='i', main=main, ...);
|
226
|
+
if(!is.na(contig.col)) abline(v=c(lim[,2], lim[,3])/1e6, lty=1, col=contig.col);
|
227
|
+
abline(h=10^c(0:5), lty=2, col=grey(0.7));
|
228
|
+
if(sum(h2)>0){
|
229
|
+
h2.x <- rep(pos.marks, each=2)[2:(pos.breaks*2+1)]
|
230
|
+
h2.y <- rep(h2, each=2)[1:(pos.breaks*2)]
|
231
|
+
lines(h2.x, h2.y, lwd=ifelse(pos.splines>0, 1/2, 2), col=grey(0.5));
|
232
|
+
if(pos.splines>0){
|
233
|
+
h2.spline <- smooth.spline(h2.x[h2.y>0], log(h2.y[h2.y>0]), spar=pos.splines)
|
234
|
+
lines(h2.spline$x, exp(h2.spline$y), lwd=2, col=grey(0.5))
|
235
|
+
}
|
236
|
+
if(ret.hist) out <- c(out, list(pos.hist.low=h2.y));
|
237
|
+
}
|
238
|
+
if(sum(h1)>0){
|
239
|
+
h1.x <- rep(pos.marks, each=2)[2:(pos.breaks*2+1)]
|
240
|
+
h1.y <- rep(h1, each=2)[1:(pos.breaks*2)]
|
241
|
+
lines(h1.x, h1.y, lwd=ifelse(pos.splines>0, 1/2, 2), col=grey(0));
|
242
|
+
if(pos.splines>0){
|
243
|
+
h1.spline <- smooth.spline(h1.x[h1.y>0], log(h1.y[h1.y>0]), spar=pos.splines)
|
244
|
+
lines(h1.spline$x, exp(h1.spline$y), lwd=2, col=grey(0))
|
245
|
+
}
|
246
|
+
if(ret.hist) out <- c(out, list(pos.hist.top=h1.y));
|
247
|
+
}
|
248
|
+
legend('topleft', 'Pos. histogram', bg=rgb(1,1,1,2/3));
|
249
|
+
out <- c(out, list(id.max=id.max, id.cutoff=id.marks[id.top[1]]));
|
250
|
+
out <- c(out, list(seqdepth.mean.top=mean(h1)));
|
251
|
+
out <- c(out, list(seqdepth.mean.low=mean(h2)));
|
252
|
+
out <- c(out, list(seqdepth.mean=mean(h1+h2)));
|
253
|
+
out <- c(out, list(seqdepth.median.top=median(h1)));
|
254
|
+
out <- c(out, list(seqdepth.median.low=median(h2)));
|
255
|
+
out <- c(out, list(seqdepth.median=median(h1+h2)));
|
256
|
+
out <- c(out, list(id.metric=id.fullname));
|
257
|
+
out <- c(out, list(id.summary=id.summary.name));
|
258
|
+
|
259
|
+
# Legend
|
260
|
+
par(mar=c(0,0,4,2)+0.1);
|
261
|
+
plot(1, t='n', xlab='', xaxt='n', ylab='', yaxt='n', xlim=c(0,1), ylim=c(0,1), xaxs='r', yaxs='i', ...);
|
262
|
+
text(1/2, 5/6, labels=paste('Reads per ', signif((pos.max-pos.min)/pos.breaks, 2), ' bp (rec. plot)', sep=''), pos=3);
|
263
|
+
leg.col <- gplots::colorpanel(100, rec.col1, rec.col2);
|
264
|
+
leg.lab <- signif(10^seq(0, log10(max(rec.hist)), length.out=10), 2);
|
265
|
+
for(i in 1:10){
|
266
|
+
for(j in 1:10){
|
267
|
+
k <- (i-1)*10 + j;
|
268
|
+
polygon(c(k-1, k, k, k-1)/100, c(2/3, 2/3, 5/6, 5/6), border=leg.col[k], col=leg.col[k]);
|
269
|
+
}
|
270
|
+
text((i-0.5)/10, 2/3, labels=paste(leg.lab[i], ''), srt=90, pos=2, offset=0, cex=3/4);
|
271
|
+
}
|
272
|
+
legend('bottom',
|
273
|
+
legend=c('Contig boundary', 'Hallmark', paste(id.fullname, 'cutoff'),
|
274
|
+
paste('Pos. hist.: ',id.shortname,' > ',signif(id.marks[id.top[1]],2),id.units,sep=''),
|
275
|
+
paste('Pos. hist.: ',id.shortname,' < ',signif(id.marks[id.top[1]],2),id.units,sep='')), ncol=2,
|
276
|
+
col=grey(c(0.85, 0.7, 0.5, 0, 0.5)), lty=c(1,2,3,1,1), lwd=c(1,1,1,2,2), bty='n', inset=0.05, cex=5/6);
|
277
|
+
return(out);
|
278
|
+
### A list with the following elements:
|
279
|
+
###
|
280
|
+
### pos.marks: Midpoints of the position histogram.
|
281
|
+
###
|
282
|
+
### id.matrix: Midpoints of the identity histogram.
|
283
|
+
###
|
284
|
+
### recplot (if ret.recplot=TRUE): Matrix containing the recruitment plot values.
|
285
|
+
###
|
286
|
+
### id.mean: Mean identity.
|
287
|
+
###
|
288
|
+
### id.median: Median identity.
|
289
|
+
###
|
290
|
+
### id.mode (if ret.mode=TRUE): Mode of the identity.
|
291
|
+
###
|
292
|
+
### id.hist (if ret.hist=TRUE): Values of the identity histogram.
|
293
|
+
###
|
294
|
+
### pos.hist.low (if ret.hist=TRUE): Values of the position histogram (depth) with "low"
|
295
|
+
### identity (i.e., below id.cutoff).
|
296
|
+
###
|
297
|
+
### pos.hist.top (if ret.hist=TRUE): Values of the position histogram (depth) with "top"
|
298
|
+
### identity (i.e., above id.cutoff).
|
299
|
+
###
|
300
|
+
### id.max: Value of id.max. This is returned because id.max=NULL may vary.
|
301
|
+
###
|
302
|
+
### id.cutoff: Value of id.cutoff. This is returned because id.cutoff=NULL may vary.
|
303
|
+
###
|
304
|
+
### seqdepth.mean.top: Average sequencing depth with identity above id.cutoff.
|
305
|
+
###
|
306
|
+
### seqdepth.mean.low: Average sequencing depth with identity below id.cutoff.
|
307
|
+
###
|
308
|
+
### seqdepth.mean.all: Average sequencing depth without identity filtering.
|
309
|
+
###
|
310
|
+
### seqdepth.median.top: Median sequencing depth with identity above id.cutoff.
|
311
|
+
###
|
312
|
+
### seqdepth.median.low: Median sequencing depth with identity below id.cutoff.
|
313
|
+
###
|
314
|
+
### seqdepth.median.all: Median sequencing depth without identity filtering.
|
315
|
+
###
|
316
|
+
### id.metric: Full name of the used identity metric.
|
317
|
+
###
|
318
|
+
### id.summary: Full name of the summary method used to build the identity plot.
|
319
|
+
});
|
320
|
+
|
@@ -0,0 +1,745 @@
|
|
1
|
+
#==============> Define S4 classes
|
2
|
+
setClass("enve.RecPlot2",
|
3
|
+
### Enve-omics representation of Recruitment plots. This object can
|
4
|
+
### be produced by `enve.recplot2` and supports S4 method plot.
|
5
|
+
representation(
|
6
|
+
counts='matrix', ##<< Counts as a two-dimensional histogram.
|
7
|
+
pos.counts.in='numeric', ##<< Counts of in-group hits per position bin.
|
8
|
+
pos.counts.out='numeric', ##<< Counts of out-group hits per position bin.
|
9
|
+
id.counts='numeric', ##<< Counts per ID bin.
|
10
|
+
id.breaks='numeric', ##<< Breaks of identity bins.
|
11
|
+
pos.breaks='numeric', ##<< Breaks of position bins.
|
12
|
+
seq.breaks='numeric',
|
13
|
+
peaks='list', ##<< Peaks identified in the recplot.
|
14
|
+
### Limits of the subject sequences after concatenation.
|
15
|
+
seq.names='character', ##<< Names of the subject sequences.
|
16
|
+
id.metric='character', ##<< Metric used as 'identity'.
|
17
|
+
id.ingroup='logical', ##<< Identity bins considered in-group.
|
18
|
+
call='call') ##<< Call producing this object.
|
19
|
+
,package='enveomics.R'
|
20
|
+
);
|
21
|
+
setClass("enve.RecPlot2.Peak",
|
22
|
+
### Enve-omics representation of a peak in the sequencing depth histogram
|
23
|
+
### of a Recruitment plot (see `enve.recplot2.findPeaks`).
|
24
|
+
representation(
|
25
|
+
dist='character',
|
26
|
+
### Distribution of the peak. Currently supported: 'norm' (normal) and 'sn'
|
27
|
+
### (skew-normal).
|
28
|
+
values='numeric',
|
29
|
+
### Sequencing depth values predicted to conform the peak.
|
30
|
+
values.res='numeric',
|
31
|
+
### Sequencing depth values not explained by this or previously identified
|
32
|
+
### peaks.
|
33
|
+
mode='numeric',
|
34
|
+
### Seed-value of mode anchoring the peak.
|
35
|
+
param.hat='list',
|
36
|
+
### Parameters of the distribution. A list of two values if dist='norm' (sd
|
37
|
+
### and mean), or three values if dist='sn' (omega=scale, alpha=shape, and
|
38
|
+
### xi=location). Note that the "dispersion" parameter is always first and
|
39
|
+
### the "location" parameter is always last.
|
40
|
+
n.hat='numeric',
|
41
|
+
### Number of bins estimated to be explained by this peak. This should
|
42
|
+
### ideally be equal to the length of `values`, but it's not and integer.
|
43
|
+
n.total='numeric',
|
44
|
+
### Total number of bins from which the peak was extracted. I.e., total
|
45
|
+
### number of position bins with non-zero sequencing depth in the recruitment
|
46
|
+
### plot (regardless of peak count).
|
47
|
+
err.res='numeric',
|
48
|
+
### Error left after adding the peak.
|
49
|
+
merge.logdist='numeric'
|
50
|
+
### Attempted `merge.logdist` parameter.
|
51
|
+
));
|
52
|
+
setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
|
53
|
+
setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
|
54
|
+
|
55
|
+
#==============> Define S4 methods
|
56
|
+
plot.enve.RecPlot2 <- function
|
57
|
+
### Plots an `enve.RecPlot2` object.
|
58
|
+
(x,
|
59
|
+
### `enve.RecPlot2` object to plot.
|
60
|
+
layout=matrix(c(5,5,2,1,4,3), nrow=2),
|
61
|
+
### Matrix indicating the position of the different panels in the layout,
|
62
|
+
### where:
|
63
|
+
### 0: Empty space,
|
64
|
+
### 1: Counts matrix,
|
65
|
+
### 2: position histogram (sequencing depth),
|
66
|
+
### 3: identity histogram,
|
67
|
+
### 4: Populations histogram (histogram of sequencing depths),
|
68
|
+
### 5: Color scale for the counts matrix (vertical),
|
69
|
+
### 6: Color scale of the counts
|
70
|
+
### matrix (horizontal). Only panels indicated here will be plotted. To
|
71
|
+
### plot only one panel simply set this to the number of the panel you
|
72
|
+
### want to plot.
|
73
|
+
widths=c(1,7,2),
|
74
|
+
### Relative widths of the columns of `layout`.
|
75
|
+
heights=c(1,2),
|
76
|
+
### Relative heights of the rows of `layout`.
|
77
|
+
palette=grey((100:0)/100),
|
78
|
+
### Colors to be used to represent the counts matrix, sorted from no hits
|
79
|
+
### to the maximum sequencing depth.
|
80
|
+
underlay.group=TRUE,
|
81
|
+
### If TRUE, it indicates the in-group and out-group areas couloured based
|
82
|
+
### on `in.col` and `out.col`. Requires support for semi-transparency.
|
83
|
+
peaks.col='darkred',
|
84
|
+
### If not NA, it attempts to represent peaks in the population histogram
|
85
|
+
### in the specified color. Set to NA to avoid peak-finding.
|
86
|
+
id.lim=range(x$id.breaks),
|
87
|
+
### Limits of identities to represent.
|
88
|
+
pos.lim=range(x$pos.breaks),
|
89
|
+
### Limits of positions to represent (in bp, regardless of `pos.units`).
|
90
|
+
pos.units=c('Mbp','Kbp','bp'),
|
91
|
+
### Units in which the positions should be represented (powers of 1,000
|
92
|
+
### base pairs).
|
93
|
+
mar=list('1'=c(5,4,1,1)+.1, '2'=c(ifelse(any(layout==1),1,5),4,4,1)+.1,
|
94
|
+
'3'=c(5,ifelse(any(layout==1),1,4),1,2)+0.1,
|
95
|
+
'4'=c(ifelse(any(layout==1),1,5),ifelse(any(layout==2),1,4),4,2)+0.1,
|
96
|
+
'5'=c(5,3,4,1)+0.1, '6'=c(5,4,4,2)+0.1),
|
97
|
+
### Margins of the panels as a list, with the character representation of
|
98
|
+
### the number of the panel as index (see `layout`).
|
99
|
+
pos.splines=0,
|
100
|
+
### Smoothing parameter for the splines in the position histogram. Zero
|
101
|
+
### (0) for no splines. If non-zero, requires the stats package.
|
102
|
+
id.splines=1/2,
|
103
|
+
### Smoothing parameter for the splines in the identity histogram. Zero
|
104
|
+
### (0) for no splines. If non-zero, requires the stats package.
|
105
|
+
in.lwd=ifelse(pos.splines>0, 1/2, 2),
|
106
|
+
### Line width for the sequencing depth of in-group matches.
|
107
|
+
out.lwd=ifelse(pos.splines>0, 1/2, 2),
|
108
|
+
### Line width for the sequencing depth of out-group matches.
|
109
|
+
id.lwd=ifelse(id.splines>0, 1/2, 2),
|
110
|
+
### Line width for the identity histogram.
|
111
|
+
in.col='darkblue',
|
112
|
+
### Color associated to in-group matches.
|
113
|
+
out.col='lightblue',
|
114
|
+
### Color associated to out-group matches.
|
115
|
+
id.col='black',
|
116
|
+
### Color for the identity histogram.
|
117
|
+
breaks.col='#AAAAAA40',
|
118
|
+
### Color of the vertical lines indicating sequence breaks.
|
119
|
+
peaks.opts=list(),
|
120
|
+
### Options passed to `enve.recplot2.findPeaks`, if `peaks.col` is not NA.
|
121
|
+
...
|
122
|
+
### Any other graphic parameters (currently ignored).
|
123
|
+
){
|
124
|
+
pos.units <- match.arg(pos.units);
|
125
|
+
pos.factor <- ifelse(pos.units=='bp',1,ifelse(pos.units=='Kbp',1e3,1e6));
|
126
|
+
pos.lim <- pos.lim/pos.factor;
|
127
|
+
lmat <- layout;
|
128
|
+
for(i in 1:6) if(!any(layout==i)) lmat[layout>i] <- lmat[layout>i]-1;
|
129
|
+
|
130
|
+
layout(lmat, widths=widths, heights=heights);
|
131
|
+
ori.mar <- par('mar');
|
132
|
+
|
133
|
+
# Essential vars
|
134
|
+
counts <- x$counts
|
135
|
+
|
136
|
+
id.ingroup <- x$id.ingroup
|
137
|
+
id.counts <- x$id.counts
|
138
|
+
id.breaks <- x$id.breaks
|
139
|
+
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2
|
140
|
+
id.binsize <- id.breaks[-1] - id.breaks[-length(id.breaks)]
|
141
|
+
|
142
|
+
pos.counts.in <- x$pos.counts.in
|
143
|
+
pos.counts.out <- x$pos.counts.out
|
144
|
+
pos.breaks <- x$pos.breaks/pos.factor
|
145
|
+
pos.mids <- (pos.breaks[-length(pos.breaks)]+pos.breaks[-1])/2
|
146
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])*pos.factor
|
147
|
+
|
148
|
+
seqdepth.in <- pos.counts.in/pos.binsize
|
149
|
+
seqdepth.out <- pos.counts.out/pos.binsize
|
150
|
+
seqdepth.lim <- range(c(seqdepth.in[seqdepth.in>0],
|
151
|
+
seqdepth.out[seqdepth.out>0]))*c(1/2,2)
|
152
|
+
|
153
|
+
if(underlay.group){
|
154
|
+
in.bg <- do.call(rgb, c(as.list(col2rgb(in.col)),
|
155
|
+
list(maxColorValue=256, alpha=62)));
|
156
|
+
out.bg <- do.call(rgb, c(as.list(col2rgb(out.col)[,1]),
|
157
|
+
list(maxColorValue=256, alpha=52)));
|
158
|
+
}
|
159
|
+
|
160
|
+
# Counts matrix
|
161
|
+
if(any(layout==1)){
|
162
|
+
par(mar=mar[['1']]);
|
163
|
+
plot(1, t='n', bty='l',
|
164
|
+
xlim=pos.lim, xlab=paste('Position in genome (',pos.units,')',sep=''),
|
165
|
+
xaxs='i', ylim=id.lim, ylab=x$id.metric, yaxs='i');
|
166
|
+
if(underlay.group){
|
167
|
+
rect(pos.lim[1], id.lim[1], pos.lim[2],
|
168
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
169
|
+
rect(pos.lim[1], min(id.breaks[c(id.ingroup,TRUE)]), pos.lim[2],
|
170
|
+
id.lim[2], col=in.bg, border=NA);
|
171
|
+
}
|
172
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col);
|
173
|
+
image(x=pos.breaks, y=id.breaks, z=log10(counts),col=palette,
|
174
|
+
bg=grey(1,0), breaks=seq(-.1,log10(max(counts)),
|
175
|
+
length.out=1+length(palette)), add=TRUE);
|
176
|
+
}
|
177
|
+
|
178
|
+
# Position histogram
|
179
|
+
if(any(layout==2)){
|
180
|
+
par(mar=mar[['2']]);
|
181
|
+
if(any(layout==1)){
|
182
|
+
xlab=''
|
183
|
+
xaxt='n'
|
184
|
+
}else{
|
185
|
+
xlab=paste('Position in genome (',pos.units,')',sep='')
|
186
|
+
xaxt='s'
|
187
|
+
}
|
188
|
+
plot(1,t='n', bty='l', log='y',
|
189
|
+
xlim=pos.lim, xlab=xlab, xaxt=xaxt, xaxs='i',
|
190
|
+
ylim=seqdepth.lim, yaxs='i', ylab='Sequencing depth (X)');
|
191
|
+
abline(v=x$seq.breaks/pos.factor, col=breaks.col)
|
192
|
+
pos.x <- rep(pos.breaks,each=2)[-c(1,2*length(pos.breaks))]
|
193
|
+
pos.f <- rep(seqdepth.in,each=2)
|
194
|
+
lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
|
195
|
+
lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
|
196
|
+
if(pos.splines > 0){
|
197
|
+
pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
|
198
|
+
spar=pos.splines)
|
199
|
+
lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
|
200
|
+
}
|
201
|
+
if(any(pos.counts.out==0)) rect(pos.breaks[c(pos.counts.out==0,FALSE)],
|
202
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.out==0)],
|
203
|
+
seqdepth.lim[1]*3/2, col=out.col, border=NA);
|
204
|
+
if(any(pos.counts.in==0)) rect(pos.breaks[c(pos.counts.in==0,FALSE)],
|
205
|
+
seqdepth.lim[1], pos.breaks[c(FALSE,pos.counts.in==0)],
|
206
|
+
seqdepth.lim[1]*3/2, col=in.col, border=NA);
|
207
|
+
}
|
208
|
+
|
209
|
+
# Identity histogram
|
210
|
+
if(any(layout==3)){
|
211
|
+
par(mar=mar[['3']]);
|
212
|
+
if(any(layout==1)){
|
213
|
+
ylab=''
|
214
|
+
yaxt='n'
|
215
|
+
}else{
|
216
|
+
ylab=x$id.metric
|
217
|
+
yaxt='s'
|
218
|
+
}
|
219
|
+
if(sum(id.counts>0) >= 4){
|
220
|
+
id.counts.range <- range(id.counts[id.counts>0])*c(1/2,2);
|
221
|
+
plot(1,t='n', bty='l', log='x',
|
222
|
+
xlim=id.counts.range, xlab='bps per bin', xaxs='i',
|
223
|
+
ylim=id.lim, yaxs='i', ylab=ylab, yaxt=yaxt);
|
224
|
+
if(underlay.group){
|
225
|
+
rect(id.counts.range[1], id.lim[1], id.counts.range[2],
|
226
|
+
min(id.breaks[c(id.ingroup,TRUE)]), col=out.bg, border=NA);
|
227
|
+
rect(id.counts.range[1], min(id.breaks[c(id.ingroup,TRUE)]),
|
228
|
+
id.counts.range[2], id.lim[2], col=in.bg, border=NA);
|
229
|
+
}
|
230
|
+
id.f <- rep(id.counts,each=2)
|
231
|
+
id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
|
232
|
+
lines(id.f, id.x, lwd=id.lwd, col=id.col);
|
233
|
+
if(id.splines > 0){
|
234
|
+
id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
|
235
|
+
spar=id.splines)
|
236
|
+
lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
|
237
|
+
}
|
238
|
+
}else{
|
239
|
+
plot(1,t='n',bty='l',xlab='', xaxt='n', ylab='', yaxt='n')
|
240
|
+
text(1,1,labels='Insufficient data', srt=90)
|
241
|
+
}
|
242
|
+
}
|
243
|
+
|
244
|
+
# Populations histogram
|
245
|
+
peaks <- NA;
|
246
|
+
if(any(layout==4)){
|
247
|
+
par(mar=mar[['4']]);
|
248
|
+
if(any(layout==2)){
|
249
|
+
ylab=''
|
250
|
+
yaxt='n'
|
251
|
+
}else{
|
252
|
+
ylab='Sequencing depth (X)'
|
253
|
+
yaxt='s'
|
254
|
+
}
|
255
|
+
h.breaks <- seq(log10(seqdepth.lim[1]*2), log10(seqdepth.lim[2]/2),
|
256
|
+
length.out=200);
|
257
|
+
h.in <- hist(log10(seqdepth.in), breaks=h.breaks, plot=FALSE);
|
258
|
+
h.out <- hist(log10(seqdepth.out), breaks=h.breaks, plot=FALSE);
|
259
|
+
plot(1, t='n', log='y',
|
260
|
+
xlim=range(c(h.in$counts,h.out$counts,sum(pos.counts.in==0))),
|
261
|
+
xaxs='r', xlab='', xaxt='n', ylim=seqdepth.lim, yaxs='i', ylab=ylab,
|
262
|
+
yaxt=yaxt)
|
263
|
+
y.tmp.in <- c(rep(10^h.in$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
264
|
+
y.tmp.out <- c(rep(10^h.out$breaks,each=2),seqdepth.lim[1]*c(1,1,3/2,3/2))
|
265
|
+
lines(c(0,rep(h.out$counts,each=2),0,0,rep(sum(pos.counts.out==0),2),0),
|
266
|
+
y.tmp.out, col=out.col)
|
267
|
+
polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
|
268
|
+
y.tmp.in, border=NA, col=in.col)
|
269
|
+
if(!is.na(peaks.col)){
|
270
|
+
o <- peaks.opts; o$x = x;
|
271
|
+
peaks <- do.call(enve.recplot2.findPeaks, o);
|
272
|
+
h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
|
273
|
+
if(!is.null(peaks) & length(peaks)>0){
|
274
|
+
pf <- h.mids*0;
|
275
|
+
for(i in 1:length(peaks)){
|
276
|
+
cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
|
277
|
+
lines(cnt, h.mids, col='red');
|
278
|
+
pf <- pf+cnt;
|
279
|
+
axis(4, at=peaks[[i]]$param.hat[[length(peaks[[i]]$param.hat)]],
|
280
|
+
letters[i], las=1, hadj=1/2)
|
281
|
+
}
|
282
|
+
lines(pf, h.mids, col='red',lwd=1.5);
|
283
|
+
legend('bottomright', legend=paste(
|
284
|
+
letters[1:length(peaks)],'. ',
|
285
|
+
signif(as.numeric(lapply(peaks,
|
286
|
+
function(x) tail(as.numeric(x$param.hat),n=1))),3),'X (',
|
287
|
+
signif(100*as.numeric(lapply(peaks,
|
288
|
+
function(x) (length(x$values)/x$n.total))), 3), '%, err: ',
|
289
|
+
signif(as.numeric(lapply(peaks, function(x) x$err.res)), 3), ')',
|
290
|
+
sep=''), bty='n');
|
291
|
+
}
|
292
|
+
}
|
293
|
+
}
|
294
|
+
|
295
|
+
# Color scale
|
296
|
+
count.bins <- 10^seq(log10(min(counts[counts>0])), log10(max(counts)),
|
297
|
+
length.out=1+length(palette))
|
298
|
+
if(any(layout==5)){
|
299
|
+
par(mar=mar[['5']]);
|
300
|
+
plot(1,t='n',log='y',xlim=0:1,xaxt='n',xlab='',xaxs='i',
|
301
|
+
ylim=range(count.bins), yaxs='i', ylab='')
|
302
|
+
rect(0,count.bins[-length(count.bins)],1,count.bins[-1],col=palette,
|
303
|
+
border=NA)
|
304
|
+
}
|
305
|
+
if(any(layout==6)){
|
306
|
+
par(mar=mar[['6']]);
|
307
|
+
plot(1,t='n',log='x',ylim=0:1,yaxt='n',ylab='',yaxs='i',
|
308
|
+
xlim=range(count.bins), xaxs='i',xlab='');
|
309
|
+
rect(count.bins[-length(count.bins)],0,count.bins[-1],1,col=palette,
|
310
|
+
border=NA);
|
311
|
+
}
|
312
|
+
|
313
|
+
par(mar=ori.mar);
|
314
|
+
return(peaks);
|
315
|
+
### Returns a list of `enve.RecPlot2.Peak` objects (see
|
316
|
+
### `enve.recplot2.findPeaks`). If `peaks.col`=NA or `layout` doesn't include
|
317
|
+
### 4, returns NA.
|
318
|
+
}
|
319
|
+
|
320
|
+
#==============> Define core functions
|
321
|
+
enve.recplot2 <- function(
|
322
|
+
### Produces recruitment plots provided that BlastTab.catsbj.pl has
|
323
|
+
### been previously executed.
|
324
|
+
prefix,
|
325
|
+
### Path to the prefix of the BlastTab.catsbj.pl output files. At
|
326
|
+
### least the files .rec and .lim must exist with this prefix.
|
327
|
+
plot=TRUE,
|
328
|
+
### Should the object be plotted?
|
329
|
+
pos.breaks=1e3,
|
330
|
+
### Breaks in the positions histogram. It can also be a vector of break
|
331
|
+
### points, and values outside the range are ignored. If zero (0), it
|
332
|
+
### uses the sequence breaks as defined in the .lim file, which means
|
333
|
+
### one bin per contig (or gene, if the mapping is agains genes).
|
334
|
+
id.breaks=300,
|
335
|
+
### Breaks in the identity histogram. It can also be a vector of break
|
336
|
+
### points, and values outside the range are ignored.
|
337
|
+
id.metric=c('identity', 'corrected identity', 'bit score'),
|
338
|
+
### Metric of identity to be used (Y-axis). Corrected identity is only
|
339
|
+
### supported if the original BLAST file included sequence lengths.
|
340
|
+
id.summary=sum,
|
341
|
+
### Function summarizing the identity bins. Other recommended options
|
342
|
+
### include: `median` to estimate the median instead of total bins, and
|
343
|
+
### `function(x) mlv(x,method='parzen')$M` to estimate the mode.
|
344
|
+
id.cutoff=95,
|
345
|
+
### Cutoff of identity metric above which the hits are considered
|
346
|
+
### 'in-group'. The 95% identity corresponds to the expectation of
|
347
|
+
### ANI<95% within species.
|
348
|
+
threads=2,
|
349
|
+
### Number of threads to use.
|
350
|
+
verbose=TRUE,
|
351
|
+
### Indicates if the function should report the advance.
|
352
|
+
...
|
353
|
+
### Any additional parameters supported by `plot.enve.RecPlot2`.
|
354
|
+
){
|
355
|
+
# Settings
|
356
|
+
id.metric <- match.arg(id.metric);
|
357
|
+
|
358
|
+
#Read files
|
359
|
+
if(verbose) cat("Reading files.\n")
|
360
|
+
rec <- read.table(paste(prefix, ".rec", sep=""), sep="\t", comment.char="",
|
361
|
+
quote="");
|
362
|
+
lim <- read.table(paste(prefix, ".lim", sep=""), sep="\t", comment.char="",
|
363
|
+
quote="", as.is=TRUE);
|
364
|
+
|
365
|
+
# Build matrix
|
366
|
+
if(verbose) cat("Building counts matrix.\n")
|
367
|
+
if(id.metric=="corrected identity" & ncol(rec)<6){
|
368
|
+
stop("Requesting corr. identity, but .rec file doesn't have 6th column")
|
369
|
+
}
|
370
|
+
rec.idcol <- ifelse(id.metric=="identity", 3,
|
371
|
+
ifelse(id.metric=="corrected identity", 6, 4));
|
372
|
+
if(length(pos.breaks)==1){
|
373
|
+
if(pos.breaks>0){
|
374
|
+
pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
|
375
|
+
}else{
|
376
|
+
pos.breaks <- c(lim[,2], tail(lim[,3], n=1))
|
377
|
+
}
|
378
|
+
}
|
379
|
+
if(length(id.breaks)==1){
|
380
|
+
id.breaks <- seq(min(rec[,rec.idcol]), max(rec[,rec.idcol]),
|
381
|
+
length.out=id.breaks+1);
|
382
|
+
}
|
383
|
+
|
384
|
+
# Run in parallel
|
385
|
+
if(nrow(rec) < 200) threads <- 1 # It doesn't worth the overhead
|
386
|
+
cl <- makeCluster(threads)
|
387
|
+
rec.l <- list()
|
388
|
+
thl <- ceiling(nrow(rec)/threads)
|
389
|
+
for(i in 0:(threads-1)){
|
390
|
+
rec.l[[i+1]] <- list(rec=rec[ (i*thl+1):min(((i+1)*thl),nrow(rec)), ],
|
391
|
+
verbose=ifelse(i==0, verbose, FALSE))
|
392
|
+
}
|
393
|
+
counts.l <- clusterApply(cl, rec.l, enve.recplot2.__counts,
|
394
|
+
pos.breaks=pos.breaks, id.breaks=id.breaks,
|
395
|
+
rec.idcol=rec.idcol)
|
396
|
+
counts <- counts.l[[1]]
|
397
|
+
if(threads>1) for(i in 2:threads) counts <- counts + counts.l[[i]]
|
398
|
+
stopCluster(cl)
|
399
|
+
|
400
|
+
# Estimate 1D histograms
|
401
|
+
if(verbose) cat("Building histograms.\n")
|
402
|
+
id.mids <- (id.breaks[-length(id.breaks)]+id.breaks[-1])/2;
|
403
|
+
id.ingroup <- (id.mids > id.cutoff);
|
404
|
+
id.counts <- apply(counts, 2, id.summary);
|
405
|
+
pos.counts.in <- apply(counts[,id.ingroup], 1, sum);
|
406
|
+
pos.counts.out <- apply(counts[,!id.ingroup], 1, sum);
|
407
|
+
|
408
|
+
# Plot and return
|
409
|
+
recplot <- new('enve.RecPlot2',
|
410
|
+
counts=counts, id.counts=id.counts, pos.counts.in=pos.counts.in,
|
411
|
+
pos.counts.out=pos.counts.out,
|
412
|
+
id.breaks=id.breaks, pos.breaks=pos.breaks,
|
413
|
+
seq.breaks=c(lim[1,2], lim[,3]), seq.names=lim[,1],
|
414
|
+
id.ingroup=id.ingroup,id.metric=id.metric,
|
415
|
+
call=match.call());
|
416
|
+
if(plot){
|
417
|
+
if(verbose) cat("Plotting.\n")
|
418
|
+
peaks <- plot(recplot, ...);
|
419
|
+
attr(recplot, "peaks") <- peaks
|
420
|
+
}
|
421
|
+
return(recplot);
|
422
|
+
### Returns an object of class `enve.RecPlot2`.
|
423
|
+
}
|
424
|
+
|
425
|
+
enve.recplot2.findPeaks <- function(
|
426
|
+
### Identifies peaks in the population histogram potentially indicating
|
427
|
+
### sub-population mixtures.
|
428
|
+
x,
|
429
|
+
### An `enve.RecPlot2` object.
|
430
|
+
min.points=10,
|
431
|
+
### Minimum number of points in the quantile-estimation-range
|
432
|
+
### (`quant.est`) to estimate a peak.
|
433
|
+
quant.est=c(0.002, 0.998),
|
434
|
+
### Range of quantiles to be used in the estimation of a peak's
|
435
|
+
### parameters.
|
436
|
+
mlv.opts=list(method='parzen'),
|
437
|
+
### Options passed to `mlv` to estimate the mode.
|
438
|
+
fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
|
439
|
+
start=list(omega=1, alpha=-1), lower=c(1e-6, -Inf, 0),
|
440
|
+
upper=c(Inf, 0, Inf)),
|
441
|
+
### Options passed to `fitdist` to estimate the standard deviation if
|
442
|
+
### with.skewness=TRUE. Note that the `start` parameter will be ammended
|
443
|
+
### with xi=estimated mode for each peak.
|
444
|
+
fitdist.opts.norm=list(distr='norm', method='qme', probs=c(.4,.6),
|
445
|
+
start=list(sd=1), lower=c(1e-8, 0)),
|
446
|
+
### Options passed to `fitdist` to estimate the standard deviation if
|
447
|
+
### with.skewness=FALSE. Note that the `start` parameter will be ammended
|
448
|
+
### with mean=estimated mode for each peak.
|
449
|
+
rm.top=0.05,
|
450
|
+
### Top-values to remove before finding peaks, as a quantile probability.
|
451
|
+
### This step is useful to remove highly conserved regions, but can be
|
452
|
+
### turned off by setting rm.top=0. The quantile is determined *after*
|
453
|
+
### removing zero-coverage windows.
|
454
|
+
with.skewness=TRUE,
|
455
|
+
### Allow skewness correction of the peaks. Typically, the
|
456
|
+
### sequencing-depth distribution for a single peak is left-skewed, due
|
457
|
+
### partly (but not exclusively) to fragmentation and mapping sensitivity.
|
458
|
+
### See Lindner et al 2013, Bioinformatics 29(10):1260-7 for an
|
459
|
+
### alternative solution for the first problem (fragmentation) called
|
460
|
+
### "tail distribution".
|
461
|
+
optim.rounds=200,
|
462
|
+
### Maximum rounds of peak optimization.
|
463
|
+
optim.epsilon=1e-8,
|
464
|
+
### Trace change at which optimization stops (unless `optim.rounds` is
|
465
|
+
### reached first). The trace change is estimated as the sum of square
|
466
|
+
### differences between parameters in one round and those from two rounds
|
467
|
+
### earlier (to avoid infinite loops from approximation).
|
468
|
+
merge.logdist=log(1.75),
|
469
|
+
### Maximum value of |log-ratio| between centrality parameters in peaks to
|
470
|
+
### attempt merging. The default of ~0.22 corresponds to a maximum
|
471
|
+
### difference of 25%.
|
472
|
+
verbose=FALSE
|
473
|
+
### Display (mostly debugging) information.
|
474
|
+
){
|
475
|
+
|
476
|
+
# Essential vars
|
477
|
+
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)];
|
478
|
+
seqdepth.in <- x$pos.counts.in/pos.binsize;
|
479
|
+
lsd1 <- seqdepth.in[seqdepth.in>0];
|
480
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
481
|
+
if(with.skewness){
|
482
|
+
fitdist.opts <- fitdist.opts.sn
|
483
|
+
}else{
|
484
|
+
fitdist.opts <- fitdist.opts.norm
|
485
|
+
}
|
486
|
+
peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
|
487
|
+
mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
|
488
|
+
optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
|
489
|
+
n.total=length(lsd1), merge.logdist=merge.logdist)
|
490
|
+
|
491
|
+
# Find seed peaks
|
492
|
+
if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
|
493
|
+
peaks <- enve.recplot2.__findPeaks(peaks.opts);
|
494
|
+
|
495
|
+
# Merge overlapping peaks
|
496
|
+
if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
|
497
|
+
merged <- (length(peaks)>1)
|
498
|
+
while(merged){
|
499
|
+
merged <- FALSE
|
500
|
+
ignore <- c()
|
501
|
+
peaks2 <- list();
|
502
|
+
for(i in 1:length(peaks)){
|
503
|
+
if(i %in% ignore) next
|
504
|
+
p <- peaks[[ i ]]
|
505
|
+
j <- enve.recplot2.__whichClosestPeak(p, peaks)
|
506
|
+
p2 <- peaks[[ j ]]
|
507
|
+
dst.a <- p$param.hat[[ length(p$param.hat) ]]
|
508
|
+
dst.b <- p2$param.hat[[ length(p2$param.hat) ]]
|
509
|
+
if( abs(log(dst.a/dst.b)) < merge.logdist ){
|
510
|
+
if(verbose) cat('==> Attempting a merge at',
|
511
|
+
p$param.hat[[ length(p$param.hat) ]],'&',
|
512
|
+
p2$param.hat[[ length(p2$param.hat) ]],'X\n');
|
513
|
+
peaks.opts$lsd1 <- c(p$values, p2$values)
|
514
|
+
p.new <- enve.recplot2.__findPeaks(peaks.opts)
|
515
|
+
if(length(p.new)==1){
|
516
|
+
peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
|
517
|
+
ignore <- c(ignore, j)
|
518
|
+
merged <- TRUE
|
519
|
+
}
|
520
|
+
}
|
521
|
+
if(!merged) peaks2[[ length(peaks2)+1 ]] <- p
|
522
|
+
}
|
523
|
+
peaks <- peaks2
|
524
|
+
if(length(peaks)==1) break
|
525
|
+
}
|
526
|
+
|
527
|
+
if(verbose) cat('Found',length(peaks),'peak(s)\n')
|
528
|
+
return(peaks);
|
529
|
+
### Returns a list of `enve.RecPlot2.Peak` objects.
|
530
|
+
}
|
531
|
+
|
532
|
+
#==============> Define utils
|
533
|
+
enve.recplot2.corePeak <- function
|
534
|
+
### Finds the peak in a list of peaks that is most likely to represent the
|
535
|
+
### "core genome" of a population.
|
536
|
+
(x
|
537
|
+
### `list` of `enve.RecPlot2.Peak` objects.
|
538
|
+
){
|
539
|
+
# Find the peak with maximum depth (centrality)
|
540
|
+
maxPeak <- x[[
|
541
|
+
which.max(as.numeric(lapply(x,
|
542
|
+
function(y) y$param.hat[[ length(y$param.hat) ]])))
|
543
|
+
]]
|
544
|
+
# If a "larger" peak (a peak explaining more bins of the genome) is within
|
545
|
+
# the "merge.logdist" distance, take that one instead.
|
546
|
+
corePeak <- maxPeak
|
547
|
+
for(p in x){
|
548
|
+
sz.d = log(length(p$values)/length(corePeak$values))
|
549
|
+
if(sz.d < 0)
|
550
|
+
next;
|
551
|
+
sq.d.a <- p$param.hat[[ length(p$param.hat) ]]
|
552
|
+
sq.d.b <- maxPeak$param.hat[[ length(maxPeak$param.hat) ]]
|
553
|
+
if(abs(log(sq.d.a/sq.d.b )) < maxPeak$merge.logdist+sz.d/5)
|
554
|
+
corePeak <- p
|
555
|
+
}
|
556
|
+
return(corePeak)
|
557
|
+
}
|
558
|
+
|
559
|
+
enve.recplot2.changeCutoff <- function
|
560
|
+
### Change the intra-species cutoff of an existing recruitment plot.
|
561
|
+
(rp,
|
562
|
+
### enve.RecPlot2 object.
|
563
|
+
new.cutoff=98
|
564
|
+
### New cutoff to use.
|
565
|
+
){
|
566
|
+
# Re-calculate vectors
|
567
|
+
id.mids <- (rp$id.breaks[-length(rp$id.breaks)]+rp$id.breaks[-1])/2
|
568
|
+
id.ingroup <- (id.mids > new.cutoff)
|
569
|
+
pos.counts.in <- apply(rp$counts[,id.ingroup], 1, sum)
|
570
|
+
pos.counts.out <- apply(rp$counts[,!id.ingroup], 1, sum)
|
571
|
+
# Update object
|
572
|
+
attr(rp, "id.ingroup") <- id.ingroup
|
573
|
+
attr(rp, "pos.counts.in") <- pos.counts.in
|
574
|
+
attr(rp, "pos.counts.out") <- pos.counts.out
|
575
|
+
attr(rp, "call") <- match.call()
|
576
|
+
return(rp)
|
577
|
+
}
|
578
|
+
|
579
|
+
enve.recplot2.extractWindows <- function
|
580
|
+
### Extract windows significantly below (or above) the peak in sequencing
|
581
|
+
### depth.
|
582
|
+
(rp,
|
583
|
+
### Recruitment plot, a enve.Recplot2 object.
|
584
|
+
peak,
|
585
|
+
### Peak, a enve.RecPlot2.Peak object. If list, it is assumed to be a list
|
586
|
+
### of enve.RecPlot2.Peak objects, in which case the core peak is used
|
587
|
+
### (see enve.recplot2.corePeak).
|
588
|
+
lower.tail=TRUE,
|
589
|
+
### If FALSE, it returns windows significantly above the peak in
|
590
|
+
### sequencing depth.
|
591
|
+
significance=0.05,
|
592
|
+
### Significance threshold (alpha) to select windows.
|
593
|
+
seq.names=FALSE
|
594
|
+
### Returns subject sequence names instead of a vector of Booleans. It
|
595
|
+
### assumes that the recruitment plot was generated with pos.breaks=0.
|
596
|
+
){
|
597
|
+
# Determine the threshold
|
598
|
+
if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
|
599
|
+
par <- peak$param.hat
|
600
|
+
par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
|
601
|
+
thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
|
602
|
+
|
603
|
+
# Estimate sequencing depths per window
|
604
|
+
pos.cnts.in <- rp$pos.counts.in
|
605
|
+
pos.breaks <- rp$pos.breaks
|
606
|
+
pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
|
607
|
+
seqdepth.in <- pos.cnts.in/pos.binsize
|
608
|
+
|
609
|
+
# Select windows past the threshold
|
610
|
+
if(lower.tail){
|
611
|
+
sel <- seqdepth.in < thr
|
612
|
+
}else{
|
613
|
+
sel <- seqdepth.in > thr
|
614
|
+
}
|
615
|
+
if(!seq.names) return(sel)
|
616
|
+
if(length(seqdepth.in) != length(rp$seq.names))
|
617
|
+
stop(paste("Requesting subject sequence names, but the recruitment plot",
|
618
|
+
"was not generated with pos.breaks=0."))
|
619
|
+
return(rp$seq.names[sel])
|
620
|
+
}
|
621
|
+
|
622
|
+
#==============> Define internal functions
|
623
|
+
enve.recplot2.__counts <- function
|
624
|
+
### Internal ancilliary function (see `enve.recplot2`).
|
625
|
+
(x, pos.breaks, id.breaks, rec.idcol){
|
626
|
+
rec <- x$rec
|
627
|
+
verbose <- x$verbose
|
628
|
+
counts <- matrix(0, nrow=length(pos.breaks)-1, ncol=length(id.breaks)-1);
|
629
|
+
for(i in 1:nrow(rec)){
|
630
|
+
if(verbose & i%%100==0) cat(" [",signif(i*100/nrow(rec),3),"% ] \r");
|
631
|
+
y.bin <- which(
|
632
|
+
rec[i,rec.idcol]>=id.breaks[-length(id.breaks)] &
|
633
|
+
rec[i,rec.idcol]<=id.breaks[-1])[1] ;
|
634
|
+
for(pos in rec[i,1]:rec[i,2]){
|
635
|
+
x.bin <- which(
|
636
|
+
pos>=pos.breaks[-length(pos.breaks)] & pos<=pos.breaks[-1])[1] ;
|
637
|
+
counts[x.bin, y.bin] <- counts[x.bin, y.bin]+1 ;
|
638
|
+
}
|
639
|
+
}
|
640
|
+
return(counts);
|
641
|
+
}
|
642
|
+
|
643
|
+
enve.recplot2.__peakHist <- function
|
644
|
+
### Internal ancilliary function (see `enve.RecPlot2.Peak`).
|
645
|
+
(x, mids, counts=TRUE){
|
646
|
+
d.o <- x$param.hat
|
647
|
+
d.o$x <- mids
|
648
|
+
prob <- do.call(paste('d', x$dist, sep=''), d.o)
|
649
|
+
if(!counts) return(prob)
|
650
|
+
if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
|
651
|
+
return(prob*x$n.hat/sum(prob))
|
652
|
+
}
|
653
|
+
|
654
|
+
enve.recplot2.__findPeak <- function
|
655
|
+
### Internall ancilliary function (see `enve.recplot2.findPeaks`).
|
656
|
+
(lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
|
657
|
+
optim.rounds, optim.epsilon, n.total, merge.logdist, verbose
|
658
|
+
){
|
659
|
+
dist <- ifelse(with.skewness, 'sn', 'norm');
|
660
|
+
|
661
|
+
# Find peak
|
662
|
+
o <- mlv.opts; o$x = lsd1;
|
663
|
+
mode1 <- do.call(mlv, o)$M;
|
664
|
+
if(verbose) cat('Anchoring at mode =',mode1,'\n')
|
665
|
+
param.hat <- fitdist.opts$start; last.hat <- param.hat;
|
666
|
+
lim <- NA;
|
667
|
+
if(with.skewness){ param.hat$xi <- mode1 }else{ param.hat$mean <- mode1 }
|
668
|
+
|
669
|
+
# Refine peak parameters
|
670
|
+
for(round in 1:optim.rounds){
|
671
|
+
param.hat[[ 1 ]] <- param.hat[[ 1 ]]/diff(quant.est)# <- expand dispersion
|
672
|
+
lim.o <- param.hat
|
673
|
+
lim.o$p <- quant.est; lim <- do.call(paste('q',dist,sep=''), lim.o)
|
674
|
+
lsd1.pop <- lsd1[(lsd1>lim[1]) & (lsd1<lim[2])];
|
675
|
+
if(verbose) cat(' Round', round, 'with n =',length(lsd1.pop),
|
676
|
+
'and params =',as.numeric(param.hat),' \r')
|
677
|
+
if(length(lsd1.pop) < min.points) break;
|
678
|
+
o <- fitdist.opts; o$data = lsd1.pop; o$start = param.hat;
|
679
|
+
last.last.hat <- last.hat
|
680
|
+
last.hat <- param.hat
|
681
|
+
param.hat <- as.list(do.call(fitdist, o)$estimate);
|
682
|
+
if(any(is.na(param.hat))){
|
683
|
+
if(round>1) param.hat <- last.hat;
|
684
|
+
break;
|
685
|
+
}
|
686
|
+
epsilon <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
|
687
|
+
if(round>2) if(epsilon < optim.epsilon) break;
|
688
|
+
}
|
689
|
+
if(verbose) cat('\n')
|
690
|
+
if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
|
691
|
+
|
692
|
+
# Mow distribution
|
693
|
+
lsd2 <- c();
|
694
|
+
lsd.pop <- c();
|
695
|
+
n.hat <- length(lsd1.pop)/diff(quant.est)
|
696
|
+
peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
|
697
|
+
param.hat=param.hat, n.hat=n.hat, n.total=n.total,
|
698
|
+
merge.logdist=merge.logdist)
|
699
|
+
peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
|
700
|
+
peak.cnt <- enve.recplot2.__peakHist(peak,
|
701
|
+
(peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
|
702
|
+
for(i in 2:length(peak.breaks)){
|
703
|
+
values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
|
704
|
+
n.exp <- peak.cnt[i-1]
|
705
|
+
if(n.exp==0) n.exp=0.1
|
706
|
+
if(length(values)==0) next
|
707
|
+
in.peak <- runif(length(values)) <= n.exp/length(values)
|
708
|
+
lsd2 <- c(lsd2, values[!in.peak])
|
709
|
+
lsd.pop <- c(lsd.pop, values[in.peak])
|
710
|
+
}
|
711
|
+
if(length(lsd.pop) < min.points) return(NULL)
|
712
|
+
|
713
|
+
# Return peak
|
714
|
+
attr(peak, 'values') <- lsd.pop
|
715
|
+
attr(peak, 'values.res') <- lsd2
|
716
|
+
attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
|
717
|
+
plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
|
718
|
+
plot=FALSE)$counts)+1)/2
|
719
|
+
if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
|
720
|
+
'with expected n =',n.hat,'\n')
|
721
|
+
return(peak)
|
722
|
+
}
|
723
|
+
|
724
|
+
enve.recplot2.__findPeaks <- function
|
725
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks`).
|
726
|
+
(peaks.opts){
|
727
|
+
peaks <- list()
|
728
|
+
while(length(peaks.opts$lsd1) > peaks.opts$min.points){
|
729
|
+
peak <- do.call(enve.recplot2.__findPeak, peaks.opts)
|
730
|
+
if(is.null(peak)) break
|
731
|
+
peaks[[ length(peaks)+1 ]] <- peak
|
732
|
+
peaks.opts$lsd1 <- peak$values.res
|
733
|
+
}
|
734
|
+
return(peaks)
|
735
|
+
}
|
736
|
+
|
737
|
+
|
738
|
+
enve.recplot2.__whichClosestPeak <- function
|
739
|
+
### Internal ancilliary function (see `enve.recplot2.findPeaks`).
|
740
|
+
(peak, peaks){
|
741
|
+
dist <- as.numeric(lapply(peaks, function(x) abs(log(x$param.hat[[ length(x$param.hat) ]]/peak$param.hat[[ length(peak$param.hat) ]] ))))
|
742
|
+
dist[ dist==0 ] <- Inf
|
743
|
+
return(which.min(dist))
|
744
|
+
}
|
745
|
+
|