miga-base 0.7.26.0 → 1.0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +906 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +165 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +55 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +419 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +100 -0
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +80 -0
- data/utils/enveomics/enveomics.R/README.md +81 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +304 -3
data/utils/distance/commands.rb
CHANGED
@@ -169,6 +169,7 @@ module MiGA::DistanceRunner::Commands
|
|
169
169
|
aai_data[out[1]] = [out[6].to_f, 0, 0, 0] if out[6] !~ /^>/
|
170
170
|
end
|
171
171
|
end
|
172
|
+
puts "Results: #{haai_data.size} | Inferences: #{aai_data.size}"
|
172
173
|
batch_data_to_db(:haai, haai_data)
|
173
174
|
batch_data_to_db(:aai, aai_data)
|
174
175
|
|
data/utils/distance/database.rb
CHANGED
data/utils/distance/runner.rb
CHANGED
@@ -18,10 +18,8 @@ class MiGA::DistanceRunner
|
|
18
18
|
@ref_project = MiGA::Project.load(ref_path)
|
19
19
|
raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
|
20
20
|
elsif !opts[:run_taxonomy] && dataset.option(:db_project)
|
21
|
-
|
22
|
-
|
23
|
-
ref_path = File.expand_path(ref_path, project.option(:db_proj_dir))
|
24
|
-
end
|
21
|
+
ref_location = project.option(:db_proj_dir) || File.dirname(project.path)
|
22
|
+
ref_path = File.expand_path(dataset.option(:db_project), ref_location)
|
25
23
|
@ref_project = MiGA::Project.load(ref_path)
|
26
24
|
raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
|
27
25
|
else
|
@@ -0,0 +1,244 @@
|
|
1
|
+
# Recruitment plots
|
2
|
+
|
3
|
+
## Aims
|
4
|
+
|
5
|
+
This document aims to cover the technical aspects of the recruitment plot functions in the
|
6
|
+
`enveomics.R` package, focusing on the peak finder and gene-content diversity analyses.
|
7
|
+
|
8
|
+
## Caveats
|
9
|
+
|
10
|
+
This is a __*working document*__, describing unstable and/or experimental code. The material
|
11
|
+
here is subject to change without warning; pay attention to the modification date and (if
|
12
|
+
in doubt) the commit history. The definitions and default parameters of the functions described
|
13
|
+
here may change in the near future as a result of further experimentation or more stable
|
14
|
+
implementations.
|
15
|
+
|
16
|
+
The current document was generated and tested with the `enveomics.R` package version 1.3. To
|
17
|
+
check your current version in R, use `packageVersion('enveomics.R')`.
|
18
|
+
|
19
|
+
> **IMPORTANT**: Some of the functions described here may return unexpected results with your data.
|
20
|
+
> Carefully evaluate all your results.
|
21
|
+
|
22
|
+
---
|
23
|
+
|
24
|
+
## Package: `enveomics.R`
|
25
|
+
|
26
|
+
The functionalities described here are provided by the `enveomics.R` package. Some features
|
27
|
+
described here are updated more frequently than the official
|
28
|
+
[CRAN releases](https://CRAN.R-project.org/package=enveomics.R). In order to have the latest
|
29
|
+
updates (package HEAD), download (or update), and install this git repository.
|
30
|
+
|
31
|
+
### Quick installation guide
|
32
|
+
|
33
|
+
:globe_with_meridians: To install the latest stable version available in CRAN, use in R:
|
34
|
+
|
35
|
+
```R
|
36
|
+
install.packages(c('enveomics.R','optparse'))
|
37
|
+
```
|
38
|
+
|
39
|
+
:octocat: To install the latest HEAD version (potentially unstable) available in GitHub, use in R:
|
40
|
+
|
41
|
+
```R
|
42
|
+
install.packages('devtools')
|
43
|
+
library('devtools')
|
44
|
+
install_github('lmrodriguezr/enveomics', subdir='enveomics.R')
|
45
|
+
```
|
46
|
+
|
47
|
+
---
|
48
|
+
|
49
|
+
## Recruitment plots: `enve.recplot2`
|
50
|
+
|
51
|
+
The first step in this analysis is the mapping of reads to the genome, processed with
|
52
|
+
[BlastTab.catsbj.pl](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.catsbj.pl).
|
53
|
+
We'll assume the mapping is saved in the file `my-mapping.tab` and this is also the
|
54
|
+
prefix of the processed files.
|
55
|
+
|
56
|
+
Once you have these input files (`.rec` and `.lim`), you can build the recruitment plot.
|
57
|
+
For this, you'll have two options.
|
58
|
+
|
59
|
+
### Option 1: Using the `BlastTab.recplot2.R` stand-alone script
|
60
|
+
|
61
|
+
The stand-alone script
|
62
|
+
[BlastTab.recplot2.R](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.recplot2.R)
|
63
|
+
is the easiest option to run, and should be the preferred method if you're automating
|
64
|
+
this analysis to process several mappings, but it doesn't offer access to advanced options.
|
65
|
+
|
66
|
+
You can run it like this using two CPUs:
|
67
|
+
|
68
|
+
```bash
|
69
|
+
BlastTab.recplot2.R --prefix my-mapping.tab --threads 2 my-recplot.rdata my-recplot.pdf
|
70
|
+
```
|
71
|
+
|
72
|
+
> **NOTE 1**: It's NOT recommended to map reads against genes; the recommended strategy is to
|
73
|
+
> map against contigs. However, if you did map reads against genes, you may want to use the
|
74
|
+
> `--pos-breaks 0` option to use each gene as a recruitment window.
|
75
|
+
>
|
76
|
+
> **NOTE 2**: If you want to plot the population peaks at this step, simply pass the
|
77
|
+
> `--peaks-col darkred` option.
|
78
|
+
|
79
|
+
Now you should have two output files: `my-recplot.rdata`, containing your `enve.RecPlot2` R
|
80
|
+
object, and `my-recplot.pdf` with the graphical output of the recruitment plot.
|
81
|
+
|
82
|
+
### Option 2: Using the `enve.recplot2` R function
|
83
|
+
|
84
|
+
If you require access to advanced options, or for some other reason prefer to calculate the
|
85
|
+
recruitment plot interactively, you can directly use the `enve.recplot2` R function. This is
|
86
|
+
an example session in R:
|
87
|
+
|
88
|
+
```R
|
89
|
+
# Load the package
|
90
|
+
library(enveomics.R)
|
91
|
+
# Open the PDF
|
92
|
+
pdf('my-recplot.pdf')
|
93
|
+
# Build and plot the object using two threads and no peak detection
|
94
|
+
# (to turn on peak detection, simply remove `peaks.col=NA`)
|
95
|
+
rp <- enve.recplot2('my-mapping.tab', threads=2, peaks.col=NA)
|
96
|
+
# Close the PDF
|
97
|
+
dev.off()
|
98
|
+
# Save the object
|
99
|
+
save(rp, file='my-recplot.rdata')
|
100
|
+
```
|
101
|
+
|
102
|
+
> **IMPORTANT**: Remember to save the `enve.RecPlot2` R object (that's the last line above)
|
103
|
+
> before closing the R session.
|
104
|
+
|
105
|
+
Naturally, you may want to see what other (advanced) options you have. You can access the
|
106
|
+
documentation of the function in R using `?enve.recplot2`.
|
107
|
+
|
108
|
+
---
|
109
|
+
|
110
|
+
## Summary statistics
|
111
|
+
|
112
|
+
Here we explore some frequently used summary statistics from recruitment plots. First, load the
|
113
|
+
package and the `enve.RecPlot2` object you saved previously, in R:
|
114
|
+
|
115
|
+
```R
|
116
|
+
library(enveomics.R)
|
117
|
+
load('my-recplot.rdata')
|
118
|
+
```
|
119
|
+
|
120
|
+
### Centrality measures of sequencing depth
|
121
|
+
|
122
|
+
```R
|
123
|
+
mean(enve.recplot2.seqdepth(rp)) # <- Average
|
124
|
+
median(enve.recplot2.seqdepth(rp)) # <- Median
|
125
|
+
enve.truncate(enve.recplot2.seqdepth(rp)) # <- 95% Central Truncated Mean
|
126
|
+
enve.truncate(enve.recplot2.seqdepth(rp), 0.9) # <- 90% Central Truncated Mean
|
127
|
+
```
|
128
|
+
|
129
|
+
The functions above only use hits with identity above the cutoff for "in-group" (by default: 95%).
|
130
|
+
In order to estimate the sequencing depth with a different identity cutoff, modify the cutoff first:
|
131
|
+
|
132
|
+
```R
|
133
|
+
rp98 <- enve.recplot2.changeCutoff(rp, 98) # <- Change to ≥98%
|
134
|
+
mean(enve.recplot2.seqdepth(rp98)) # <- Average (for the new object)
|
135
|
+
median(enve.recplot2.seqdepth(rp98)) # <- Median (for the new object)
|
136
|
+
```
|
137
|
+
|
138
|
+
### Average and median sequencing depth excluding zero-coverage windows
|
139
|
+
|
140
|
+
```R
|
141
|
+
seqdepth <- enve.recplot2.seqdepth(rp)
|
142
|
+
mean(seqdepth[seqdepth>0]) # <- Average
|
143
|
+
median(seqdepth[seqdepth>0]) # <- Median
|
144
|
+
```
|
145
|
+
|
146
|
+
### Average Nucleotide Identity from reads (ANIr)
|
147
|
+
|
148
|
+
```R
|
149
|
+
enve.recplot2.ANIr(rp) # <- Complete recruitment plot
|
150
|
+
enve.recplot2.ANIr(rp, c(90,100)) # <- All reads above 90% (recommended for intra-population)
|
151
|
+
enve.recplot2.ANIr(rp, c(95,100)) # <- Reads above 95%
|
152
|
+
enve.recplot2.ANIr(rp, c( 0, 90)) # <- Between populations (other species)
|
153
|
+
```
|
154
|
+
|
155
|
+
### Coordinates of each sequence window with their respective sequencing depth
|
156
|
+
|
157
|
+
```R
|
158
|
+
d <- enve.recplot2.coordinates(rp)
|
159
|
+
d$seqdepth <- enve.recplot2.seqdepth(rp)
|
160
|
+
d
|
161
|
+
```
|
162
|
+
|
163
|
+
### Sequencing breadth (upper boundary)
|
164
|
+
|
165
|
+
This estimate depends on the window size. The smaller the window size, the better the
|
166
|
+
estimate. When the window size is 1bp, the estimate is exact; otherwise it's consistently
|
167
|
+
biased (overestimate).
|
168
|
+
|
169
|
+
```R
|
170
|
+
mean(enve.recplot2.seqdepth(rp) > 0)
|
171
|
+
```
|
172
|
+
|
173
|
+
---
|
174
|
+
|
175
|
+
## Peak-finder: `enve.recplot2.findPeaks`
|
176
|
+
|
177
|
+
In this step we will try to identify one or multiple population peaks corresponding to different
|
178
|
+
sub-populations and/or composites of sub-populations.
|
179
|
+
|
180
|
+
> **NOTE** This step can be performed together with the step above, but we separate it here for
|
181
|
+
> two reasons: **(1)** This step is much more unstable but less computationally demanding than the
|
182
|
+
> step before, so it makes sense to re-run only this part with different parameters and/or
|
183
|
+
> package updates; and **(2)** We want to save the R objects independently, so the following steps
|
184
|
+
> are more clear.
|
185
|
+
|
186
|
+
In R:
|
187
|
+
|
188
|
+
```R
|
189
|
+
# Load the package
|
190
|
+
library(enveomics.R)
|
191
|
+
# Load the `enve.RecPlot2` object you saved previously
|
192
|
+
load('my-recplot.rdata')
|
193
|
+
# Find the peaks
|
194
|
+
peaks <- enve.recplot2.findPeaks(rp)
|
195
|
+
# Save the peaks R object (optional)
|
196
|
+
save(peaks, file='my-recplot-peaks.rdata')
|
197
|
+
# Plot the peaks in a PDF (optional)
|
198
|
+
pdf('my-recplot-peaks.pdf')
|
199
|
+
p <- plot(rp, use.peaks=peaks, layout=4) # <- Remove `layout=4` for the full plot
|
200
|
+
dev.off()
|
201
|
+
```
|
202
|
+
|
203
|
+
The key function here is `enve.recplot2.findPeaks`. This function has several parameters, depending on
|
204
|
+
the method used. To see all supported methods, use `?enve.recplot2.findPeaks`. To see all the options
|
205
|
+
of the default method (`'emauto'`) use `?enve.recplot2.findPeaks.emauto`.
|
206
|
+
|
207
|
+
---
|
208
|
+
|
209
|
+
## Gene-content diversity: `enve.recplot2.extractWindows`
|
210
|
+
|
211
|
+
In R:
|
212
|
+
|
213
|
+
```R
|
214
|
+
# Load the package and the objects (unless you're still in the same session from the last step)
|
215
|
+
library(enveomics.R)
|
216
|
+
load('my-recplot.rdata')
|
217
|
+
load('my-recplot-peaks.rdata')
|
218
|
+
# Find the peak representing the core genome
|
219
|
+
cp <- enve.recplot2.corePeak(peaks)
|
220
|
+
#-----
|
221
|
+
# The following functions illustrate how to obtain different results. Please explore the resulting
|
222
|
+
# objects and the associated documentation
|
223
|
+
#-----
|
224
|
+
# Find the coordinates of windows significantly below the average sequencing depth
|
225
|
+
div <- enve.recplot2.extractWindows(rp, cp, seq.names=TRUE)
|
226
|
+
# Add sequencing depth
|
227
|
+
div$seqdepth <- enve.recplot2.seqdepth(rp, as.numeric(rownames(div)))
|
228
|
+
# Save the coordinates as a tab-delimited table
|
229
|
+
write.table(div, 'my-low-seqdepth.tsv', quote=FALSE, sep='\t', row.names=FALSE)
|
230
|
+
# Find all the windows with sequencing depth zero
|
231
|
+
zero <- enve.recplot2.coordinates(rp, enve.recplot2.seqdepth(rp)==0)
|
232
|
+
```
|
233
|
+
|
234
|
+
---
|
235
|
+
|
236
|
+
## To do
|
237
|
+
|
238
|
+
- [x] Document structure
|
239
|
+
- [x] Package: `enveomics.R`
|
240
|
+
- [x] Recruitment plots: `enve.recplot2`
|
241
|
+
- [x] Summary statistics
|
242
|
+
- [x] Peak-finder: `enve.recplot2.findPeaks`
|
243
|
+
- [x] Gene-content diversity: `enve.recplot2.extractWindows`
|
244
|
+
- [ ] Compare identity profiles: `enve.recplot2.compareIdentities`
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
set -e # <- So it stops if there is an error
|
7
|
+
exists() { test -e "$1" ; } # <- True if the first argument is an existing path (e.g., the first match of a glob)
|
8
|
+
|
9
|
+
OUT=$1 # <- Output file
|
10
|
+
[[ -n "$1" ]] && shift
|
11
|
+
SEQS=("$@") # <- list of all genomes
|
12
|
+
THR=2 # <- Number of threads (passed to aai.rb as -t)
|
13
|
+
DEF_DIST=0.9 # <- Default distance when AAI cannot be reliably estimated
|
14
|
+
|
15
|
+
# This is just the help message
|
16
|
+
if [[ $# -lt 2 ]] ; then
|
17
|
+
echo "
|
18
|
+
Use case: Building AAI matrices from a collection of genomes.
|
19
|
+
|
20
|
+
IMPORTANT
|
21
|
+
This script is functional, but it's mainly intended for illustrative purposes.
|
22
|
+
Please take a look at the code first.
|
23
|
+
|
24
|
+
Usage:
|
25
|
+
$0 <output.txt> <genomes...>
|
26
|
+
|
27
|
+
<output.txt> The output AAI list, in tab-delimited form containing the
|
28
|
+
following columns: (1) Sequence A, (2) Sequence B, (3)
|
29
|
+
AAI, (4) AAI-SD, (5) Proteins used, (6) Number of proteins in
|
30
|
+
the smallest genome, (7) Percentage of the genome shared.
|
31
|
+
<genomes...> The list of files containing the genomes (at least 2).
|
32
|
+
|
33
|
+
" >&2
|
34
|
+
exit
|
35
|
+
fi
|
36
|
+
|
37
|
+
# 00. Create environment
|
38
|
+
export PATH=$(dirname "$0")/../Scripts:$PATH
|
39
|
+
|
40
|
+
# 01. Calculate AAI
|
41
|
+
echo "[01/03] Calculating AAI"
|
42
|
+
for i in "${SEQS[@]}" ; do
|
43
|
+
for j in "${SEQS[@]}" ; do
|
44
|
+
echo -n " o $i vs $j: "
|
45
|
+
AAI=$(aai.rb -1 "$i" -2 "$j" -S "$OUT.db" -t "$THR" \
|
46
|
+
--no-save-rbm --auto --quiet)
|
47
|
+
echo ${AAI:-Below detection}
|
48
|
+
[[ "$i" == "$j" ]] && break
|
49
|
+
done
|
50
|
+
done
|
51
|
+
|
52
|
+
# 02. Extract matrix
|
53
|
+
echo "[02/03] Extracting list"
|
54
|
+
echo -e "SeqA\tSeqB\tAAI\tSD\tN\tOmega\tFrx" > "$OUT"
|
55
|
+
echo "select seq1, seq2, aai, sd, n, omega, (100.0*n/omega) from aai;" \
|
56
|
+
| sqlite3 "$OUT.db" | tr '|' '\t' >> "$OUT"
|
57
|
+
|
58
|
+
# 03. Make it a distance matrix.
|
59
|
+
echo "[03/03] Generating distance matrix"
|
60
|
+
echo "
|
61
|
+
source('$(dirname $0)/../enveomics.R/R/df2dist.R');
|
62
|
+
a <- read.table('$OUT', sep = '\\t', header = TRUE, as.is = TRUE, quote = '');
|
63
|
+
aai.d <- enve.df2dist(a, default.d = $DEF_DIST, max.sim = 100);
|
64
|
+
write.table(as.matrix(aai.d), '$OUT.dist',
|
65
|
+
quote = FALSE, col.names = NA, row.names = TRUE, sep = '\\t')
|
66
|
+
" | R --vanilla >/dev/null
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
set -e # <- So it stops if there is an error
|
7
|
+
exists() { test -e "$1" ; } # <- True if the first argument is an existing path (e.g., the first match of a glob)
|
8
|
+
|
9
|
+
OUT=$1 # <- Output file
|
10
|
+
[[ -n "$1" ]] && shift
|
11
|
+
SEQS=("$@") # <- list of all genomes
|
12
|
+
THR=2 # <- Number of threads (passed to ani.rb as -t)
|
13
|
+
DEF_DIST=0.9 # <- Default distance when ANI cannot be reliably estimated
|
14
|
+
|
15
|
+
# This is just the help message
|
16
|
+
if [[ $# -lt 2 ]] ; then
|
17
|
+
echo "
|
18
|
+
Use case: Building ANI matrices from a collection of genomes.
|
19
|
+
|
20
|
+
IMPORTANT
|
21
|
+
This script is functional, but it's mainly intended for illustrative purposes.
|
22
|
+
Please take a look at the code first.
|
23
|
+
|
24
|
+
Usage:
|
25
|
+
$0 <output.txt> <genomes...>
|
26
|
+
|
27
|
+
<output.txt> The output ANI list, in tab-delimited form containing the
|
28
|
+
following columns: (1) Sequence A, (2) Sequence B, (3)
|
29
|
+
ANI, (4) ANI-SD, (5) Fragments used, (6) Maximum number
|
30
|
+
of fragments, (7) Percentage of the genome shared.
|
31
|
+
<genomes...> The list of files containing the genomes (at least 2).
|
32
|
+
|
33
|
+
" >&2
|
34
|
+
exit
|
35
|
+
fi
|
36
|
+
|
37
|
+
# 00. Create environment
|
38
|
+
export PATH=$(dirname "$0")/../Scripts:$PATH
|
39
|
+
|
40
|
+
# 01. Calculate ANI
|
41
|
+
echo "[01/03] Calculating ANI"
|
42
|
+
for i in "${SEQS[@]}" ; do
|
43
|
+
for j in "${SEQS[@]}" ; do
|
44
|
+
echo -n " o $i vs $j: "
|
45
|
+
ANI=$(ani.rb -1 "$i" -2 "$j" -S "$OUT.db" -t "$THR" \
|
46
|
+
--no-save-rbm --no-save-regions --auto --quiet)
|
47
|
+
echo ${ANI:-Below detection}
|
48
|
+
[[ "$i" == "$j" ]] && break
|
49
|
+
done
|
50
|
+
done
|
51
|
+
|
52
|
+
# 02. Extract matrix
|
53
|
+
echo "[02/03] Extracting list"
|
54
|
+
echo -e "SeqA\tSeqB\tANI\tSD\tN\tOmega\tFrx" > "$OUT"
|
55
|
+
echo "select seq1, seq2, ani, sd, n, omega, (100.0*n/omega) from ani;" \
|
56
|
+
| sqlite3 "$OUT.db" | tr '|' '\t' >> "$OUT"
|
57
|
+
|
58
|
+
# 03. Make it a distance matrix.
|
59
|
+
echo "[03/03] Generating distance matrix"
|
60
|
+
echo "
|
61
|
+
source('$(dirname $0)/../enveomics.R/R/df2dist.R');
|
62
|
+
a <- read.table('$OUT', sep = '\\t', header = TRUE, as.is = TRUE, quote = '');
|
63
|
+
ani.d <- enve.df2dist(a, default.d = $DEF_DIST, max.sim = 100);
|
64
|
+
write.table(as.matrix(ani.d), '$OUT.dist',
|
65
|
+
quote = FALSE, col.names = NA, row.names = TRUE, sep = '\\t')
|
66
|
+
" | R --vanilla >/dev/null
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author Luis M. Rodriguez-R
|
5
|
+
# @update Mar-23-2016
|
6
|
+
# @license artistic license 2.0
|
7
|
+
#
|
8
|
+
|
9
|
+
set -e # <- So it stops if there is an error
|
10
|
+
exists() { test -e "$1" ; } # <- True if the first argument is an existing path (e.g., the first match of a glob)
|
11
|
+
|
12
|
+
ORG=$1 # <- Organism (see help)
|
13
|
+
THR=2 # <- Number of threads (passed to HMM.essential.rb as -t and to RAxML as -T)
|
14
|
+
|
15
|
+
# This is just the help message
|
16
|
+
if [[ "$ORG" == "" ]] ; then
|
17
|
+
echo "
|
18
|
+
Use case: Essential genes phylogeny of a species. The essential genes are a
|
19
|
+
collection of genes typically found in single copy in archaeal and bacterial
|
20
|
+
genomes
|
21
|
+
|
22
|
+
IMPORTANT
|
23
|
+
This script is functional, but it's mainly intended for illustrative purposes.
|
24
|
+
Please take a look at the code first.
|
25
|
+
|
26
|
+
Usage:
|
27
|
+
$0 <organism>
|
28
|
+
|
29
|
+
<organism> The organism to use (e.g., Streptococcus_pneumoniae).
|
30
|
+
|
31
|
+
" >&2
|
32
|
+
exit
|
33
|
+
fi
|
34
|
+
|
35
|
+
# 00. Create environment
|
36
|
+
export PATH=$(dirname "$0")/../Scripts:$PATH
|
37
|
+
if [[ -e $ORG ]] ; then
|
38
|
+
echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
|
39
|
+
exit 1
|
40
|
+
fi
|
41
|
+
mkdir $ORG
|
42
|
+
for i in 01.proteome 02.essential 03.aln 04.cat 05.raxml 06.autoprune ; do
|
43
|
+
mkdir $ORG/$i
|
44
|
+
done
|
45
|
+
|
46
|
+
# 01. Download proteomes
|
47
|
+
echo "[01/06] Downloading and gunzipping data"
|
48
|
+
RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
|
49
|
+
rm $ORG/01.proteome/assembly_summary.txt
|
50
|
+
for i in $ORG/01.proteome/* ; do
|
51
|
+
b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
|
52
|
+
if exists $i/*.faa.gz ; then
|
53
|
+
for j in $i/*.faa.gz ; do gunzip $j ; done
|
54
|
+
cat $i/*.faa > $ORG/01.proteome/$b.faa
|
55
|
+
fi
|
56
|
+
rm -R $i
|
57
|
+
done
|
58
|
+
|
59
|
+
# 02. Essential genes
|
60
|
+
echo "[02/06] Identifying essential genes"
|
61
|
+
N=0
|
62
|
+
for i in $ORG/01.proteome/*.faa ; do # <- This loop could be parallelized
|
63
|
+
genomeA=$(basename $i .faa)
|
64
|
+
dir=$ORG/02.essential/$genomeA
|
65
|
+
mkdir $dir
|
66
|
+
HMM.essential.rb -i $i -m $dir/ -R $dir/log.txt -r $genomeA -t $THR
|
67
|
+
N=$((N + 1)) # <- One more proteome processed; the total is later used to pick genes found in all of them
|
68
|
+
done
|
69
|
+
|
70
|
+
# 03. Find core and align groups
|
71
|
+
echo "[03/06] Identifying core essentials and aligning groups"
|
72
|
+
CORE_ESS=$(basename -s .faa $ORG/02.essential/*/*.faa | sort | uniq -c \
|
73
|
+
| awk '$1=='$N'{print $2}')
|
74
|
+
for b in $CORE_ESS ; do # <- This loop could be parallelized
|
75
|
+
cat $ORG/02.essential/*/$b.faa > $ORG/03.aln/$b.faa
|
76
|
+
clustalo -i $ORG/03.aln/$b.faa -o $ORG/03.aln/$b.aln #--threads=$THR
|
77
|
+
done
|
78
|
+
|
79
|
+
# 04. Concatenate alignment
|
80
|
+
echo "[04/06] Concatenating alignments and removing invariable sites"
|
81
|
+
Aln.cat.rb -I -c $ORG/04.cat/essential.raxcoords -i '|' $ORG/03.aln/*.aln \
|
82
|
+
> $ORG/04.cat/essential.aln 2> $ORG/04.cat/essential.log
|
83
|
+
|
84
|
+
# 05. Run RAxML
|
85
|
+
echo "[05/06] Inferring phylogeny"
|
86
|
+
# You REALLY should consider running the following with more threads (-T) and,
|
87
|
+
# if possible, multi-nodes using MPI
|
88
|
+
cd $ORG/05.raxml
|
89
|
+
raxmlHPC-PTHREADS -T $THR -p 1234 \
|
90
|
+
-s ../04.cat/essential.aln -q ../04.cat/essential.raxcoords \
|
91
|
+
-m PROTCATGTR -n UNUS # IMPORTANT: Please read the documentation of RAxML
|
92
|
+
# before running this line, so you know
|
93
|
+
# that you're running what you really want. Check
|
94
|
+
# options for bootstrapping and the different
|
95
|
+
# algorithms (-f). Note that -m is required, but the
|
96
|
+
# file essential.raxcoords specifies "AUTO", so RAxML will
|
97
|
+
# attempt to find the model resulting in the highest
|
98
|
+
# likelihood.
|
99
|
+
cd ../..
|
100
|
+
|
101
|
+
# 06. Autoprune
|
102
|
+
echo "[06/06] Auto-pruning the tree"
|
103
|
+
Newick.autoprune.R --t $ORG/05.raxml/RAxML_bestTree.UNUS --min_dist 0.001 \
|
104
|
+
$ORG/06.autoprune/essential-pruned.nwk
|
105
|
+
|