miga-base 0.7.26.0 → 1.0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +906 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +165 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +55 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +419 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +100 -0
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +80 -0
- data/utils/enveomics/enveomics.R/README.md +81 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +304 -3
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jul-05-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
use Symbol;
|
11
|
+
|
12
|
+
my ($file, $base, $outN) = @ARGV;
|
13
|
+
|
14
|
+
$outN ||= 2;
|
15
|
+
($file and $base) or die "
|
16
|
+
Usage
|
17
|
+
$0 in_file.fq out_base[ no_files]
|
18
|
+
|
19
|
+
in_file.fq Input file in FastQ format.
|
20
|
+
out_base Prefix for the name of the output files. It will
|
21
|
+
be appended with .<i>.fastq, where <i> is a consecutive
|
22
|
+
number starting in 1.
|
23
|
+
no_files Number of files to generate. By default: 2.
|
24
|
+
|
25
|
+
";
|
26
|
+
|
27
|
+
|
28
|
+
my @outSym = ();
|
29
|
+
for my $i (1 .. $outN){
|
30
|
+
$outSym[$i-1] = gensym;
|
31
|
+
# Open the i-th output file for writing; on failure, abort naming the exact
# path that could not be created (the .fastq file, matching the open call).
open $outSym[$i-1], ">", "$base.$i.fastq" or die "I can not create the file: $base.$i.fastq: $!\n";
|
32
|
+
}
|
33
|
+
|
34
|
+
|
35
|
+
my($i, $seq) = (-1, '');
|
36
|
+
open FILE, "<", $file or die "I can not read the file: $file: $!\n";
|
37
|
+
while(my $ln=<FILE>){
|
38
|
+
if($.%4 == 1){
|
39
|
+
print { $outSym[$i % $outN] } $seq if $seq;
|
40
|
+
$i++;
|
41
|
+
$seq = '';
|
42
|
+
}
|
43
|
+
$seq.=$ln;
|
44
|
+
}
|
45
|
+
print { $outSym[$i % $outN] } $seq if $seq;
|
46
|
+
close FILE;
|
47
|
+
|
48
|
+
for(my $j=0; $j<$outN; $j++){
|
49
|
+
close $outSym[$j];
|
50
|
+
}
|
51
|
+
|
52
|
+
print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";
|
53
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
$:.push File.expand_path('../lib', __FILE__)
|
6
|
+
require 'enveomics_rb/enveomics'
|
7
|
+
$VERSION = 1.1
|
8
|
+
|
9
|
+
o = { q: false, p: '', s: '' }
|
10
|
+
OptionParser.new do |opts|
|
11
|
+
opts.version = $VERSION
|
12
|
+
Enveomics.opt_banner(
|
13
|
+
opts, 'Generates easy-to-parse tagged reads from FastQ files',
|
14
|
+
"#{File.basename($0)} -i in.fasta -o out.fasta [options]"
|
15
|
+
)
|
16
|
+
|
17
|
+
opts.separator 'Mandatory'
|
18
|
+
opts.on(
|
19
|
+
'-i', '--in FILE',
|
20
|
+
'Path to the FastQ file containing the sequences',
|
21
|
+
'Supports compression with .gz extension, use - for STDIN'
|
22
|
+
) { |v| o[:in] = v }
|
23
|
+
opts.on(
|
24
|
+
'-o', '--out FILE', 'Path to the FastQ to create',
|
25
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
26
|
+
) { |v| o[:out] = v }
|
27
|
+
opts.separator ''
|
28
|
+
opts.separator 'ID options'
|
29
|
+
opts.on('-p', '--prefix STR', 'Prefix to use in all IDs') { |v| o[:p] = v }
|
30
|
+
opts.on('-s', '--suffix STR', 'Suffix to use in all IDs') { |v| o[:s] = v }
|
31
|
+
opts.separator ''
|
32
|
+
opts.separator 'Other Options'
|
33
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
34
|
+
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
35
|
+
opts.separator ''
|
36
|
+
end.parse!
|
37
|
+
|
38
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
39
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
40
|
+
|
41
|
+
begin
|
42
|
+
ifh = reader(o[:in])
|
43
|
+
ofh = writer(o[:out])
|
44
|
+
i = 0
|
45
|
+
lno = 0
|
46
|
+
ifh.each do |ln|
|
47
|
+
ln.chomp!
|
48
|
+
lno += 1
|
49
|
+
case lno % 4
|
50
|
+
when 1
|
51
|
+
ln =~ /^@/ or
|
52
|
+
# Report the position with lno, the counter this loop maintains itself;
# $. may not be updated by non-File readers (e.g., gzip streams).
raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}")
|
53
|
+
i += 1
|
54
|
+
ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
|
55
|
+
when 3
|
56
|
+
ln =~ /^\+/ or
|
57
|
+
# Report the position with lno, the counter this loop maintains itself;
# $. may not be updated by non-File readers (e.g., gzip streams).
raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}")
|
58
|
+
ofh.puts '+'
|
59
|
+
else
|
60
|
+
ofh.puts ln
|
61
|
+
end
|
62
|
+
end
|
63
|
+
ifh.close
|
64
|
+
ofh.close
|
65
|
+
rescue => err
|
66
|
+
$stderr.puts "Exception: #{err}\n\n"
|
67
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
68
|
+
err
|
69
|
+
end
|
70
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
o = {q:false, key:2}
|
6
|
+
ARGV << '-h' if ARGV.empty?
|
7
|
+
OptionParser.new do |opts|
|
8
|
+
opts.banner = "
|
9
|
+
Compares the estimated error of sequencing reads (Q-score) with
|
10
|
+
observed mismatches (identity against a known reference sequence).
|
11
|
+
|
12
|
+
Usage: #{$0} [options]"
|
13
|
+
opts.separator ""
|
14
|
+
opts.separator "Mandatory"
|
15
|
+
opts.on("-f", "--fastq FILE",
|
16
|
+
"Path to the FastQ file containing the sequences."){ |v| o[:fastq] = v }
|
17
|
+
opts.on("-b", "--blast FILE",
|
18
|
+
"Path to the tabular BLAST file mapping reads to reference sequences."
|
19
|
+
){ |v| o[:blast] = v }
|
20
|
+
opts.on("-o", "--out FILE",
|
21
|
+
"Path to the output tab-delimited file to create."){ |v| o[:out] = v }
|
22
|
+
opts.separator ""
|
23
|
+
opts.separator "Other Options"
|
24
|
+
# -q silences progress messages; use the `true` literal because the legacy
# TRUE constant is not defined in current Ruby versions.
opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
|
25
|
+
opts.on("-h", "--help", "Display this screen") do
|
26
|
+
puts opts
|
27
|
+
exit
|
28
|
+
end
|
29
|
+
opts.separator ""
|
30
|
+
end.parse!
|
31
|
+
abort "-f is mandatory" if o[:fastq].nil?
|
32
|
+
abort "-b is mandatory" if o[:blast].nil?
|
33
|
+
abort "-o is mandatory" if o[:out].nil?
|
34
|
+
|
35
|
+
# Read the Q scores and estimate expected mismatches
|
36
|
+
mm = {} # <- Hash with read IDs as key, and arrays as values:
|
37
|
+
# [ expected mismatches, variance of mismatches, length ]
|
38
|
+
$stderr.puts "Reading FastQ file" unless o[:q]
|
39
|
+
File.open(o[:fastq], "r") do |fh|
|
40
|
+
id = nil
|
41
|
+
fh.each_line do |ln|
|
42
|
+
case $.%4
|
43
|
+
when 1
|
44
|
+
ln =~ /^@(\S+)/ or raise "Unexpected defline format: #{ln}"
|
45
|
+
id = $1
|
46
|
+
$stderr.print " #{mm.size} reads...\r" unless o[:q]
|
47
|
+
when 0
|
48
|
+
ln.chomp!
|
49
|
+
# I'm assuming ALWAYS Phred+33!!!
|
50
|
+
p = ln.split('').map{ |i| (i.ord - 33).to_f }.map{ |q| 10.0**(-q/10.0) }
|
51
|
+
mu = p.inject(:+)
|
52
|
+
var = p.map{ |i| i*(1.0-i) }.inject(:+)
|
53
|
+
mm[id] = [mu, var, p.size]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
$stderr.puts " Found: #{mm.size} reads." unless o[:q]
|
57
|
+
end
|
58
|
+
|
59
|
+
ofh = File.open(o[:out], "w")
|
60
|
+
ofh.puts %w[id obs_subs obs_id aln_len obs_ins obs_del obs_gap mu var len].join("\t")
|
61
|
+
|
62
|
+
# Read Identities and compare against expectation
|
63
|
+
$stderr.puts "Reading Tabular BLAST file" unless o[:q]
|
64
|
+
File.open(o[:blast], "r") do |fh|
|
65
|
+
k = 0
|
66
|
+
fh.each_line do |ln|
|
67
|
+
r = ln.chomp.split("\t")
|
68
|
+
id = r[0]
|
69
|
+
next if mm[id].nil?
|
70
|
+
k += 1
|
71
|
+
$stderr.print " #{k} alignments...\r" unless o[:q]
|
72
|
+
obs_m = r[4].to_i + (r[6].to_i - 1) + (mm[id][2] - r[7].to_i)
|
73
|
+
obs_del = r[3].to_i - (r[7].to_i - r[6].to_i).abs
|
74
|
+
obs_ins = r[3].to_i - (r[9].to_i - r[8].to_i).abs
|
75
|
+
ofh.puts ([id, obs_m, r[2], r[7].to_i - r[6].to_i + 1,
|
76
|
+
obs_ins, obs_del, r[5]] + mm[id]).join("\t")
|
77
|
+
end
|
78
|
+
$stderr.puts " Found #{k} alignments." unless o[:q]
|
79
|
+
end
|
80
|
+
|
81
|
+
ofh.close
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env awk -f
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @update Dec-26-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
BEGIN {
|
9
|
+
for (i = 0; i < ARGC; i++) {
|
10
|
+
if(ARGV[i] == "--help"){
|
11
|
+
print "Description:\n"
|
12
|
+
print " Translates FastQ files into FastA.\n"
|
13
|
+
print "Usage:\n"
|
14
|
+
print " FastQ.toFastA.awk < in.fq > out.fa\n"
|
15
|
+
exit
|
16
|
+
}
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
20
|
+
NR%4 == 1, NR%4 == 2 {
|
21
|
+
if(NR%4 == 1){ gsub(/^@/,">") }
|
22
|
+
print $0
|
23
|
+
}
|
24
|
+
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
use warnings;
|
7
|
+
use strict;
|
8
|
+
use List::Util qw/min max/;
|
9
|
+
use Getopt::Std;
|
10
|
+
|
11
|
+
sub HELP_MESSAGE { die "
|
12
|
+
|
13
|
+
Description:
|
14
|
+
Generates a list of coordinates from a GFF table concatenating the subject
|
15
|
+
sequences.
|
16
|
+
|
17
|
+
See also: BlastTab.recplot2.R and BlastTab.catsbj.pl
|
18
|
+
|
19
|
+
Usage:
|
20
|
+
$0 [options] seq.fa map.gff > abs-coords.tsv
|
21
|
+
|
22
|
+
seq.fa Subject sequences (contigs) in FastA format.
|
23
|
+
map.gff Features to map in GFF.
|
24
|
+
|
25
|
+
Options:
|
26
|
+
-L path Generate a file with the absolute coordinates of the
|
27
|
+
concatenated contigs. This is identical to the .lim file
|
28
|
+
generated by BlastTab.catsbj.pl.
|
29
|
+
-i Preserve exact coordinates and include inter-feature windows as
|
30
|
+
separate bins. By default, the coordinates are set in the
|
31
|
+
midpoint between features when non-contiguous.
|
32
|
+
-s The FastA provided is to be treated as a subset of the subject.
|
33
|
+
By default, it expects all the contigs to be present in the
|
34
|
+
BLAST.
|
35
|
+
-q Run quietly.
|
36
|
+
-h Display this message and exit.
|
37
|
+
|
38
|
+
"; }
|
39
|
+
|
40
|
+
my %o;
|
41
|
+
getopts('L:isqh', \%o);
|
42
|
+
my($fa, $map) = @ARGV;
|
43
|
+
($fa and $map) or &HELP_MESSAGE;
|
44
|
+
$o{h} and &HELP_MESSAGE;
|
45
|
+
|
46
|
+
my %seq = ();
|
47
|
+
my @seq = ();
|
48
|
+
my $tot = 0;
|
49
|
+
|
50
|
+
SEQ:{
|
51
|
+
print STDERR "== Reading reference sequences\n" unless $o{q};
|
52
|
+
open FA, "<", $fa or die "Cannot read the file: $fa: $!\n";
|
53
|
+
my $cur_seq = '';
|
54
|
+
while(<FA>){
|
55
|
+
chomp;
|
56
|
+
if(m/^>(\S+)/){
|
57
|
+
my $c = $1;
|
58
|
+
$seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
|
59
|
+
push @seq, $c;
|
60
|
+
$cur_seq = $c;
|
61
|
+
}else{
|
62
|
+
s/[^A-Za-z]//g;
|
63
|
+
$seq{$cur_seq} += length $_;
|
64
|
+
}
|
65
|
+
}
|
66
|
+
close FA;
|
67
|
+
print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
|
68
|
+
}
|
69
|
+
|
70
|
+
$o{L} ||= '/dev/null';
|
71
|
+
open LIM, ">", $o{L} or die "Cannot create the file: $o{L}: $!\n";
|
72
|
+
my $l = 0;
|
73
|
+
for my $s (@seq){
|
74
|
+
print LIM "$s\t".(++$l)."\t$seq{$s}\n";
|
75
|
+
($l, $seq{$s}) = ($seq{$s}, $l);
|
76
|
+
}
|
77
|
+
close LIM;
|
78
|
+
|
79
|
+
MAP: {
|
80
|
+
print STDERR "== Reading mapping\n" unless $o{q};
|
81
|
+
open GFF, "<", $map or die "Cannot read the file: $map: $!\n";
|
82
|
+
my $last_end = 1;
|
83
|
+
my $last_name = "NA";
|
84
|
+
print "1\tNA\tNA\n";
|
85
|
+
my $i = 0;
|
86
|
+
FEATURE: while(<GFF>){
|
87
|
+
next if /^\s*(#.*)?$/; # Blank or comment lines
|
88
|
+
chomp;
|
89
|
+
my @ln = split /\t/;
|
90
|
+
$ln[4] or die "Cannot parse line $map:$.: $_\n";
|
91
|
+
unless(exists $seq{$ln[0]}){
|
92
|
+
die "Cannot find the subject sequence: $ln[0]\n" unless $o{s};
|
93
|
+
next FEATURE;
|
94
|
+
}
|
95
|
+
$i++;
|
96
|
+
my $start = $seq{$ln[0]}+$ln[3];
|
97
|
+
my $end = $seq{$ln[0]}+$ln[4];
|
98
|
+
my $name = "feat_$i";
|
99
|
+
if($ln[8] =~ /^gene_id=(\d+)/){ # <- GeneMark style
|
100
|
+
$name = "gene_id_$1";
|
101
|
+
}elsif($ln[8] =~ /^ID=\d+_(\d+)/){ # <- Prodigal style
|
102
|
+
$name = $ln[0]."_".$1;
|
103
|
+
}elsif($ln[8] =~ /^ID=([^;]+)/){
|
104
|
+
$name = $1;
|
105
|
+
}
|
106
|
+
if($o{i}){
|
107
|
+
$start = $last_end if $start < $last_end;
|
108
|
+
print "$start\t$last_name~$name\tGAP\n" unless $start==$last_end;
|
109
|
+
print "$end\t$name\tFEAT\n";
|
110
|
+
}else{
|
111
|
+
my $midpoint = int(($last_end + $start)/2);
|
112
|
+
# Default mode: close the previous feature's bin at the midpoint between its
# end and the start of the current feature, as described in the help message.
# Nothing is printed before the first feature ($last_end is still 1).
print "$midpoint\t$last_name\tFEAT\n" unless $last_end==1;
|
113
|
+
}
|
114
|
+
$last_name = $name;
|
115
|
+
$last_end = $end;
|
116
|
+
}
|
117
|
+
if($last_end > 1){
|
118
|
+
if($o{i}){
|
119
|
+
print "$l\t$last_name~NA\tGAP\n" unless $last_end==$l;
|
120
|
+
}else{
|
121
|
+
print "$l\t$last_name\tFEAT\n";
|
122
|
+
}
|
123
|
+
}
|
124
|
+
close GFF;
|
125
|
+
print STDERR " done.\n" unless $o{q};
|
126
|
+
}
|
127
|
+
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author: Luis M. Rodriguez-R
|
5
|
+
# @update: Feb-06-2015
|
6
|
+
# @license: artistic license 2.0
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'optparse'
|
10
|
+
|
11
|
+
# Defaults: verbose output, first column as key, "#" as multi-value separator.
# Uses the `false` literal; the legacy FALSE constant is not defined in Ruby 3.x.
o = {:q=>false, :k=>1, :split=>"#"}
|
12
|
+
ARGV << '-h' if ARGV.size==0
|
13
|
+
OptionParser.new do |opts|
|
14
|
+
opts.banner = "
|
15
|
+
Adds annotations to GenBank files.
|
16
|
+
|
17
|
+
Usage: #{$0} [options]"
|
18
|
+
opts.separator ""
|
19
|
+
opts.separator "Mandatory"
|
20
|
+
opts.on("-g", "--genbank FILE", "Input GenBank file."){ |v| o[:gb]=v }
|
21
|
+
opts.on("-t", "--table FILE", "Input file containing the annotations. It must be a ",
|
22
|
+
"tab-delimited raw table including a header row with ",
|
23
|
+
"the names of the fields."){ |v| o[:table]=v }
|
24
|
+
opts.on("-o", "--out FILE", "Output file containing the annotated GenBank."){ |v| o[:out]=v }
|
25
|
+
opts.separator ""
|
26
|
+
opts.separator "Other Options"
|
27
|
+
opts.on("-k", "--key NUMBER", "Key of the column to use as identifier. By default: #{o[:k]}"){ |v| o[:k] = v.to_i }
|
28
|
+
# Store the separator itself; previously this overwrote the key column (o[:k]).
opts.on("-s", "--split STRING", "String that separates multiple entries in the annotation features. By default: \"#{o[:split]}\""){ |v| o[:split] = v }
|
29
|
+
# Uses the `true` literal; the legacy TRUE constant is not defined in Ruby 3.x.
opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
|
30
|
+
opts.on("-h", "--help", "Display this screen.") do
|
31
|
+
puts opts
|
32
|
+
exit
|
33
|
+
end
|
34
|
+
opts.separator ""
|
35
|
+
end.parse!
|
36
|
+
abort "-g is mandatory" if o[:gb].nil?
|
37
|
+
abort "-t is mandatory" if o[:table].nil?
|
38
|
+
abort "-o is mandatory" if o[:out].nil?
|
39
|
+
|
40
|
+
##### MAIN:
|
41
|
+
begin
|
42
|
+
puts "Reading annotation table: #{o[:table]}." unless o[:q]
|
43
|
+
ifh = File.open(o[:table], "r")
|
44
|
+
header = ifh.gets.chomp.split(/\t/)
|
45
|
+
# Progress message: honor --quiet like the other status lines in this script.
puts " * using #{header[ o[:k]-1 ]} column as feature identifier." unless o[:q]
|
46
|
+
annot = {}
|
47
|
+
while ln=ifh.gets
|
48
|
+
row = ln.chomp.split(/\t/)
|
49
|
+
warn "WARNING: #{header[ o[:k]-1 ]} #{row[ o[:k]-1 ]} found more than once." unless annot[ row[ o[:k]-1 ] ].nil?
|
50
|
+
annot[ row[ o[:k]-1 ] ] = row
|
51
|
+
end
|
52
|
+
ifh.close
|
53
|
+
puts " * found #{annot.size} annotation entries with #{header.size} fields." unless o[:q]
|
54
|
+
puts "Annotating GenBank." unless o[:q]
|
55
|
+
ifh = File.open(o[:gb], "r")
|
56
|
+
ofh = File.open(o[:out], "w")
|
57
|
+
found = 0
|
58
|
+
notfound = 0
|
59
|
+
while ln=ifh.gets
|
60
|
+
ofh.print ln
|
61
|
+
m = /^(?<sp>\s+)\/#{header[ o[:k]-1 ]}="(?<id>.+)"/.match(ln)
|
62
|
+
next if m.nil?
|
63
|
+
if annot[ m[:id] ].nil?
|
64
|
+
notfound += 1
|
65
|
+
next
|
66
|
+
end
|
67
|
+
found += 1
|
68
|
+
annot[ m[:id] ].each_index do |i|
|
69
|
+
next if i == o[:k]-1 or annot[ m[:id] ][i]==""
|
70
|
+
# Split on the literal separator string (escaped so regex metacharacters such
# as "|" or "." are not interpreted) and emit one qualifier line per value.
annot[ m[:id] ][i].split(/#{Regexp.escape(o[:split])}/).each{ |v| ofh.puts "#{m[:sp]}/#{header[i]}=\"#{v}\"" }
|
71
|
+
end
|
72
|
+
end
|
73
|
+
ofh.close
|
74
|
+
ifh.close
|
75
|
+
puts " * annotated #{found} features." unless o[:q]
|
76
|
+
puts " * couldn't find #{notfound} features in the annotation table." unless o[:q] or notfound==0
|
77
|
+
$stderr.puts "Done.\n" unless o[:q]
|
78
|
+
rescue => err
|
79
|
+
$stderr.puts "Exception: #{err}\n\n"
|
80
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
81
|
+
err
|
82
|
+
end
|
83
|
+
|
84
|
+
|
@@ -0,0 +1,351 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @license artistic license 2.0
|
5
|
+
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
7
|
+
require 'enveomics_rb/enveomics'
|
8
|
+
use 'tmpdir'
|
9
|
+
use 'zlib'
|
10
|
+
|
11
|
+
o = {
|
12
|
+
bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
|
13
|
+
archaea: false, genomeeq: false, metagenome: false, list: false,
|
14
|
+
collection: 'dupont_2012'
|
15
|
+
}
|
16
|
+
OptionParser.new do |opts|
|
17
|
+
opts.banner = "
|
18
|
+
Finds and extracts a collection of essential proteins suitable for genome
|
19
|
+
completeness evaluation and phylogenetic analyses. Important note: most complete
|
20
|
+
bacterial genomes contain only 106/111 genes in this collection, therefore
|
21
|
+
producing a completeness of 95.5%, and most archaeal genomes only contain 26/111
|
22
|
+
genes, producing a completeness of 23.4%. Use the options --bacteria and/or
|
23
|
+
--archaea to ignore models often missing in one or both domains. Note that even
|
24
|
+
with these options, some complete archaeal genomes result in very low values of
|
25
|
+
completeness (e.g., Nanoarchaeum equitans returns 88.5%).
|
26
|
+
|
27
|
+
Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
|
28
|
+
|
29
|
+
Usage: #{$0} [options]"
|
30
|
+
opts.separator ''
|
31
|
+
opts.separator 'Mandatory'
|
32
|
+
opts.on(
|
33
|
+
'-i', '--in FILE',
|
34
|
+
'Path to the FastA file (.gz allowed) with all the proteins in a genome'
|
35
|
+
) { |v| o[:in] = v }
|
36
|
+
opts.separator ''
|
37
|
+
opts.separator 'Options'
|
38
|
+
opts.on(
|
39
|
+
'-c', '--collection STR',
|
40
|
+
'Reference collection of essential proteins to use. One of:',
|
41
|
+
'> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
|
42
|
+
' modified by https://doi.org/10.1038/ismej.2015.5',
|
43
|
+
'> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
|
44
|
+
' modified by https://doi.org/10.7717/peerj.1319'
|
45
|
+
) { |v| o[:collection] = v }
|
46
|
+
opts.on(
|
47
|
+
'-o', '--out FILE',
|
48
|
+
'Path to the output FastA file with the translated essential genes',
|
49
|
+
'By default the file is not produced'
|
50
|
+
) { |v| o[:out] = v }
|
51
|
+
opts.on(
|
52
|
+
'-m', '--per-model STR',
|
53
|
+
'Prefix of translated genes in independent files with the name of the',
|
54
|
+
'model appended. By default files are not produced'
|
55
|
+
) { |v| o[:permodel] = v }
|
56
|
+
opts.on(
|
57
|
+
'-R', '--report FILE',
|
58
|
+
'Path to the report file. By default, the report is sent to the STDOUT'
|
59
|
+
) { |v| o[:report] = v }
|
60
|
+
opts.on(
|
61
|
+
'--hmm-out FILE',
|
62
|
+
'Save HMMsearch output in this file. By default, not saved'
|
63
|
+
) { |v| o[:hmmout] = v }
|
64
|
+
opts.on(
|
65
|
+
'--alignments FILE',
|
66
|
+
'Save the aligned proteins in this file. By default, not saved'
|
67
|
+
) { |v| o[:alignments] = v }
|
68
|
+
opts.on(
|
69
|
+
'-B', '--bacteria',
|
70
|
+
'If set, ignores models typically missing in Bacteria'
|
71
|
+
) { |v| o[:bacteria] = v }
|
72
|
+
opts.on(
|
73
|
+
'-A', '--archaea',
|
74
|
+
'If set, ignores models typically missing in Archaea'
|
75
|
+
) { |v| o[:archaea] = v }
|
76
|
+
opts.on(
|
77
|
+
'-G', '--genome-eq',
|
78
|
+
'If set, ignores models not suitable for genome-equivalents estimations',
|
79
|
+
'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
|
80
|
+
) { |v| o[:genomeeq] = v }
|
81
|
+
opts.on(
|
82
|
+
'-r', '--rename STR',
|
83
|
+
'If set, renames the sequences with the string provided and appends it',
|
84
|
+
'with pipe and the gene name (except in --per-model files)'
|
85
|
+
) { |v| o[:rename] = v }
|
86
|
+
opts.on(
|
87
|
+
'-n', '--no-stats',
|
88
|
+
'If set, no statistics are reported on genome evaluation'
|
89
|
+
) { |v| o[:stats] = v }
|
90
|
+
opts.on(
|
91
|
+
'-s', '--no-genes',
|
92
|
+
'If set, statistics won\'t include the lists of missing/multi-copy genes'
|
93
|
+
) { |v| o[:genes] = v }
|
94
|
+
opts.on(
|
95
|
+
'-M', '--metagenome',
|
96
|
+
'If set, it allows for multiple copies of each gene and turns on',
|
97
|
+
'metagenomic report mode'
|
98
|
+
) { |v| o[:metagenome] = v }
|
99
|
+
opts.separator ''
|
100
|
+
opts.separator 'Other Options'
|
101
|
+
opts.on(
|
102
|
+
'-L', '--list-models',
|
103
|
+
'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
|
104
|
+
'and -q; ignores all other parameters'
|
105
|
+
) { |v| o[:list] = v }
|
106
|
+
opts.on(
|
107
|
+
'-b', '--bin DIR',
|
108
|
+
'Path to the directory containing the binaries of HMMer 3.0+'
|
109
|
+
) { |v| o[:bin] = v }
|
110
|
+
opts.on(
|
111
|
+
'--model-file',
|
112
|
+
'External file containing models to search'
|
113
|
+
) { |v| o[:model_file] = v }
|
114
|
+
opts.on(
|
115
|
+
'-t', '--threads INT', Integer,
|
116
|
+
"Number of parallel threads to be used. By default: #{o[:thr]}"
|
117
|
+
) { |v| o[:thr] = v }
|
118
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
|
119
|
+
opts.on('-h', '--help', 'Display this screen') do
|
120
|
+
puts opts
|
121
|
+
exit
|
122
|
+
end
|
123
|
+
opts.separator ''
|
124
|
+
end.parse!
|
125
|
+
abort '-i is mandatory' if o[:in].nil? and not o[:list]
|
126
|
+
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
127
|
+
o[:rename] = nil if o[:metagenome]
|
128
|
+
|
129
|
+
case o[:collection]
|
130
|
+
when 'dupont_2012'
|
131
|
+
not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
|
132
|
+
TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
|
133
|
+
TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
|
134
|
+
TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
|
135
|
+
TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
|
136
|
+
TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
|
137
|
+
TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
|
138
|
+
TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
|
139
|
+
TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
|
140
|
+
TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
|
141
|
+
TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
|
142
|
+
TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
|
143
|
+
TIGR02729 TIGR03263 TIGR03594}
|
144
|
+
not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
|
145
|
+
not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
|
146
|
+
TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d}
|
147
|
+
when 'lee_2019'
|
148
|
+
not_in_archaea = %w{ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
|
149
|
+
EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
|
150
|
+
Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
|
151
|
+
Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
|
152
|
+
Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
|
153
|
+
Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
|
154
|
+
RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC}
|
155
|
+
not_in_bacteria = %w{AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
|
156
|
+
CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
|
157
|
+
DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
|
158
|
+
Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
|
159
|
+
Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
|
160
|
+
Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
|
161
|
+
Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
|
162
|
+
Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I}
|
163
|
+
not_as_genomeeq = not_in_archaea + not_in_bacteria
|
164
|
+
else
|
165
|
+
raise "Unsupported collection: '#{o[:collection]}'"
|
166
|
+
end
|
167
|
+
|
168
|
+
begin
|
169
|
+
Dir.mktmpdir do |dir|
|
170
|
+
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
|
171
|
+
if o[:in] =~ /\.gz/
|
172
|
+
tmp_in = File.expand_path('sequences.fa', dir)
|
173
|
+
Zlib::GzipReader.open(o[:in]) do |ifh|
|
174
|
+
File.open(tmp_in, 'w') { |ofh| ofh.print ifh.read }
|
175
|
+
end
|
176
|
+
o[:in] = tmp_in
|
177
|
+
end
|
178
|
+
|
179
|
+
# Create database.
|
180
|
+
$stderr.puts 'Searching models.' unless o[:q]
|
181
|
+
models = {}
|
182
|
+
model_id = nil
|
183
|
+
dbh = File.open("#{dir}/essential.hmm", 'w')
|
184
|
+
o[:model_file] ||= File.expand_path(
|
185
|
+
"../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
|
186
|
+
mfh = (File.extname(o[:model_file]) == '.gz') ?
|
187
|
+
Zlib::GzipReader.open(o[:model_file]) :
|
188
|
+
File.open(o[:model_file], 'r')
|
189
|
+
while ln = mfh.gets
|
190
|
+
dbh.print ln
|
191
|
+
ln.chomp!
|
192
|
+
model_id = $1 if ln =~ /^NAME\s+(.+)/
|
193
|
+
models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
|
194
|
+
end
|
195
|
+
dbh.close
|
196
|
+
mfh.close
|
197
|
+
models.delete_if { |m| not_in_archaea.include? m } if o[:archaea]
|
198
|
+
models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
|
199
|
+
models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
|
200
|
+
if o[:list]
|
201
|
+
models.each_pair{ |id,desc| puts [id,desc].join("\t") }
|
202
|
+
exit
|
203
|
+
end
|
204
|
+
|
205
|
+
# Check HMMer version and run HMMsearch.
|
206
|
+
if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
|
207
|
+
raise 'You have provided an unsupported version of HMMER. ' +
|
208
|
+
'This script requires HMMER 3.0+.'
|
209
|
+
end
|
210
|
+
o[:hmmout] ||= "#{dir}/hmmsearch"
|
211
|
+
`'#{o[:bin]}hmmsearch' --cpu #{o[:thr]} --tblout '#{o[:hmmout]}' \
|
212
|
+
-A '#{dir}/a.sto' --cut_tc --notextw '#{dir}/essential.hmm' '#{o[:in]}' \
|
213
|
+
> '#{dir}/hmmsearch.log'`
|
214
|
+
|
215
|
+
# Parse output
|
216
|
+
$stderr.puts 'Parsing results.' unless o[:q]
|
217
|
+
trash = []
|
218
|
+
genes = {}
|
219
|
+
File.open(o[:hmmout], 'r') do |resh|
|
220
|
+
while ln = resh.gets
|
221
|
+
next if ln =~ /^#/
|
222
|
+
r = ln.split /\s+/
|
223
|
+
next unless models.include? r[2]
|
224
|
+
if o[:metagenome]
|
225
|
+
genes[ r[2] ] = [] if genes[ r[2] ].nil?
|
226
|
+
genes[ r[2] ] << r[0]
|
227
|
+
elsif genes[ r[2] ].nil?
|
228
|
+
genes[ r[2] ] = r[0]
|
229
|
+
else
|
230
|
+
trash << r[2]
|
231
|
+
end
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
# Report statistics
|
236
|
+
if o[:stats]
|
237
|
+
reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
|
238
|
+
modifiers = [:bacteria, :archaea, :genomeeq]
|
239
|
+
.map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
|
240
|
+
reph.puts "! Collection: #{o[:collection]} #{modifiers}"
|
241
|
+
if o[:metagenome]
|
242
|
+
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
243
|
+
gc = [0] * (models.size - genes.size) +
|
244
|
+
genes.values.map{ |g| g.length }.sort
|
245
|
+
reph.printf "! Mean number of copies per model: %.3f.\n",
|
246
|
+
gc.inject(:+).to_f / models.size
|
247
|
+
reph.printf "! Median number of copies per model: %.1f.\n",
|
248
|
+
# Median of the sorted counts: for an even size, average the two central
# elements (indices n/2-1 and n/2); for an odd size, take the middle one.
gc.size.even? ? gc[gc.size/2 - 1, 2].inject(:+).to_f / 2 : gc[gc.size/2]
|
249
|
+
if o[:genes] and genes.size != models.size
|
250
|
+
reph.printf "! Missing genes: %s\n",
|
251
|
+
([''] + models.keys.select{ |m| not genes.keys.include? m }.
|
252
|
+
map{|m| "#{m}: #{models[m]}."}).join("\n! ")
|
253
|
+
end
|
254
|
+
else
|
255
|
+
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
256
|
+
reph.printf "! Completeness: %.1f%%.\n",
|
257
|
+
100.0 * genes.size / models.size
|
258
|
+
reph.printf "! Contamination: %.1f%%.\n",
|
259
|
+
100.0 * trash.size / models.size
|
260
|
+
if o[:genes]
|
261
|
+
reph.printf "! Multiple copies: %s\n",
|
262
|
+
([''] + trash.uniq.
|
263
|
+
map{ |m| "#{trash.count(m)+1} #{m}: #{models[m]}." }).
|
264
|
+
join("\n! ") unless trash.empty?
|
265
|
+
reph.printf "! Missing genes: %s\n",
|
266
|
+
([''] + models.keys.select{ |m| not genes.keys.include? m }.
|
267
|
+
map{ |m| "#{m}: #{models[m]}." }).
|
268
|
+
join("\n! ") unless genes.size == models.size
|
269
|
+
end
|
270
|
+
end
|
271
|
+
reph.close unless o[:report].nil?
|
272
|
+
end
|
273
|
+
|
274
|
+
# Extract sequences
|
275
|
+
unless o[:out].nil? and o[:permodel].nil?
|
276
|
+
$stderr.puts 'Extracting sequences.' unless o[:q]
|
277
|
+
faah = File.open(o[:in], 'r')
|
278
|
+
outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
|
279
|
+
geneh = nil
|
280
|
+
in_gene = nil
|
281
|
+
unless o[:permodel].nil?
|
282
|
+
genes.keys.each do |m|
|
283
|
+
File.open("#{o[:permodel]}#{m}.faa", 'w').close
|
284
|
+
end
|
285
|
+
end
|
286
|
+
while ln = faah.gets
|
287
|
+
if ln =~ /^>(\S+)/
|
288
|
+
if o[:metagenome]
|
289
|
+
in_gene = genes.keys.
|
290
|
+
map{ |k| genes[k].include?($1) ? k : nil }.compact.first
|
291
|
+
in_gene = [in_gene, $1] unless in_gene.nil?
|
292
|
+
else
|
293
|
+
in_gene = genes.rassoc($1)
|
294
|
+
end
|
295
|
+
next if in_gene.nil?
|
296
|
+
geneh.close unless geneh.nil?
|
297
|
+
geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa", 'a+') unless
|
298
|
+
o[:permodel].nil?
|
299
|
+
outh.print(o[:rename].nil? ?
|
300
|
+
ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
|
301
|
+
geneh.print(o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless
|
302
|
+
geneh.nil?
|
303
|
+
else
|
304
|
+
next if in_gene.nil?
|
305
|
+
outh.print ln unless outh.nil?
|
306
|
+
geneh.print ln unless geneh.nil?
|
307
|
+
end
|
308
|
+
end
|
309
|
+
geneh.close unless geneh.nil?
|
310
|
+
outh.close unless outh.nil?
|
311
|
+
faah.close
|
312
|
+
end
|
313
|
+
|
314
|
+
unless o[:alignments].nil?
|
315
|
+
aln = {}
|
316
|
+
File.open("#{dir}/a.sto", 'r') do |fh|
|
317
|
+
cur_model = nil
|
318
|
+
mask = []
|
319
|
+
fh.each_line do |ln|
|
320
|
+
case ln.chomp
|
321
|
+
when /^# STOCKHOLM/
|
322
|
+
cur_model = nil
|
323
|
+
mask = []
|
324
|
+
when /^#=GS (\S+)\/([\d\-]+)\s+DE/
|
325
|
+
cur_model ||= ( genes.rassoc($1) || [] ).first
|
326
|
+
aln[ cur_model ] ||= [ "# #{cur_model} : #{$1} : #{$2}" ]
|
327
|
+
when /^#=GC RF\s+(\S+)/
|
328
|
+
aln[ cur_model ][ 1 ] ||= $1.upcase.tap do |i|
|
329
|
+
mask.each{ |d| i[d] = '' }
|
330
|
+
end
|
331
|
+
when /^[^#]\S*\s+(\S+)/
|
332
|
+
next if aln[ cur_model ][ 2 ]
|
333
|
+
aln[ cur_model ][ 2 ] = $1.upcase
|
334
|
+
mask = aln[ cur_model ][ 2 ].split('').each_with_index.
|
335
|
+
map{ |v, k| v == '.' ? k : nil }.compact.reverse
|
336
|
+
aln[ cur_model ][ 2 ].delete!('.') unless mask.empty?
|
337
|
+
end
|
338
|
+
end
|
339
|
+
end
|
340
|
+
File.open(o[:alignments], 'w') do |fh|
|
341
|
+
aln.each { |k, v| v.each{ |i| fh.puts i } }
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
$stderr.puts 'Done.' unless o[:q]
|
346
|
+
end # |dir|
|
347
|
+
rescue => err
|
348
|
+
$stderr.puts "Exception: #{err}\n\n"
|
349
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
350
|
+
err
|
351
|
+
end
|