miga-base 0.7.26.0 → 1.0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +906 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +165 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +55 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +419 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +100 -0
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +80 -0
- data/utils/enveomics/enveomics.R/README.md +81 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +304 -3
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
# Default options: fraction of the depth distribution to keep, and the two
# output switches.
o = { range: 0.5, perseq: false, length: false }
# Show the help screen when the script is called without arguments.
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opt|
  opt.banner = "
Estimates the truncated average sequencing depth (TAD) from a BedGraph file.

IMPORTANT: This script doesn't consider zero-coverage positions if missing
from the file. If you produce your BedGraph file with bedtools genomecov and
want to consider zero-coverage position, be sure to use -bga (not -bg).

Usage: #{$0} [options]"
  opt.separator ''
  opt.on('-i', '--input PATH',
    'Input BedGraph file (mandatory).'){ |v| o[:i]=v }
  opt.on('-r', '--range FLOAT',
    'Central range to consider, between 0 and 1.',
    "By default: #{o[:range]} (inter-quartile range)."
    ){ |v| o[:range]=v.to_f }
  opt.on('-s', '--per-seq',
    'Calculate averages per reference sequence, not total.',
    'Assumes a sorted BedGraph file.'
    ){ |v| o[:perseq] = v }
  opt.on('-l', '--length',
    'Add sequence length to the output.'){ |v| o[:length] = v }
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!
abort '-i is mandatory.' if o[:i].nil?
# The trimmed fraction is derived as (1 - range) / 2: a range above 1 makes it
# negative and a range below 0 makes it larger than one half, so both are
# rejected here instead of silently producing a meaningless average.
abort '-r must be between 0 and 1.' unless (0.0..1.0).cover?(o[:range])
|
36
|
+
|
37
|
+
##
# Removes +r+ positions from the depth histogram +d+, visiting the bins in
# the order given by +idx+.
#
# d::   Array where +d[depth]+ is the number of positions with that depth, or
#       +nil+ for depths without positions. It is modified in place.
# idx:: Indexes of +d+ in the order in which they must be consumed.
# r::   Number of positions to remove.
#
# Bins that are completely consumed are set to +nil+. Returns +d+.
def pad(d, idx, r)
  remaining = r
  idx.each do |depth|
    count = d[depth]
    next if count.nil?

    # This bin can absorb everything that is left to remove: stop here
    if count >= remaining
      d[depth] = count - remaining
      break
    end

    # Otherwise the whole bin is dropped and the rest carries over
    remaining -= count
    d[depth] = nil
  end
  d
end
|
47
|
+
|
48
|
+
##
# Prints the truncated average depth of a depth histogram to STDOUT.
#
# sq:: Name of the reference sequence; only printed if +o[:perseq]+ is set.
# d::  Array where +d[depth]+ is the number of positions with that depth
#      (+nil+ for unseen depths). It is modified in place by the trimming.
# ln:: Total number of positions, used to size the trimmed tails and printed
#      if +o[:length]+ is set.
# o::  Options hash; reads +:range+, +:perseq+, and +:length+.
#
# The output is one tab-delimited line: [name,] average [, length]. The
# average is 0.0 whenever no positions are left after trimming.
def report(sq, d, ln, o)
  # Number of positions to trim from each tail of the depth distribution
  pad_fraction = (1.0 - o[:range]) / 2.0
  r = (pad_fraction * ln).round

  # Trim the lowest depths first, then the highest ones
  d = pad(d, d.each_index.to_a, r)
  d = pad(d, d.each_index.to_a.reverse, r)

  # Average depth, weighted by the number of positions left at each depth.
  # Trimming can leave bins holding exactly zero positions, so the total is
  # checked (not only the presence of bins) to avoid reporting NaN.
  y = [0.0]
  kept = d.compact.inject(0, :+)
  if kept > 0
    s = d.each_with_index.inject(0) { |sum, (v, i)| v.nil? ? sum : sum + i * v }
    y[0] = s.to_f / kept
  end

  # Report
  y.unshift(sq) if o[:perseq]
  y << ln if o[:length]
  puts y.join("\t")
end
|
69
|
+
|
70
|
+
# Read BedGraph
d = []        # d[depth] = number of positions with that depth
ln = 0        # positions accumulated since the last report
pre_sq = nil  # sequence name of the previous line
File.open(o[:i], "r") do |ifh|
  ifh.each_line do |i|
    # Skip comments, and blank lines that would otherwise crash the parsing
    next if i =~ /^#/ or i.strip.empty?
    r = i.chomp.split("\t")
    sq = r.shift
    # In per-sequence mode, flush the histogram when the sequence changes
    if o[:perseq] and !pre_sq.nil? and pre_sq != sq
      report(pre_sq, d, ln, o)
      d = []
      ln = 0
    end
    from, to, depth = r.map(&:to_i)
    l = to - from
    d[depth] ||= 0
    d[depth] += l
    ln += l
    pre_sq = sq
  end
end
# Last sequence (or the whole file when not in per-sequence mode)
report(pre_sq, d, ln, o)
|
93
|
+
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
# Default options: window size in base pairs.
o = { win: 1000 }
# Show the help screen when the script is called without arguments.
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opt|
  opt.banner = "
Estimates the sequencing depth per windows from a BedGraph file.

IMPORTANT: This script doesn't consider zero-coverage positions if missing
from the file. If you produce your BedGraph file with bedtools genomecov and
want to consider zero-coverage position, be sure to use -bga (not -bg).

Usage: #{$0} [options]"
  opt.separator ''
  opt.on('-i', '--input PATH',
    'Input BedGraph file (mandatory).'){ |v| o[:i]=v }
  opt.on('-w', '--win INT',
    'Window size, in base pairs.', "By default: #{o[:win]}."
    ){ |v| o[:win]=v.to_i }
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!
abort '-i is mandatory.' if o[:i].nil?
# The reading loop advances the window end by o[:win] on every report; with a
# size of zero or less the end never moves forward and the loop cannot finish.
abort '-w must be a positive integer.' unless o[:win] > 0
|
29
|
+
|
30
|
+
##
# Prints one window as a tab-delimited line to STDOUT: start, end, average
# depth, and the comma-separated names of the sequences in the window.
#
# d::    Array where +d[depth]+ is the number of positions with that depth
#        (+nil+ for unseen depths).
# a::    First coordinate of the window.
# b::    Last coordinate of the window.
# seqs:: Hash whose keys are the names of the sequences seen in the window.
def report(d, a, b, seqs)
  positions = 0
  weighted = 0
  seen = false
  d.each_with_index do |count, depth|
    next if count.nil?

    seen = true
    positions += count
    weighted += depth * count
  end

  # Windows without any depth bin are reported with an average of 0.0
  mean = seen ? weighted.to_f / positions : 0.0

  puts [a, b, mean, seqs.keys.join(",")].join("\t")
end
|
41
|
+
|
42
|
+
# Read BedGraph
d = []        # d[depth] = positions with that depth in the current window
ln = 0        # positions read so far (never reset between sequences)
a = 1         # first coordinate of the current window
seqs = {}     # names of the sequences contributing to the current window
b = o[:win]   # last coordinate of the current window
File.open(o[:i], "r") do |ifh|
  ifh.each_line do |i|
    # Skip comments, and blank lines that would otherwise crash the parsing
    next if i =~ /^#/ or i.strip.empty?
    r = i.chomp.split("\t")
    sq = r.shift
    seqs[sq] = 1
    from, to, depth = r.map(&:to_i)
    l = to - from
    d[depth] ||= 0
    d[depth] += l
    ln += l
    # Close every window that this interval completes. An interval may span
    # several windows, hence a loop instead of a single test.
    while ln >= b
      # Positions of this interval that fall beyond the current window
      overflow = ln - b
      d[depth] -= overflow
      report(d, a, b, seqs)
      # The next window starts with whatever is left of the interval
      seqs = {}
      seqs[sq] = 1 if ln > b
      d = []
      d[depth] = overflow
      a = b + 1
      b = a + o[:win] - 1
    end
  end
end
# NOTE: positions after the last complete window are not reported.
|
71
|
+
|
@@ -0,0 +1,102 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @authors Konstantinos Konstantinidis (initial version)
|
4
|
+
# modified to work with the BLASTp 2.2.25+ m0 output by
|
5
|
+
# Despina Tsementzi & Luis M. Rodriguez-R
|
6
|
+
# @updated Dec-21-2015
|
7
|
+
#
|
8
|
+
|
9
|
+
|
10
|
+
use strict;

my ($cigar_chr, $blast) = @ARGV;

($cigar_chr and $blast) or die "
.Description:
Counts the different AA substitutions in the best hit blast alignments, from
a BLASTP pairwise format output (-outfmt 0 in BLAST+, -m 0 in legacy BLAST).

.Usage: $0 cigar_char blast.m0.txt > aa-subs.list

cigar_char Use '+' for similar substitutions, use '_' for non similar
substitutions
blast.m0.txt Blast in 'text' format (-outfmt/-m 0).
aa-subs.list A tab-delimited raw file with one substitution per row and
columns:
(1) Name-of-query_Name-of-subject
(2) AA-in-subject
(3) AA-in-query
(4) Total-Align-Length

";

# Records are split on the literal string "Lambda " (presumably the start of
# the statistics footer that closes each query in the text report), so that
# every read returns the report of one query.
local $/ = "Lambda ";

# For each blast result (i.e., for each query)
open my $blast_fh, "<", $blast or die "Cannot read file: $blast: $!\n";
while (my $data = <$blast_fh>) {
    $data =~ s/\r//g;

    # Everything before the first '>' describes the query; each following
    # piece is the report for one subject
    my ($data_q, @array_matches) = split(/>/, $data);
    my ($name_query)   = ($data_q =~ /Query\= (\S+?)(?:_GENE|\s)/);
    my ($length_query) = ($data_q =~ /\(([\d,]+) letters/);
    ($length_query) = ($data_q =~ /Length=([\d,]+)/) unless $length_query;

    # Nothing can pass the filters below without a query length or a match
    next unless defined $length_query and @array_matches;
    $length_query =~ tr/,//d;

    # Only the first reported subject (the best match) is considered
    my $data_f = $array_matches[0];

    # Capture statistics
    my ($length_match)   = ($data_f =~ /Identities = \d+\/(\d+)/);
    my ($identity_match) = ($data_f =~ /Identities = \d+\/\d+ \((\d+)%/);
    my ($target_name)    = ($data_f =~ /^\s?(\S+)/);
    next unless defined $length_match and defined $identity_match;

    # Minimum requirements: query longer than 30, alignment covering more
    # than 70% of the query, and identity above 60%
    next unless $length_query > 30
        and $length_match / $length_query > 0.7
        and $identity_match > 60;

    # Spaces become underscores, so a non-similar column of the middle line
    # reads as '_' and can be selected with cigar_char
    $data_f =~ tr/ /_/;

    my @query;
    my @similarity;
    my $length      = 0;    # residues in the current alignment line
    my $prefix_size = 0;    # width of the "Query  N  " prefix
    my $blanks      = 0;    # consecutive empty lines seen

    # For each line in the alignment
    for my $line (split("\n", $data_f)) {
        if ($line =~ /(Query[:_]_+\d+_+)([^_]+)/) {
            # Query lines
            my ($prefix, $residues) = ($1, $2);
            $prefix_size = length($prefix);
            $length      = length($residues);
            @query       = split(//, $residues);
        }
        elsif ($line =~ /^_{11}/) {
            # Cigar lines: same columns as the residues of the query line
            @similarity = split(//, substr($line, $prefix_size, $length));
        }
        elsif ($line =~ /Sbjct[:_]_+\d+_+([^_]+)/) {
            # Subject lines
            my @subject = split(//, $1);

            # For each alignment column (indexes 0 .. length-1)
            for my $i (0 .. $length - 1) {
                next unless defined $similarity[$i]
                    and $similarity[$i] eq $cigar_chr;
                print "$name_query\_$target_name\t$subject[$i]\t$query[$i]\t$length_match\n";
            }
            @query      = ();
            @similarity = ();
        }

        # Remove secondary alignments: stop at the third empty line in a row
        if ($line =~ /^$/) {
            $blanks++;
            last if $blanks >= 3;
        }
        else {
            $blanks = 0;
        }
    }
}
close $blast_fh;
|
102
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author: Luis M. Rodriguez-R
|
5
|
+
# @license: artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'optparse'
|
9
|
+
|
10
|
+
# Default options: use the query column, and report progress on STDERR.
o = { sbj: false, q: false }
# Show the help screen when the script is called without arguments.
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Appends an extra column to a BLAST with the length of the query or the subject
sequence. You can pipe two instances to add both:
cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast

Usage: #{$0} [options] < input.blast > output.blast"
  opts.separator ''
  opts.separator 'Mandatory'
  opts.on('-f', '--fasta FILE', 'Path to the FastA file') do |path|
    o[:fasta] = path
  end
  opts.separator ''
  opts.separator 'Options'
  opts.on(
    '-s', '--subject',
    'Use the subject column of the BLAST, by default the query column is used'
  ) do
    o[:sbj] = true
  end
  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') do
    o[:q] = true
  end
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
  opts.separator ''
end.parse!
abort '-f is mandatory' unless o[:fasta]
|
35
|
+
|
36
|
+
# Build the table of sequence lengths: len[sequence ID] = number of letters
len = {}
id = ''
$stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:q]
File.open(o[:fasta], 'r') do |fasta|
  fasta.each_line do |line|
    if (defline = /^>(\S+)/.match(line))
      # New entry: the ID is the first word after '>'
      id = defline[1]
      len[id] = 0
    else
      # Sequence line: only letters are counted. A sequence line that comes
      # before any defline means that the input is not FastA.
      abort 'Error: Unsupported format, expecting FastA' if len[id].nil?
      len[id] += line.gsub(/[^A-Za-z]/, '').size
    end
  end
end

# Append the length to each BLAST row read from the input
column = o[:sbj] ? 1 : 0
unless o[:q]
  $stderr.puts 'Appending %s length column' % (o[:sbj] ? 'subject' : 'query')
end
ARGF.each_line do |line|
  line.chomp!
  id = line.split(/\t/)[column]
  abort "Impossible to find sequence of #{id}" if len[id].nil?
  puts "#{line}\t#{len[id]}"
end
|
63
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author Luis M. Rodriguez-R
|
5
|
+
# @update Mar-23-2016
|
6
|
+
# @license artistic license 2.0
|
7
|
+
#
|
8
|
+
|
9
|
+
# Both arguments are mandatory: print the help message otherwise
if [[ ! $2 ]] ; then
  echo "
.DESCRIPTION
Calculates the percentage of a partial BLAST result. The
value produced slightly subestimates the actual advance,
due to un-flushed output and trailing queries that could
be processed but generate no results.

.USAGE
$0 blast.txt qry.fasta

blast.txt Incomplete Tabular BLAST output.
qry.fasta FastA file with query sequences.
";
  exit 1;
fi

# Both files must be readable
for FILE in "$1" "$2" ; do
  if [[ ! -r $FILE ]] ; then
    echo "Cannot open file: $FILE";
    exit 1;
  fi
done

# Query ID (first column) of the second-to-last line of the BLAST output
LAST_Q=$(tail -n 2 "$1" | head -n 1 | awk '{print $1}')

# Line number of the first defline whose ID is exactly that query. The ID is
# compared as a literal string, so characters such as '.' or '|' in it are not
# interpreted as a pattern, and only one line number is ever returned.
LAST_Q_NO=$(LAST_Q="$LAST_Q" awk '
  BEGIN { query = ENVIRON["LAST_Q"] "" }
  /^>/  { if (substr($1, 2) == query) { print NR; exit } }
' "$2")
if [[ ! $LAST_Q_NO ]] ; then
  echo "Cannot find sequence: $LAST_Q";
  echo "Make sure you are providing the right query file.";
  exit 1;
fi

# Total number of lines in the query file. wc counts newline characters, so
# a final line without one is not counted: never let the total fall below
# the line number that was just found.
TOTAL_Q_NO=$(wc -l < "$2" | tr -d '[:space:]')
if (( TOTAL_Q_NO < LAST_Q_NO )) ; then
  TOTAL_Q_NO=$LAST_Q_NO
fi
PERC=$(( 100 * LAST_Q_NO / TOTAL_Q_NO ))

echo "$PERC%: $LAST_Q_NO / $TOTAL_Q_NO"
exit 0;
|
48
|
+
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
5
|
+
# @license: artistic license 2.0
|
6
|
+
# @last_update: Mar-23-2015
|
7
|
+
#
|
8
|
+
|
9
|
+
use strict;
|
10
|
+
use warnings;
|
11
|
+
|
12
|
+
# Show the usage text and stop when the first argument looks like a help flag.
if (@ARGV and $ARGV[0] =~ /^\-?\-(h(elp)?|\?)/i) {
    die "
Usage:
sort blast.txt ... | $0 > blast.bh.txt
$0 blast_sorted.txt ... > blast.bh.txt
$0 -h|--help|-?

blast.txt ... One or more files in Tabular BLAST format.
blast_sorted.txt ... One or more files in Tabular BLAST format pre-sorted.
blast.bh.txt Output file in BLAST format containing best-hits only.
-h|--help|-? Any of these flags trigger this help message and exits.

NOTE: This script assumes that the BLAST is sorted. Because it can read
from the STDIN, calling this script without arguments cause it to still until
killed or until an EOF (^D) is presented.

";
}

# First column (query) of the group of rows currently being scanned.
my $last_qry = '';
# Columns of the best row seen so far for $last_qry.
my @best_res;
|
31
|
+
|
32
|
+
# Pick the better of two tabular BLAST rows.
#
# Arguments:
#   $r1, $r2  Array references, each holding the columns of one row. The
#             value at index 11 is compared numerically (the bit score in
#             standard 12-column tabular BLAST). $r2 may be undef or empty.
# Returns:
#   The columns of the winning row, always as a list. $r1 wins only when its
#   score is strictly greater, so ties keep $r2; $r1 is also returned when
#   there is no usable $r2.
sub best_result {
    my ($r1, $r2) = @_;
    # A reference to an empty array is still true, so test its contents too.
    return @$r1 unless $r2 and @$r2;
    return $r1->[11] > $r2->[11] ? @$r1 : @$r2;
}
|
37
|
+
|
38
|
+
# Scan the (pre-sorted) rows; consecutive rows sharing the first column form
# one group, and only the best row of each group is printed.
while (<>) {
    chomp;
    my @res = split /\t/;
    die "\nCannot parse BLAST line $.: $_\n" unless exists $res[1];
    if ($res[0] ne $last_qry) {
        # A new group starts: flush the previous winner, then start over.
        print join("\t", @best_res), "\n" if $#best_res > 0;
        @best_res = @res;
        $last_qry = $res[0];
        next;
    }
    @best_res = best_result(\@res, \@best_res);
}
# Flush the winner of the final group.
print join("\t", @best_res), "\n" if @best_res;
|
53
|
+
|
54
|
+
|
55
|
+
|
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
|
3
|
+
# @author: Luis M. Rodriguez-R
|
4
|
+
# @license: Artistic-2.0
|
5
|
+
|
6
|
+
use warnings;
|
7
|
+
use strict;
|
8
|
+
use List::Util qw/min max/;
|
9
|
+
use Getopt::Std;
|
10
|
+
|
11
|
+
# Abort the program with the usage text. The text ends in a newline, so die
# does not append the file name and line number.
sub HELP_MESSAGE {
    my $usage = "

Description:
Generates a list of hits from a BLAST result concatenating the subject
sequences. This can be used, e.g., to analyze BLAST results against
draft genomes.

Usage:
$0 [options] seq.fa map.bls

seq.fa Subject sequences (ref) in FastA format.
map.bls Mapping of the reads to the reference in BLAST Tabular
format.

Options:
-i <float> Minimum identity to report a result. By default: 70.
-l <int> Minimum alignment length to report a result. By default: 60.
-s The FastA provided is to be treated as a subset of the subject.
By default, it expects all the BLAST subjects to be present in
the FastA.
-q Run quietly.
-h Display this message and exit.

This script creates two files using <map.bls> as prefix with extensions
.rec (for the recruitment plot) and .lim (for the limits of the different
sequences in <seq.fa>).

";
    die $usage;
}
|
39
|
+
|
40
|
+
# Parsed command-line switches: -i <float>, -l <int>, -s, -q, -h.
my %o;
getopts('i:l:sqh', \%o);

# Positional arguments: subject FastA and tabular BLAST file. Test for
# definedness rather than truth so that a file literally named "0" is accepted.
my ($fa, $map) = @ARGV;
HELP_MESSAGE() unless defined $fa and defined $map;
HELP_MESSAGE() if $o{h};

# Apply the defaults only when the option was not given, so that an explicit
# "-i 0" or "-l 0" (no filtering) is honoured instead of being overwritten.
$o{i} = 70 unless defined $o{i};
$o{l} = 60 unless defined $o{l};

# %seq: per-sequence coordinate in the concatenated subject, keyed by ID.
# @seq: sequence IDs in the order they appear in the FastA.
my %seq = ();
my @seq = ();
|
51
|
+
|
52
|
+
# Read the subject FastA. After this block, $seq{ID} holds the last coordinate
# of each sequence in the concatenation of all sequences (numbering starts at
# 1), and @seq lists the IDs in file order.
SEQ: {
    print STDERR "== Reading reference sequences\n" unless $o{q};
    open my $fa_fh, "<", $fa or die "Cannot read the file: $fa: $!\n";
    my $current = '';
    while (my $line = <$fa_fh>) {
        chomp $line;
        if ($line =~ m/^>(\S+)/) {
            my $id = $1;
            # A new sequence begins right after the end of the previous one.
            my $first = 1;
            $first = $seq{$current} + 1 if exists $seq{$current};
            $seq{$id} = $first;
            push @seq, $id;
            $current = $id;
            next;
        }
        # Only letters count towards the sequence length.
        $seq{$current} += ($line =~ tr/A-Za-z//);
    }
    close $fa_fh;
    print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
}
|
71
|
+
|
72
|
+
# Write "<map>.lim": one line per sequence with its ID and its first and last
# coordinates in the concatenated subject. While doing so, replace the value
# stored in %seq (so far, the last coordinate) with the first coordinate.
open my $lim_fh, ">", "$map.lim" or die "Cannot create the file: $map.lim: $!\n";
my $prev_last = 0;
for my $id (@seq) {
    my $first = $prev_last + 1;
    my $last  = $seq{$id};
    print {$lim_fh} "$id\t$first\t$last\n";
    $seq{$id}  = $first;
    $prev_last = $last;
}
close $lim_fh;
|
79
|
+
|
80
|
+
# Translate the tabular BLAST hits into "<map>.rec". Each reported hit yields
# a line with: start and end (columns 9 and 10, ordered and shifted by the
# value stored in %seq for the subject), column 3, column 12 and column 1.
# Rows below the -l (column 4) or -i (column 3) thresholds are skipped.
MAP: {
    print STDERR "== Reading mapping\n" unless $o{q};
    open my $bls_fh, "<", $map or die "Cannot read the file: $map: $!\n";
    open my $rec_fh, ">", "$map.rec" or die "Cannot create the file: $map.rec: $!\n";
    RESULT: while (my $row = <$bls_fh>) {
        chomp $row;
        my @ln = split /\t/, $row;
        # Require column 12 to be present; a plain truth test would also
        # reject a legitimate value of 0.
        (defined $ln[11] and length $ln[11])
            or die "Cannot parse line $map:$.: $row\n";
        next RESULT unless $ln[3] >= $o{l};
        next RESULT unless $ln[2] >= $o{i};
        unless (exists $seq{$ln[1]}) {
            # Unknown subjects are only tolerated in subset mode (-s).
            die "Cannot find the subject sequence: $ln[1]\n" unless $o{s};
            next RESULT;
        }
        my $offset = $seq{$ln[1]};
        my $start  = $offset + min($ln[8], $ln[9]);
        my $end    = $offset + max($ln[8], $ln[9]);

        # Optional extra field computed from additional columns
        # (presumably sequence lengths appended to the BLAST table -- TODO
        # confirm against the producer of those columns).
        my $extra = "";
        if (exists $ln[13]) {
            $extra = "\t".($ln[2]*$ln[3]/min($ln[12], $ln[13]))."\t";
        }
        elsif (exists $ln[12]) {
            $extra = "\t".($ln[2]*$ln[3]/$ln[12])."\t";
        }
        print {$rec_fh} "$start\t$end\t$ln[2]\t$ln[11]\t$ln[0]", $extra, "\n";
    }
    close $bls_fh;
    # Buffered write errors only surface when the handle is closed.
    close $rec_fh or die "Cannot write the file: $map.rec: $!\n";
    print STDERR " done.\n" unless $o{q};
}
|
104
|
+
|