miga-base 0.7.26.0 → 0.7.26.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +766 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +243 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +67 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +829 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +501 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +156 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +163 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +57 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +418 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm.rb +146 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +50 -0
- data/utils/enveomics/enveomics.R/README.md +80 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- metadata +277 -4
@@ -0,0 +1,186 @@
|
|
1
|
+
#!/usr/bin/env ruby

#
# @author: Luis M. Rodriguez-R
# @update: Sep-11-2015
# @license: artistic license 2.0
#
# Collects the Reciprocal Best Matches (RBM) found in a directory into an
# abc file, clusters them with MCL (mcxload + mcl), and saves the resulting
# Orthology Groups (OGs) as the string representation of an OGCollection.
#

$:.push File.expand_path(File.dirname(__FILE__) + "/lib")
require 'enveomics_rb/og'
require 'optparse'
require 'tmpdir'

o = {q:false, f:"(\\S+)-(\\S+)\\.rbm", mcl:"", inflation:1.5, blind:false,
  evalue:false, thr:2, identity:false, bestmatch:false}
ARGV << "-h" if ARGV.size==0
OptionParser.new do |opts|
  opts.banner = "
Identifies Orthology Groups (OGs) in Reciprocal Best Matches (RBM)
between all pairs in a collection of genomes, using the Markov Cluster
Algorithm.

Requires MCL (see http://www.micans.org/mcl).

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-o", "--out FILE",
    "Output file containing the detected OGs."){ |v| o[:out]=v }
  opts.on("-d", "--dir DIR",
    "Directory containing the RBM files.",
    "Becomes optional iff --abc is set to a non-empty file."){ |v| o[:dir]=v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-f", "--format STRING",
    "Format of the filenames for the RBM files (within -d), using regex " +
    "syntax.", "By default: '#{o[:f]}'."){ |v| o[:f]=v }
  opts.on("-I", "--inflation FLOAT",
    "Inflation parameter for MCL clustering. By default: #{o[:inflation]}."
    ){ |v| o[:inflation]=v.to_f }
  opts.on("-b", "--blind",
    "If set, computes clusters without taking bitscore into account."
    ){ |v| o[:blind]=v }
  opts.on("-e", "--evalue",
    "If set, uses the e-value to weight edges, instead of the default " +
    "Bit-Score."){ |v| o[:evalue]=v }
  opts.on("-i", "--identity",
    "If set, uses the identity to weight edges, instead of the default " +
    "Bit-Score."){ |v| o[:identity]=v }
  opts.on("-B", "--best-match",
    "If set, it assumes best-matches instead reciprocal best matches."
    ){ |v| o[:bestmatch]=v }
  opts.on("-m", "--mcl-bin DIR",
    "Path to the directory containing the mcl binaries.",
    "By default, assumed to be in the PATH."){ |v| o[:mcl]=v+"/" }
  opts.on("--abc FILE",
    "Use this abc file instead of a temporal file."){ |v| o[:abc] = v }
  opts.on("-t", "--threads INT",
    "Number of threads to use. By default: #{o[:thr]}."){ |v| o[:thr]=v.to_i }
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-o is mandatory" if o[:out].nil?
# Edge-weight precedence: --blind overrides --identity, which overrides
# --evalue.
o[:evalue] = false if o[:identity]
o[:evalue] = false if o[:blind]
o[:identity] = false if o[:blind]

# Runs the external program +bin+ with the arguments +args+ (an Array of
# Strings) without going through a shell, so paths containing quotes or
# spaces are passed verbatim. STDOUT is always discarded; STDERR is also
# discarded when +quiet_stderr+ is true. Raises a RuntimeError if the
# program cannot be launched or if +expected_output+ does not exist
# afterwards. The exit status itself is not checked.
def run_tool(bin, args, expected_output, quiet_stderr)
  redirects = { out: File::NULL }
  redirects[:err] = File::NULL if quiet_stderr
  launched = system(bin, *args, redirects)
  raise "Cannot execute '#{bin}' (see -m)." if launched.nil?
  unless File.exist?(expected_output)
    raise "'#{bin}' did not produce '#{expected_output}'."
  end
end

##### MAIN:
begin
  Dir.mktmpdir do |dir|
    o[:abc] = "#{dir}/rbms.abc" if o[:abc].nil?
    reuse_abc = File.size?(o[:abc])
    # File.directory? already implies that the path exists
    unless reuse_abc || (!o[:dir].nil? && File.directory?(o[:dir]))
      abort "-d must exist and be a directory"
    end

    # Traverse the whole directory
    if reuse_abc
      $stderr.puts "Reusing existing abc file '#{o[:abc]}'." unless o[:q]
    else
      file_i = 0
      ln_i = 0
      name_re = /#{o[:f]}/
      $stderr.puts "Reading RBM files within '#{o[:dir]}'." unless o[:q]
      # Write to a .tmp file first and rename once complete, so an
      # interrupted run never leaves a partial abc file to be reused.
      File.open(o[:abc] + ".tmp", "w") do |abc|
        Dir.entries(o[:dir]).each do |rbm_file|
          rbm_path = File.join(o[:dir], rbm_file)
          next unless File.file?(rbm_path)
          # Parse the filename to identify the genomes
          m = name_re.match(rbm_file)
          if m.nil? || m[2].nil?
            warn "Ignoring #{rbm_file}: doesn't match /#{o[:f]}/."
            next
          end
          file_i += 1
          # Read the RBMs list
          File.foreach(rbm_path) do |ln|
            next if ln.strip.empty?
            # Add the RBM to the abc file. The line terminator is removed
            # so that it never ends up inside a gene name or a weight.
            row = ln.chomp.split(/\t/)
            weight =
              if o[:blind] then "1"
              elsif o[:evalue] then row[10]
              elsif o[:identity] then row[2]
              else row[11]
              end
            abc.puts [m[1]+">"+row[0], m[2]+">"+row[1], weight].join("\t")
            ln_i += 1
          end
          $stderr.print " Scanned files: #{file_i}. " +
            "Found RBMs: #{ln_i}. \r" unless o[:q]
        end
      end
      File.rename(o[:abc] + ".tmp", o[:abc])
      $stderr.print "\n" unless o[:q]
    end # if reuse_abc ... else

    # Build .mci file (mcxload) and compute .mccl file (mcl)
    $stderr.puts "Markov-Clustering" unless o[:q]
    mcxload_args = []
    mcxload_args << "--stream-mirror" unless o[:bestmatch]
    mcxload_args += ["-abc", o[:abc], "-o", "#{dir}/rbms.mci",
      "--write-binary", "-write-tab", "#{dir}/genes.tab"]
    mcxload_args << "--stream-neg-log10" if o[:evalue]
    run_tool("#{o[:mcl]}mcxload", mcxload_args, "#{dir}/genes.tab", true)
    mcl_args = ["#{dir}/rbms.mci", "-V", "all", "-I", o[:inflation].to_s,
      "-o", "#{dir}/ogs.mcl", "-te", o[:thr].to_s]
    run_tool("#{o[:mcl]}mcl", mcl_args, "#{dir}/ogs.mcl", false)

    # Load .tab as Gene objects
    $stderr.puts "Loading gene table from '#{dir}/genes.tab'." unless o[:q]
    genes = []
    File.foreach("#{dir}/genes.tab") do |ln|
      # Each row is "<index>\t<genome>><gene>"
      r = ln.chomp.split(/\t|>/)
      genes[ r[0].to_i ] = Gene.new(r[1], r[2])
    end
    $stderr.puts " Got " + genes.size.to_s + " genes in " +
      Gene.genomes.size.to_s + " genomes." unless o[:q]

    # Load .mcl file as OGCollection
    $stderr.puts "Loading clusters from '#{dir}/ogs.mcl'." unless o[:q]
    collection = OGCollection.new
    File.open("#{dir}/ogs.mcl", "r") do |mcl|
      in_matrix = false
      my_genes = nil
      while ln = mcl.gets
        ln.chomp!
        if ln =~ /^\(mclmatrix$/
          in_matrix = true
          next
        end
        next if ln =~ /^begin$/
        next unless in_matrix
        break if ln =~ /^\)$/
        # A row starting with a number opens a new cluster; rows without
        # it continue the cluster opened before.
        if ln =~ /^\d+\s+/
          ln.sub!(/^\d+\s+/, "")
          my_genes = []
        end
        raise "Incomplete mcl matrix, offending line: #{mcl.lineno}: #{ln}" if
          my_genes.nil?
        # Split on runs of whitespace: an empty token would otherwise be
        # converted to index 0 and add a spurious gene to the cluster.
        my_genes += ln.split
        # "$" terminates the cluster
        if my_genes.last == "$"
          my_genes.pop
          og = OG.new
          my_genes.each{ |i| og << genes[ i.to_i ] }
          collection << og
          my_genes = nil
        end
      end
    end
    $stderr.puts " Got #{collection.ogs.size} clusters." unless o[:q]

    # Save the output matrix
    $stderr.puts "Saving matrix into '#{o[:out]}'." unless o[:q]
    File.open(o[:out], "w") { |f| f.puts collection.to_s }
    $stderr.puts "Done.\n" unless o[:q]
  end
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Report the failure to the calling process instead of exiting with 0
  exit 1
end
|
185
|
+
|
186
|
+
|
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env ruby

#
# @author: Luis M. Rodriguez-R
# @update: Apr-29-2015
# @license: artistic license 2.0
#
# Builds Orthology Groups (OGs) by adding every Reciprocal Best Match (RBM)
# found in a directory, and/or every pre-computed OG, to an OGCollection,
# optionally consolidating it, and saves its string representation.
#

$:.push File.expand_path(File.dirname(__FILE__) + '/lib')
require 'enveomics_rb/og'
require 'optparse'

# The literals true/false are used because the TRUE/FALSE constants are not
# defined in current Ruby versions.
o = {:q=>false, :f=>"(\\S+)-(\\S+)\\.rbm", :consolidate=>true, :pre=>[]}
ARGV << '-h' if ARGV.size==0
OptionParser.new do |opts|
  opts.banner = "
***IMPORTANT NOTE***
This script suffers from chaining effect and is very sensitive to spurious connections,
because it applies a greedy clustering algorithm. For most practical purposes, the use
of this script is discouraged and `ogs.mcl.rb` should be preferred. [ Apr-29-2015 ]

Identifies Orthology Groups (OGs) in Reciprocal Best Matches (RBM)
between all pairs in a collection of genomes.

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-o", "--out FILE", "Output file containing the detected OGs."){ |v| o[:out]=v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-d", "--dir DIR", "Directory containing the RBM files."){ |v| o[:dir]=v }
  opts.on("-p", "--pre-ogs FILE1,FILE2,...", Array, "Pre-computed OGs file(s), separated by commas."){ |v| o[:pre]=v }
  opts.on("-n", "--unchecked", "Do not check internal redundancy in OGs."){ o[:consolidate]=false }
  opts.on("-f","--format STRING", "Format of the filenames for the RBM files (within -d), using regex syntax. By default: '#{o[:f]}'."){ |v| o[:f]=v }
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-o is mandatory" if o[:out].nil?

##### MAIN:
begin
  # Initialize the collection of OGs.
  collection = OGCollection.new

  # Read the pre-computed OGs (if -p is passed).
  o[:pre].each do |pre|
    $stderr.puts "Reading pre-computed OGs in '#{pre}'." unless o[:q]
    File.open(pre, 'r') do |f|
      # The first line is the header; every following line is one OG.
      header_ln = f.gets
      raise "Empty pre-computed OGs file: '#{pre}'." if header_ln.nil?
      h = header_ln.chomp.split(/\t/)
      while ln = f.gets
        collection << OG.new(h, ln.chomp.split(/\t/))
      end
    end
    $stderr.puts " Loaded OGs: #{collection.ogs.length}." unless o[:q]
  end

  # Read the RBM files in the directory (if -d is passed).
  unless o[:dir].nil?
    # File.directory? already implies that the path exists
    abort "-d must exist and be a directory" unless File.directory?(o[:dir])
    # Traverse the whole directory.
    file_i = 0
    name_re = /#{o[:f]}/
    $stderr.puts "Reading RBM files within '#{o[:dir]}'." unless o[:q]
    Dir.entries(o[:dir]).each do |rbm_file|
      rbm_path = File.join(o[:dir], rbm_file)
      next unless File.file?(rbm_path)
      # Parse the filename to identify the genomes.
      m = name_re.match(rbm_file)
      if m.nil? || m[2].nil?
        warn "Cannot parse filename: #{rbm_file} (doesn't match /#{o[:f]}/)."
        next
      end
      file_i += 1
      # Read the RBMs list
      File.foreach(rbm_path) do |ln|
        # Add the RBM to the collection of OGs. Only the first two columns
        # are used. The line terminator is removed so that it never ends up
        # inside a gene name when the file has exactly two columns.
        row = ln.chomp.split(/\t/)
        collection.add_rbm( Gene.new(m[1],row[0]), Gene.new(m[2],row[1]) )
      end
      $stderr.print " Scanned files: #{file_i}. Found OGs: #{collection.ogs.length}. \r" unless o[:q]
    end
    $stderr.print "\n" unless o[:q]
  end

  # Evaluate internal consistency merging linked OGs (unless -n is passed).
  if o[:consolidate]
    $stderr.puts "Evaluating internal consistency." unless o[:q]
    collection.consolidate!
    $stderr.puts " Final OGs: #{collection.ogs.length}." unless o[:q]
  end

  # Save the output matrix
  $stderr.puts "Saving matrix into '#{o[:out]}'." unless o[:q]
  File.open(o[:out], "w") { |f| f.puts collection.to_s }
  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Report the failure to the calling process instead of exiting with 0
  exit 1
end
|
103
|
+
|
104
|
+
|
@@ -0,0 +1,131 @@
|
|
1
|
+
#!/usr/bin/env ruby

#
# @author: Luis M. Rodriguez-R
# @license: Artistic-2.0
#
# Loads a file of pre-computed Orthology Groups (OGs) and reports summary
# statistics on STDOUT and, optionally, as JSON, tabular, or transposed
# tabular files.
#

$:.push File.expand_path(File.dirname(__FILE__) + '/lib')
require 'enveomics_rb/og'
require 'optparse'
require 'json'

o = {q:false, a:false}
ARGV << '-h' if ARGV.size==0
OptionParser.new do |opts|
  opts.banner = "
Estimates some descriptive statistics on a set of Orthology Groups (OGs).

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-o", "--ogs FILE",
    "Input file containing the precomputed OGs."){ |v| o[:ogs]=v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-j", "--json FILE", "Output file in JSON format."){ |v| o[:json]=v }
  opts.on("-t", "--tab FILE","Output file in tabular format."){ |v| o[:tab]=v }
  opts.on("-T", "--transposed-tab FILE",
    "Output file in transposed tabular format."){ |v| o[:ttab]=v }
  opts.on("-a", "--auto", "Run completely quietly (no STDERR or STDOUT)") do
    o[:q] = true
    o[:a] = true
  end
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-o is mandatory" if o[:ogs].nil?

##### MAIN:
begin
  # Initialize the collection of OGs.
  collection = OGCollection.new

  # Read the pre-computed OGs
  $stderr.puts "Reading pre-computed OGs in '#{o[:ogs]}'." unless o[:q]
  File.open(o[:ogs], "r") do |f|
    # The first line is the header; every following line is one OG.
    header_ln = f.gets
    raise "Empty OGs file: '#{o[:ogs]}'." if header_ln.nil?
    h = header_ln.chomp.split(/\t/)
    while ln = f.gets
      collection << OG.new(h, ln.chomp.split(/\t/))
    end
  end
  $stderr.puts " Loaded OGs: #{collection.ogs.length}." unless o[:q]

  # Estimate descriptive stats
  stat_name = {
    genomes: "Number of genomes",
    pan: "Pangenome (OGs)",
    core: "Core genome (OGs)",
    core90pc: "OGs in 90% of the genomes",
    core80pc: "OGs in 80% of the genomes",
    unus: "Unus genome, core genome discarding paralogs (OGs)",
    avg: "Average number of OGs in a genome",
    avg_pan: "Average genome (OGs) / Pangenome (OGs)",
    core_avg: "Core genome (OGs) / Average genome (OGs)",
    core_pan: "Core genome (OGs) / Pangenome (OGs)",
    ogs_shannon: "Entropy of the OG frequencies (bits)"
  }
  # The insertion order below is the order used by every report.
  stats = {}
  stats[:genomes] = Gene.genomes.length
  stats[:pan] = collection.ogs.length
  stats[:core] = collection.ogs.count do |og|
    og.genomes.length == Gene.genomes.length
  end
  stats[:core90pc] = collection.ogs.count do |og|
    og.genomes.length >= 0.9*Gene.genomes.length
  end
  stats[:core80pc] = collection.ogs.count do |og|
    og.genomes.length >= 0.8*Gene.genomes.length
  end
  # Core OGs in which every entry of og.genes has size 1
  stats[:unus] = collection.ogs.count do |og|
    og.genomes.length == Gene.genomes.length and
      og.genes.all?{ |i| i.size==1 }
  end
  og_genomes = collection.ogs.map{ |og| og.genomes.length }.inject(0,:+)
  stats[:avg] = og_genomes.to_f/Gene.genomes.length
  stats[:avg_pan] = stats[:avg]/stats[:pan]
  stats[:core_avg] = stats[:core].to_f/stats[:avg]
  stats[:core_pan] = stats[:core].to_f/stats[:pan]
  # NOTE(review): Math.log is the natural logarithm, so this value is in
  # nats although the label above says "bits" -- confirm which is intended
  # before changing either, since the value is written to the reports.
  stats[:ogs_shannon] = -1 * collection.ogs.map do |og|
    pi = og.genomes.length.to_f/Gene.genomes.length
    pi * Math.log(pi)
  end.inject(0.0,:+)

  # Show result
  $stderr.puts "Generating reports." unless o[:q]
  stats.each_pair{ |k,v| puts " #{stat_name[k]}: #{v}" } unless o[:a]

  # Save results in JSON
  unless o[:json].nil?
    File.open(o[:json], "w") { |ohf| ohf.puts JSON.pretty_generate(stats) }
  end

  # Save results in tab
  unless o[:tab].nil?
    File.open(o[:tab], "w") do |ohf|
      stats.each_pair{ |k,v| ohf.puts "#{k}\t#{v}" }
    end
  end

  # Save results in T(tab)
  unless o[:ttab].nil?
    File.open(o[:ttab], "w") do |ohf|
      ohf.puts stats.keys.join("\t")
      ohf.puts stats.values.join("\t")
    end
  end

  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Report the failure to the calling process instead of exiting with 0
  exit 1
end
|
130
|
+
|
131
|
+
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author: Luis M. Rodriguez-R
|
5
|
+
# @update: Aug-25-2015
|
6
|
+
# @license: artistic license 2.0
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'optparse'
|
10
|
+
require 'tmpdir'
|
11
|
+
|
12
|
+
# Finds reciprocal best matches (RBMs) between two FastA files.
#
# Workflow:
#   1. Parse command-line options into the `o` hash.
#   2. Build one search database per input file inside a temporary directory,
#      using legacy BLAST, BLAST+ or Diamond.
#   3. Search set 2 against set 1 and remember the first hit of each query
#      that passes the length/identity/score/fraction filters.
#   4. Search set 1 against set 2 and print (to STDOUT, in tabular format)
#      every passing first hit whose subject pointed back at the same query
#      in step 3.
#
# Progress messages go to STDERR unless -q is given.

# Default options; command-line flags below override them.
o = {len:0, id:0, fract:0, score:0, q:false, bin:"", program:"blast+", thr:1,
  nucl:false}
ARGV << "-h" if ARGV.size==0
OptionParser.new do |opts|
  opts.banner = "
Finds the reciprocal best matches between two sets of sequences.

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-1", "--seq1 FILE",
    "Path to the FastA file containing the set 1."){ |v| o[:seq1] = v }
  opts.on("-2", "--seq2 FILE",
    "Path to the FastA file containing the set 2."){ |v| o[:seq2] = v }
  opts.separator ""
  opts.separator "Search Options"
  opts.on("-n", "--nucl",
    "Sequences are assumed to be nucleotides (proteins by default)."
    ){ |v| o[:nucl] = true }
  opts.on("-l", "--len INT",
    "Minimum alignment length (in residues). By default: #{o[:len]}."
    ){ |v| o[:len] = v.to_i }
  opts.on("-f", "--fract FLOAT",
    "Minimum alignment length (as a fraction of the query).",
    "If set, requires BLAST+ or Diamond (see -p). By default: #{o[:fract]}."
    # Must be parsed as a float: to_i would truncate any fraction below 1
    # down to 0 and silently disable the filter.
    ){ |v| o[:fract] = v.to_f }
  opts.on("-i", "--id NUM",
    "Minimum alignment identity (in %). By default: #{o[:id].to_s}."
    ){ |v| o[:id] = v.to_f }
  opts.on("-s", "--score NUM",
    "Minimum alignment score (in bits). By default: #{o[:score]}."
    ){ |v| o[:score] = v.to_f }
  opts.separator ""
  opts.separator "Software Options"
  opts.on("-b", "--bin DIR",
    "Path to the directory containing the binaries of the search program."
    ){ |v| o[:bin] = v }
  opts.on("-p", "--program STR",
    "Search program to be used. One of: blast+ (default), blast, diamond."
    ){ |v| o[:program] = v }
  opts.on("-t", "--threads INT",
    "Number of parallel threads to be used. By default: #{o[:thr]}."
    ){ |v| o[:thr] = v.to_i }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-1 is mandatory" if o[:seq1].nil?
abort "-2 is mandatory" if o[:seq2].nil?

# Normalize the program name once so that every check below agrees with the
# case statements (e.g. "-p Diamond" is treated exactly like "-p diamond").
program = o[:program].downcase
abort '-p diamond is incompatible with -n' if program=='diamond' && o[:nucl]
abort 'Argument -f/--fract requires -p blast+ or -p diamond' if
  o[:fract]>0 && program!='blast+' && program!='diamond'
o[:bin] = o[:bin]+"/" if o[:bin].size > 0

# Tabular columns requested from BLAST+ and Diamond: the 12 standard columns
# plus query and subject lengths (qlen ends up in row[12]).
outfmt = "6 qseqid sseqid pident length mismatch gapopen qstart qend " \
  "sstart send evalue bitscore qlen slen"

Dir.mktmpdir do |dir|
  $stderr.puts "Temporal directory: #{dir}." unless o[:q]

  # Create databases.
  $stderr.puts "Creating databases." unless o[:q]
  [:seq1, :seq2].each do |seq|
    case program
    when 'blast'
      `"#{o[:bin]}formatdb" -i "#{o[seq]}" -n "#{dir}/#{seq}" \
        -p #{(o[:nucl]?"F":"T")}`
    when 'blast+'
      `"#{o[:bin]}makeblastdb" -in "#{o[seq]}" -out "#{dir}/#{seq}" \
        -dbtype #{(o[:nucl]?"nucl":"prot")}`
    when 'diamond'
      # Read the user's FastA file (nothing is ever copied into the temporary
      # directory) and name the database so that it matches the
      # "#{dir}/seqN.dmnd" path opened by the search step below.
      `"#{o[:bin]}diamond" makedb --in "#{o[seq]}" \
        --db "#{dir}/#{seq}.dmnd" --threads "#{o[:thr]}"`
    else
      abort "Unsupported program: #{o[:program]}."
    end
  end # |seq|

  # Best-hits.
  rbh = {} # query ID in set 2 => ID of its best match in set 1
  n2 = 0   # number of reciprocal best matches found
  $stderr.puts " Running comparisons." unless o[:q]
  # Set 2 is searched first so that the second pass (set 1 as query) can
  # report each reciprocal pair with the set 1 sequence as the query.
  [2,1].each do |i|
    qry_seen = {}
    q = o[:"seq#{i}"]
    s = "#{dir}/seq#{i==1?2:1}"
    $stderr.puts " Query: #{q}." unless o[:q]
    case program
    when 'blast'
      `"#{o[:bin]}blastall" -p #{o[:nucl]?"blastn":"blastp"} -d "#{s}" \
        -i "#{q}" -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{dir}/#{i}.tab"`
    when 'blast+'
      `"#{o[:bin]}#{o[:nucl]?"blastn":"blastp"}" -db "#{s}" -query "#{q}" \
        -max_target_seqs 1 -num_threads #{o[:thr]} -out "#{dir}/#{i}.tab" \
        -outfmt "#{outfmt}"`
    when 'diamond'
      `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" \
        --outfmt "#{outfmt}" --db "#{s}.dmnd" \
        --query "#{q}" --out "#{dir}/#{i}.tab" --more-sensitive`
    else
      abort "Unsupported program: #{o[:program]}."
    end

    n = 0 # queries of this pass with at least one hit passing the filters
    # The block form guarantees the handle is closed even if parsing raises.
    File.open("#{dir}/#{i}.tab", "r") do |fh|
      fh.each_line do |ln|
        ln.chomp!
        row = ln.split(/\t/)
        # Legacy BLAST output has no query-length column; use 1 so that the
        # fraction filter (which must be 0 for that program) always passes.
        row[12] = "1" unless %w[blast+ diamond].include? program
        # Only the first passing hit of each query is considered.
        next unless qry_seen[ row[0] ].nil? && row[3].to_i >= o[:len] &&
          row[2].to_f >= o[:id] && row[11].to_f >= o[:score] &&
          row[3].to_f/row[12].to_i >= o[:fract]
        qry_seen[ row[0] ] = 1
        n += 1
        if i==2
          rbh[ row[0] ] = row[1]
        elsif !rbh[ row[1] ].nil? && rbh[ row[1] ]==row[0]
          # The subject's own best match is this query: reciprocal pair.
          puts ln
          n2 += 1
        end
      end # |ln|
    end # |fh|
    $stderr.puts " #{n} sequences with hit." unless o[:q]
  end # |i|
  $stderr.puts " #{n2} RBMs." unless o[:q]
end # |dir|
|
144
|
+
|
145
|
+
|
146
|
+
|