miga-base 0.7.26.0 → 0.7.26.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +766 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +243 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +67 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +829 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +501 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +156 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +163 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +57 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +418 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm.rb +146 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +50 -0
- data/utils/enveomics/enveomics.R/README.md +80 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- metadata +277 -4
@@ -0,0 +1,147 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$:.push File.expand_path("../lib", __FILE__)
|
7
|
+
require "enveomics_rb/enveomics"
|
8
|
+
require "enveomics_rb/vcf"
|
9
|
+
|
10
|
+
# Command-line options collected into +o+ (symbol keys, set only when the
# corresponding flag is passed).
o = {}
OptionParser.new do |opts|
  # Leading blanks are stripped from every banner line.
  opts.banner = [
    "",
    "Estimates the Ka/Ks ratio from the SNPs in a VCF file. Ka and Ks are corrected",
    "using pseudo-counts, but no corrections for multiple substitutions are",
    "applied.",
    "",
    "Usage: #{$0} [options]"
  ].join("\n").gsub(/^ +/, "")

  opts.separator ""
  opts.separator "Mandatory"
  opts.on(
    "-i", "--input FILE",
    "Input file in Variant Call Format (VCF)."
  ) do |path|
    o[:file] = path
  end
  opts.on(
    "-s", "--seqs FILE",
    "Input gene sequences (nucleotides) in FastA format."
  ) do |path|
    o[:seqs] = path
  end

  opts.separator ""
  opts.separator "Parameters"
  opts.on(
    "-f", "--syn-frx FLOAT",
    "Fraction of synonymous substitutions. If passed, the number of sites are",
    "estimated (not counted per gene), speeding up the computation ~10X."
  ) do |frx|
    o[:syn_frx] = frx.to_f
  end
  opts.on(
    "-b", "--syn-bacterial-code",
    "Sets --syn-frx to 0.760417, approximately the proportion of synonymous",
    "substitutions in the bacterial code."
  ) do
    o[:syn_frx] = 0.760417
  end

  opts.separator ""
  opts.separator "Miscellaneous"
  opts.on(
    "-c", "--codon-file FILE",
    "Output file including the codons of substitution variants."
  ) do |path|
    o[:codon_file] = path
  end
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
|
44
|
+
|
45
|
+
# Both the VCF and the gene sequences are required; stop at the first one
# that is missing.
{
  file: "--input is mandatory",
  seqs: "--seqs is mandatory"
}.each do |key, message|
  abort message if o[key].nil?
end
|
47
|
+
|
48
|
+
# Codon table (11. The Bacterial, Archaeal and Plant Plastid Code)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi#SG11
#
# The five strings are parallel: column +i+ of Base1/Base2/Base3 spells a
# codon, and column +i+ of AAs/Starts gives its amino acid and start flag.
code_table = {
  AAs:    "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
  Starts: "---M------**--*----M------------MMMM---------------M------------",
  Base1:  "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
  Base2:  "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
  Base3:  "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"
}
# Global lookups: codon => amino acid, and codon => start-codon flag.
$codon_aa = {}
$codon_st = {}
first, second, third =
  code_table.values_at(:Base1, :Base2, :Base3).map(&:chars)
first.zip(second, third).each_with_index do |triplet, idx|
  codon = triplet.join
  $codon_aa[codon] = code_table[:AAs][idx]
  $codon_st[codon] = code_table[:Starts][idx]
end
|
64
|
+
|
65
|
+
##
# Is the change from +cod+ to +cod_alt+ synonymous? +start_codon+ indicates
# whether the codon is the first one in the gene, in which case the codons
# are compared by their start-codon flag ($codon_st) instead of by their
# amino acid ($codon_aa).
def syn?(cod, cod_alt, start_codon=false)
  lookup = start_codon ? $codon_st : $codon_aa
  lookup[cod] == lookup[cod_alt]
end
|
73
|
+
|
74
|
+
##
# Fraction of the substitutions at 1-based position +pos+ of +seq+, by each of
# the nucleotides in +alts+, that are synonymous according to +syn?+.
#
# Positions 1 to 3 are evaluated as the start codon. When the global
# +$codon_fh+ is set, one tab-delimited line is written to it with the
# synonymous count, the reference codon and the comma-joined alternative
# codons.
def syn_fraction(seq, pos, alts)
  offset = (pos - 1) % 3          # position of the change within its codon
  codon_start = pos - 1 - offset  # 0-based index of the codon's first base
  codon = seq[codon_start, 3]
  variants = alts.map do |nucleotide|
    codon.to_s.dup.tap { |mutated| mutated[offset] = nucleotide }
  end
  syn_count = variants.count { |variant| syn?(codon, variant, pos <= 3) }
  $codon_fh&.puts "#{syn_count}\t#{codon}\t#{variants.join(',')}"
  syn_count.to_f / alts.size
end
|
92
|
+
|
93
|
+
# Read sequences: load the whole FastA file into +seqs+, keyed by the first
# whitespace-delimited word of each defline. Any non-letter character in the
# sequence lines is discarded.
seqs = {}
File.open(o[:seqs], "r") do |fasta|
  current = ""
  fasta.each_line do |line|
    header = line[/^>(\S+)/, 1]
    if header
      current = header
      seqs[current] = +""
    else
      seqs[current] << line.chomp.gsub(/[^A-Za-z]/, "")
    end
  end
end
|
106
|
+
|
107
|
+
# Process variants: accumulate, per sequence, the non-synonymous (index 0)
# and synonymous (index 1) fractions of every non-indel variant.
$codon_fh = nil
if o[:codon_file]
  # Optional per-variant codon log, read by syn_fraction through $codon_fh.
  $codon_fh = File.open(o[:codon_file], "w")
  $codon_fh.puts "#" + %w[Syn Ref Alt].join("\t")
end
vcf = VCF.new(o[:file])
gen = {}
vcf.each_variant do |variant|
  next if variant.indel?
  # The reference allele must agree with the provided gene sequence.
  if seqs[variant.chrom][variant.pos - 1] != variant.ref
    raise "REF doesn't match VCF:\n#{variant}"
  end
  gen[variant.chrom] ||= [0.0, 0.0]
  syn = syn_fraction(seqs[variant.chrom], variant.pos, variant.alt.split(","))
  gen[variant.chrom][0] += 1.0 - syn
  gen[variant.chrom][1] += syn
end
$codon_fh.close unless $codon_fh.nil?
$codon_fh = nil
|
126
|
+
|
127
|
+
# Ka/Ks: one tab-delimited row per sequence with at least one variant.
# Each tally is [NonSynSubs, SynSubs, NonSynSites, SynSites]; the sites are
# either counted position by position or estimated from --syn-frx.
puts "#" +
  %w[SeqID KaKs Ka Ks NonSynSubs SynSubs NonSynSites SynSites].join("\t")
gen.each do |seq_id, tally|
  gene = seqs[seq_id]
  if o[:syn_frx].nil?
    # Exhaustive count: try the three possible substitutions at every position.
    tally[2] = 0.0
    tally[3] = 0.0
    1.upto(gene.size) do |pos|
      alts = %w(A C T G) - [gene[pos - 1]]
      syn = syn_fraction(gene, pos, alts)
      tally[2] += 1.0 - syn
      tally[3] += syn
    end
  else
    # NOTE(review): the NonSynSites slot receives length * syn_frx here, and
    # 0.760417 looks like the non-synonymous share of the code -- confirm
    # whether the option name/help or this assignment reflects the intent.
    tally[2] = gene.size.to_f * o[:syn_frx]
    tally[3] = gene.size.to_f * (1.0 - o[:syn_frx])
  end
  # Pseudo-counts: +1 substitution and +2 sites on each side.
  ka = (tally[0] + 1) / (tally[2] + 2)
  ks = (tally[1] + 1) / (tally[3] + 2)
  puts ([seq_id, ka / ks, ka, ks] + tally).join("\t")
end
|
147
|
+
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$:.push File.expand_path(File.dirname(__FILE__) + "/lib")
|
7
|
+
require "enveomics_rb/enveomics"
|
8
|
+
require "enveomics_rb/vcf"
|
9
|
+
|
10
|
+
# Default filtering thresholds; every one of them can be overridden from the
# command line.
o = {min_dp:4, max_dp:Float::INFINITY, min_ref_dp:2, min_alt_dp:2, min_qual:0.0,
  indels:false, min_ic:0.0}
# The parser is kept in a local variable so it can be reused after the
# initial parse of ARGV.
opt_parser = OptionParser.new do |opt|
  # Leading blanks are stripped from every banner line.
  opt.banner = "
    Counts the number of Single-Nucleotide Polymorphisms (SNPs) in a VCF file.

    Usage: #{$0} [options]".gsub(/^ +/,"")
  opt.separator ""
  opt.separator "Mandatory"
  opt.on("-i", "--input FILE",
    "Input file in Variant Call Format (VCF)."){ |v| o[:file] = v}
  opt.separator ""
  opt.separator "Parameters"
  opt.on("-o", "--out FILE",
    "Output (filtered) file in Variant Call Format (VCF)."){ |v| o[:out] = v}
  opt.on("-m", "--min-dp INT",
    "Minimum number of reads covering the position. By default: #{o[:min_dp]}."
    ){ |v| o[:min_dp] = v.to_i }
  opt.on("-M", "--max-dp INT",
    "Maximum number of reads covering the position. By default: #{o[:max_dp]}."
    ){ |v| o[:max_dp] = (v=="Infinity" ? Float::INFINITY : v.to_i) }
  opt.on("-r", "--min-ref-dp INT",
    "Minimum number of reads supporting allele REF. " +
    "By default: #{o[:min_ref_dp]}."
    ){ |v| o[:min_ref_dp] = v.to_i }
  opt.on("-a", "--min-alt-dp INT",
    "Minimum number of reads supporting allele ALT. " +
    "By default: #{o[:min_alt_dp]}."
    ){ |v| o[:min_alt_dp] = v.to_i }
  # Fixed: this option used to overwrite :max_dp, so the quality threshold was
  # never applied and the maximum-depth filter was silently replaced.
  opt.on("-q", "--min-quality FLOAT",
    "Minimum quality of the position mapping. By default: #{o[:min_qual]}."
    ){ |v| o[:min_qual] = v.to_f }
  opt.on("-s", "--min-shannon FLOAT",
    "Minimum information content (in bits, from 0 to 1). " +
    "By default: #{o[:min_ic]}"){ |v| o[:min_ic] = v.to_f }
  opt.on("--[no-]indels",
    "Process (or ignore) indels. By default: ignore."
    ){ |v| o[:indels] = v }
  opt.on("-h", "--help", "Display this screen.") do
    puts opt
    exit
  end
  opt.separator ""
end
opt_parser.parse!
|
54
|
+
|
55
|
+
abort "--input is mandatory" if o[:file].nil?

# Stream the VCF once: variants failing any threshold are skipped, the rest
# are counted, summed and (when --out is given) copied to the filtered file.
vcf = VCF.new(o[:file])
snp_count = 0
depth_sum = 0
ref_depth_sum = 0
alt_depth_sum = 0
shannon_sum = 0
ofh = nil
if o[:out]
  ofh = File.open(o[:out], "w")
  # Carry the header lines over to the filtered file.
  vcf.each_header { |header_line| ofh.print header_line }
end
vcf.each_variant do |variant|
  next if variant.indel? && !o[:indels]
  rejected =
    variant.dp < o[:min_dp] ||
    variant.dp > o[:max_dp] ||
    variant.ref_dp < o[:min_ref_dp] ||
    variant.alt_dp < o[:min_alt_dp] ||
    variant.qual < o[:min_qual] ||
    variant.shannon < o[:min_ic]
  next if rejected
  snp_count += 1
  depth_sum += variant.dp
  ref_depth_sum += variant.ref_dp
  alt_depth_sum += variant.alt_dp
  shannon_sum += variant.shannon
  ofh.print variant.to_s if ofh
end
ofh.close if ofh

# Summary; the averages are taken over the variants that passed the filters.
puts "SNPs: #{snp_count}"
puts "Information content: #{shannon_sum}"
puts "Average SNP depth: #{depth_sum.to_f/snp_count}"
puts "Average REF allele depth: #{ref_depth_sum.to_f/snp_count}"
puts "Average ALT allele depth: #{alt_depth_sum.to_f/snp_count}"
|
88
|
+
|
@@ -0,0 +1,418 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
require 'optparse'
|
7
|
+
require 'tmpdir'
|
8
|
+
require 'zlib'
|
9
|
+
# Optional dependencies: each flag records whether the corresponding gem
# could be loaded.
has_rest_client =
  begin
    require 'rubygems'
    require 'restclient'
    true
  rescue LoadError
    false
  end
has_sqlite3 =
  begin
    require 'sqlite3'
    true
  rescue LoadError
    false
  end
|
22
|
+
|
23
|
+
# Default settings; any of them can be overridden from the command line.
o = {
  bits: 0,
  id: 20,
  len: 0,
  hits: 50,
  q: false,
  bin: '',
  program: 'blast+',
  thr: 1,
  dec: 2,
  auto: false,
  lookupfirst: false,
  dbrbm: true,
  nucl: false,
  len_fraction: 0.0,
  max_actg: 0.95
}
# Without any argument, behave as if help had been requested.
ARGV.push('-h') if ARGV.empty?
OptionParser.new do |opt|
  opt.banner =
    "\nCalculates the Average Amino Acid Identity between two genomes\n\n" \
    "Usage: #{$0} [options]"

  opt.separator ''
  opt.separator 'Mandatory'
  opt.on(
    '-1', '--seq1 FILE',
    'Path to the FastA file (.gz allowed) containing the genome 1 (proteins)'
  ) do |val|
    o[:seq1] = val
  end
  opt.on(
    '-2', '--seq2 FILE',
    'Path to the FastA file (.gz allowed) containing the genome 2 (proteins)'
  ) do |val|
    o[:seq2] = val
  end
  # The NCBI-acc hint depends on whether rest-client could be loaded.
  opt.separator(
    if has_rest_client
      ' Alternatively, you can supply the NCBI-acc of a ' \
      'genome (nucleotides) with the format ncbi:CP014272 instead of files'
    else
      ' Install rest-client to enable NCBI-acc support'
    end
  )

  opt.separator ''
  opt.separator 'Search Options'
  opt.on(
    '-l', '--len INT', Integer,
    "Minimum alignment length (in residues). By default: #{o[:len]}"
  ) do |val|
    o[:len] = val
  end
  opt.on(
    '-L', '--len-fraction NUM', Float,
    'Minimum alignment length as a fraction of the shorter sequence',
    "(range 0-1). By default: #{o[:len_fraction]}"
  ) do |val|
    o[:len_fraction] = val
  end
  opt.on(
    '-i', '--id FLOAT', Float,
    "Minimum alignment identity (in %). By default: #{o[:id]}"
  ) do |val|
    o[:id] = val
  end
  opt.on(
    '-s', '--bitscore FLOAT', Float,
    "Minimum bit score (in bits). By default: #{o[:bits]}"
  ) do |val|
    o[:bits] = val
  end
  opt.on(
    '-n', '--hits INT', Integer,
    "Minimum number of hits. By default: #{o[:hits]}"
  ) do |val|
    o[:hits] = val
  end
  opt.on(
    '-N', '--nucl',
    'The input sequences are nucleotides (genes), not proteins'
  ) do |val|
    o[:nucl] = val
  end
  opt.on(
    '--max-actg FLOAT', Float,
    'Maximum fraction of ACTGN in the sequences before assuming nucleotides',
    "By default: #{o[:max_actg]}"
  ) do |val|
    o[:max_actg] = val
  end

  opt.separator ''
  opt.separator 'Software Options'
  opt.on(
    '-b', '--bin DIR',
    'Path to the directory containing the binaries of the search program'
  ) do |val|
    o[:bin] = val
  end
  opt.on(
    '-p', '--program STR',
    'Search program to be used. One of: blast+ (default), blast, blat, diamond'
  ) do |val|
    o[:program] = val
  end
  opt.on(
    '-t', '--threads INT', Integer,
    "Number of parallel threads to be used. By default: #{o[:thr]}"
  ) do |val|
    o[:thr] = val
  end

  opt.separator ''
  opt.separator 'SQLite3 Options'
  opt.separator ' Install sqlite3 gem to enable database support' unless has_sqlite3
  opt.on(
    '-S', '--sqlite3 FILE',
    'Path to the SQLite3 database to create (or update) with the results'
  ) do |val|
    o[:sqlite3] = val
  end
  opt.on(
    '--name1 STR',
    'Name of --seq1 to use in --sqlite3. By default determined by filename'
  ) do |val|
    o[:seq1name] = val
  end
  opt.on(
    '--name2 STR',
    'Name of --seq2 to use in --sqlite3. By default determined by filename'
  ) do |val|
    o[:seq2name] = val
  end
  opt.on(
    '--[no-]save-rbm',
    "Save (or don't save) the reciprocal best matches in the --sqlite3 db",
    "By default: #{o[:dbrbm]}"
  ) do |val|
    o[:dbrbm] = val
  end
  opt.on(
    '--lookup-first',
    'Indicates if the AAI should be looked up first in the database',
    'Requires --sqlite3, --auto, --name1, and --name2',
    'Incompatible with --res, --tab, --out, and --rbm'
  ) do |val|
    o[:lookupfirst] = val
  end

  opt.separator ''
  opt.separator 'Other Output Options'
  opt.on(
    '-d', '--dec INT', Integer,
    "Decimal positions to report. By default: #{o[:dec]}"
  ) do |val|
    o[:dec] = val
  end
  opt.on(
    '-R', '--rbm FILE',
    'Saves a file with the reciprocal best matches'
  ) do |val|
    o[:rbm] = val
  end
  opt.on(
    '-o', '--out FILE',
    'Saves a file describing the alignments used for two-way AAI'
  ) do |val|
    o[:out] = val
  end
  opt.on('-r', '--res FILE', 'Saves a file with the final results') do |val|
    o[:res] = val
  end
  opt.on(
    '-T', '--tab FILE',
    'Saves a file with the final two-way results in a tab-delimited form',
    'The columns are (in that order):',
    'AAI, standard deviation, proteins used, proteins in the smallest genome'
  ) do |val|
    o[:tab] = val
  end
  opt.on(
    '-a', '--auto',
    'ONLY outputs the AAI value in STDOUT (or nothing, if calculation fails)'
  ) do
    o[:auto] = true
  end
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') do
    o[:q] = true
  end
  opt.on('-h', '--help', 'Display this screen') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!
|
158
|
+
|
159
|
+
# Check input: abort with a one-line message on the first missing or
# conflicting option.
abort '-1 is mandatory' if o[:seq1].nil?
abort '-2 is mandatory' if o[:seq2].nil?
abort '-p diamond is incompatible with -N' if o[:program] == 'diamond' && o[:nucl]
if !o[:sqlite3].nil? && !has_sqlite3
  abort 'SQLite3 requested (-S) but sqlite3 not supported: gem install sqlite3'
end
# A non-empty --bin gets a trailing slash appended.
o[:bin] += '/' unless o[:bin].empty?
if o[:lookupfirst]
  abort '--lookup-first requires --name1' if o[:seq1name].nil?
  abort '--lookup-first requires --name2' if o[:seq2name].nil?
  abort '--lookup-first needs --sqlite3' if o[:sqlite3].nil?
  abort '--lookup-first requires --auto' unless o[:auto]
  # None of the file outputs may be combined with a database look-up.
  %i[res tab out rbm].each do |key|
    abort "--lookup-first conflicts with --#{key}" unless o[key].nil?
  end
end
|
178
|
+
|
179
|
+
# Create SQLite3 file: open (or create) the requested database and make sure
# the two tables written by this script exist.
unless o[:sqlite3].nil?
  $stderr.puts "Accessing SQLite3 file: #{o[:sqlite3]}." unless o[:q]
  sqlite_db = SQLite3::Database.new(o[:sqlite3])
  table_columns = {
    'rbm' => [
      'seq1 varchar(256)', 'seq2 varchar(256)', 'id1 varchar(256)',
      'id2 varchar(256)', 'id float', 'evalue float', 'bitscore float'
    ],
    'aai' => [
      'seq1 varchar(256)', 'seq2 varchar(256)', 'aai float', 'sd float',
      'n int', 'omega int'
    ]
  }
  table_columns.each do |table, columns|
    sqlite_db.execute(
      "create table if not exists #{table}( #{columns.join(', ')} )"
    )
  end
end
|
189
|
+
|
190
|
+
# Look-up first: when requested, print a previously stored AAI for this pair
# of names (tried in both orders) and exit without running any comparison.
if o[:lookupfirst]
  lookup_sql = "select aai from aai where seq1=? and seq2=?"
  name_pair = [o[:seq1name], o[:seq2name]]
  stored = sqlite_db.execute(lookup_sql, name_pair)
  stored = sqlite_db.execute(lookup_sql, name_pair.reverse) if stored.empty?
  unless stored.empty?
    puts stored.first.first
    exit
  end
end
|
201
|
+
|
202
|
+
Dir.mktmpdir do |dir|
|
203
|
+
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
|
204
|
+
|
205
|
+
# Create databases.
# For each of the two inputs: resolve an "ncbi:<acc>" request into a
# downloaded protein FastA file, copy the sequences into "#{dir}/<seq>.fa"
# renamed to consecutive integers, record the per-sequence lengths (and the
# original IDs when they will be reported), and format the search database
# for the selected program.
$stderr.puts "Creating databases." unless o[:q]
minfrg = nil
seq_names = []
seq_len = {}
actg_cnt = {}
ori_ids = {}
[:seq1, :seq2].each do |seq|
  abort "GIs are no longer supported by NCBI. Please use NCBI-acc instead." if
    /^gi:/.match(o[seq])
  name_key = "#{seq}name".to_sym
  acc = /^ncbi:(\S+)/.match(o[seq])
  if acc.nil?
    # Local file: the name defaults to the file name without extension.
    seq_names << (o[name_key].nil? ? File.basename(o[seq], ".*") : o[name_key])
  else
    abort "NCBI-acc requested, but rest-client not supported. First " +
      "install gem rest-client." unless has_rest_client
    abort "NCBI-acc are currently not supported with --nucl. Please use " +
      "ani.rb instead." if o[:nucl]
    $stderr.puts " Downloading dataset from NCBI:#{acc[1]}." unless o[:q]
    link_response = RestClient.get(
      "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
      { params: { db: "protein", dbfrom: "nuccore", id: acc[1], idtype: "acc" } }
    )
    abort "Unable to reach NCBI EUtils, error code " +
      link_response.code.to_s + "." unless link_response.code == 200
    # The first <Id> entry found in the answer is skipped; only the following
    # ones are collected as protein IDs.
    found_ids = link_response.to_str.each_line.grep(/\s<Id>/) do |ln|
      ln[/<Id>(\S+)<\/Id>/, 1]
    end.compact
    prot_ids = found_ids.drop(1)
    # ELink was asked for links into the protein database, so the collected
    # IDs are fetched from that same database.
    fetch_response = RestClient.post(
      "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
      db: "protein", rettype: "fasta", id: prot_ids.join(","), idtype: "acc"
    )
    abort "Unable to reach NCBI EUtils, error code " +
      fetch_response.code.to_s + "." unless fetch_response.code == 200
    # From here on the downloaded file is treated like a user-provided one.
    # The block form guarantees the handle is closed.
    o[seq] = "#{dir}/ncbi-#{seq}.fa"
    File.open(o[seq], "w") { |download| download.puts fetch_response.to_str }
    seq_names << (o[name_key].nil? ? "ncbi:#{acc[1]}" : o[name_key])
  end
  $stderr.puts " Reading FastA file: #{o[seq]}" unless o[:q]
  # Stale rows are keyed by both names, so they can only be purged once the
  # second name is known (i.e., on the second iteration).
  if !o[:sqlite3].nil? && seq_names.size == 2
    sqlite_db.execute "delete from rbm where seq1=? and seq2=?", seq_names
    sqlite_db.execute "delete from aai where seq1=? and seq2=?", seq_names
  end
  # Index 0 is a placeholder: sequences are numbered starting at 1.
  ori_ids[seq] = [nil]
  seq_len[seq] = [0]
  actg_cnt[seq] = 0
  seqs = 0
  # Original IDs are only needed for the RBM file or the SQLite3 database.
  keep_ids = !(o[:rbm].nil? && o[:sqlite3].nil?)
  fi = File.extname(o[seq]) == '.gz' ?
    Zlib::GzipReader.open(o[seq]) :
    File.open(o[seq], 'r')
  begin
    File.open("#{dir}/#{seq}.fa", 'w') do |fo|
      fi.each_line do |ln|
        header = /^>(\S+)/.match(ln)
        if header
          seqs += 1
          ori_ids[seq] << header[1] if keep_ids
          seq_len[seq][seqs] = 0
          fo.puts ">#{seqs}"
        else
          fo.puts ln
          residues = ln.chomp
          seq_len[seq][seqs] += residues.gsub(/[^A-Za-z]/, "").length
          actg_cnt[seq] += residues.gsub(/[^ACTGNactgn]/, "").length
        end
      end
    end
  ensure
    # Close the reader even if the script exits while copying.
    fi.close
  end
  unless o[:nucl]
    # Refuse protein mode when the letters are overwhelmingly A/C/T/G/N.
    actg_frx = actg_cnt[seq].to_f / seq_len[seq].inject(:+).to_f
    abort "Input sequences appear to be nucleotides " +
      "(ACTGN fraction: %.2f%%)." % (actg_frx * 100) if actg_frx > o[:max_actg]
  end
  $stderr.puts " File contains #{seqs} sequences." unless o[:q]
  # Track the number of sequences in the smallest input.
  minfrg = [minfrg, seqs].compact.min
  case o[:program].downcase
  when "blast"
    `"#{o[:bin]}formatdb" -i "#{dir}/#{seq}.fa" \
      -p #{o[:nucl] ? "F" : "T"}`
  when "blast+"
    `"#{o[:bin]}makeblastdb" -in "#{dir}/#{seq}.fa" \
      -dbtype #{o[:nucl] ? "nucl" : "prot"}`
  when "blat"
    # Nothing to do
  when "diamond"
    `"#{o[:bin]}diamond" makedb --in "#{dir}/#{seq}.fa" \
      --db "#{dir}/#{seq}.fa.dmnd" --threads "#{o[:thr]}" \
      --quiet`
  else
    abort "Unsupported program: #{o[:program]}."
  end
end
|
305
|
+
|
306
|
+
# Best-hits: reset the reciprocal-match accumulators and open whichever of
# the optional report files were requested (the handles stay nil otherwise).
$stderr.puts "Running one-way comparisons." unless o[:q]
rbh = []
id2 = sq2 = n2 = 0
fo = o[:out].nil? ? nil : File.open(o[:out], "w")
unless fo.nil?
  out_header = %w(identity aln.len mismatch gap.open evalue bitscore)
  fo.puts out_header.join("\t")
end
res = o[:res].nil? ? nil : File.open(o[:res], "w")
rbm = o[:rbm].nil? ? nil : File.open(o[:rbm], "w")
|
318
|
+
# Run the two one-way searches (1: seq1 vs seq2, 2: seq2 vs seq1). For each
# direction only the first reported hit per query passing all the filters is
# used. Hits of the second direction that mirror a hit of the first one are
# the reciprocal best matches feeding the two-way accumulators.
[1, 2].each do |i|
  qry_seen = []
  q = "#{dir}/seq#{i == 1 ? 2 - 1 : 2}.fa"
  s = "#{dir}/seq#{i == 1 ? 2 : 1}.fa"
  hits_path = "#{dir}/#{i}.tab"
  case o[:program].downcase
  when "blast"
    `"#{o[:bin]}blastall" -p blast#{o[:nucl] ? "n" : "p"} -d "#{s}" \
      -i "#{q}" -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{hits_path}"`
  when "blast+"
    `"#{o[:bin]}blast#{o[:nucl] ? "n" : "p"}" -db "#{s}" -query "#{q}" \
      -max_target_seqs 1 -num_threads #{o[:thr]} -outfmt 6 \
      -out "#{hits_path}"`
  when "blat"
    `"#{o[:bin]}blat" "#{s}" "#{q}" #{"-prot" unless o[:nucl]} -out=blast8 \
      "#{hits_path}.uns"`
    `sort -k 1 "#{hits_path}.uns" > "#{hits_path}"`
  when "diamond"
    `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
      --query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" --quiet \
      && "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt 6 \
      --out "#{hits_path}" --quiet`
  else
    abort "Unsupported program: #{o[:program]}."
  end
  id = 0
  sq = 0
  n = 0
  # Which input holds the queries and which the subjects in this direction.
  qry_set, sbj_set = i == 1 ? [:seq1, :seq2] : [:seq2, :seq1]
  File.open(hits_path, "r") do |fh|
    fh.each_line do |ln|
      row = ln.chomp.split(/\t/)
      qry = row[0].to_i
      sbj = row[1].to_i
      next unless qry_seen[qry].nil?
      # Each threshold is an independent filter: failing any one of them
      # discards the hit.
      next if row[3].to_i < o[:len]
      next if row[2].to_f < o[:id]
      next if row[11].to_f < o[:bits]
      shortest = [seq_len[qry_set][qry], seq_len[sbj_set][sbj]].min
      next if row[3].to_f / shortest < o[:len_fraction]
      qry_seen[qry] = 1
      identity = row[2].to_f
      id += identity
      sq += identity**2
      n += 1
      if i == 1
        rbh[qry] = sbj
      elsif rbh[sbj] == qry
        # Reciprocal best match: here the query is from seq2 and the subject
        # from seq1.
        id2 += identity
        sq2 += identity**2
        n2 += 1
        fo.puts [row[2..5], row[10..11]].join("\t") unless o[:out].nil?
        rbm.puts [ori_ids[:seq1][sbj], ori_ids[:seq2][qry], row[2..5],
          row[8..9], row[6..7], row[10..11]].join("\t") unless o[:rbm].nil?
        if !o[:sqlite3].nil? && o[:dbrbm]
          sqlite_db.execute("insert into rbm values(?,?,?,?,?,?,?)",
            seq_names + [ori_ids[:seq1][sbj], ori_ids[:seq2][qry],
              row[2], row[10], row[11]])
        end
      end
    end
  end
  if n < o[:hits]
    puts "Insufficient hits to estimate one-way AAI: #{n}." unless o[:auto]
    res.puts "Insufficient hits to estimate one-way AAI: #{n}" unless
      o[:res].nil?
  else
    mean = id / n
    # Rounding can push the variance marginally below zero when all the
    # identities are (nearly) equal; a negative Float raised to 0.5 yields a
    # Complex in Ruby, which the format strings below cannot render.
    variance = sq / n - mean**2
    sd = variance > 0 ? variance**0.5 : 0.0
    printf "! One-way AAI %d: %.#{o[:dec]}f%% (SD: %.#{o[:dec]}f%%), " +
      "from %i proteins.\n", i, mean, sd, n unless o[:auto]
    res.puts sprintf("<b>One-way AAI %d:</b> %.#{o[:dec]}f%% " +
      "(SD: %.#{o[:dec]}f%%), from %i proteins.<br/>", i, mean, sd, n) unless
      o[:res].nil?
  end
end
|
393
|
+
# Report the two-way AAI (mean identity of the reciprocal best matches) to
# every requested destination, then close the remaining report files.
rbm.close unless o[:rbm].nil?
if n2 < o[:hits]
  puts "Insufficient hits to estimate two-way AAI: #{n2}" unless o[:auto]
  res.puts "Insufficient hits to estimate two-way AAI: #{n2}" unless
    o[:res].nil?
else
  aai = id2 / n2
  # Rounding can push the variance marginally below zero when all the
  # identities are (nearly) equal; a negative Float raised to 0.5 yields a
  # Complex in Ruby, which neither the format strings nor the float column
  # in the database can take.
  variance = sq2 / n2 - aai**2
  aai_sd = variance > 0 ? variance**0.5 : 0.0
  printf "! Two-way AAI : %.#{o[:dec]}f%% (SD: %.#{o[:dec]}f%%), from %i" +
    " proteins.\n", aai, aai_sd, n2 unless o[:auto]
  res.puts sprintf("<b>Two-way AAI:</b> %.#{o[:dec]}f%% (SD: " +
    "%.#{o[:dec]}f%%), from %i proteins.<br/>", aai, aai_sd, n2) unless
    o[:res].nil?
  unless o[:tab].nil?
    # Columns: AAI, standard deviation, proteins used, and number of
    # sequences in the smallest input.
    File.open(o[:tab], "w") do |tab|
      tab.printf "%.#{o[:dec]}f\t%.#{o[:dec]}f\t%i\t%i\n", aai, aai_sd, n2,
        minfrg
    end
  end
  sqlite_db.execute("insert into aai values(?,?,?,?,?,?)",
    seq_names + [aai, aai_sd, n2, minfrg]) unless o[:sqlite3].nil?
  puts aai if o[:auto]
end
res.close unless o[:res].nil?
fo.close unless o[:out].nil?
|
417
|
+
end
|
418
|
+
|