miga-base 0.7.26.0 → 0.7.26.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +766 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +243 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +67 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +829 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +501 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +156 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +163 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +57 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +418 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm.rb +146 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +50 -0
- data/utils/enveomics/enveomics.R/README.md +80 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +37 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- metadata +277 -4
data/utils/FastAAI/README.md
@@ -0,0 +1,84 @@
+# FastAAI
+Fast estimation of Average Amino Acid Identities (AAI) for bacterial and viral genomes.
+Includes a module for the classification of viral genomes.
+
+## Content Table
+  * [Features](#features)
+  * [Citation](#citation)
+  * [Requirements](#requirements)
+  * [Installation](#installation)
+  * [Usage](#usage)
+  * [FAQs](#faqs)
+  * [License](#license)
+
+## Features
+Coming soon
+
+## Citation
+Coming soon
+
+## Requirements
+- Programs:
+   - [HMMER](http://hmmer.org/) >= 3.1
+   - Python >=3.6,<3.9
+- Base Python Modules:
+   - argparse
+   - datetime
+   - pathlib
+   - shutil
+   - subprocess
+   - gzip
+   - multiprocessing
+   - textwrap
+   - pickle
+   - tempfile
+   - sys
+   - functools
+- Additional Python Modules:
+   - numpy
+
+## Installation
+### Conda Installation
+It appears we need a bunch of pre-requisites to run FastAAI. No worries, their installation using Conda is quite easy. If you don't have Conda, you can install it as follows:
+1. Download Anaconda from https://www.anaconda.com/products/individual.
+2. Run `bash Anaconda-latest-Linux-x86_64.sh` and follow the installation instructions.
+3. Once installed you can run `conda -V`. You should get the version of conda that you installed.
+
+Now, let's add the conda channels required to install the pre-requisites:
+
+```bash
+conda config --add channels conda-forge
+conda config --add channels bioconda
+conda config --add channels cruizperez
+```
+
+Then, create an environment for FastAAI:
+
+```bash
+conda create -n fastaai hmmer prodigal numpy python=3.7 fastaai
+```
+
+And activate it:
+
+```bash
+conda activate fastaai
+```
+
+The main script (FastAAI) should be in your path ready for use!
+This should take care of all of the requirements.
+
+### Pip Installation
+Once you have installed the pre-requisites to run FastAAI, or if you already had them and you are not using Conda, you can install FastAAI using pip:
+
+
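The pip command itself is missing from this version of the README. A plausible invocation is sketched below, assuming the package is published on PyPI under the same `fastaai` name used in the conda command above (an unverified assumption):

```bash
# Hypothetical: assumes a PyPI package named "fastaai" exists,
# mirroring the conda package name used above.
pip install fastaai
```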
+## Usage
+### Database creation
+
+
+## FAQs
+
+
+
+## License
+
+See LICENSE
data/utils/FastAAI/kAAI_v1.0_virus.py
@@ -0,0 +1,1296 @@
+#!/usr/bin/env python
+
+"""
+########################################################################
+# Author: Carlos Ruiz
+# Institution: Georgia Institute of Technology
+# Version: 0.8
+# Date: March 02, 2020
+
+# Description: Calculates the average amino acid identity using k-mers
+from single copy genes. It is a faster version of the regular AAI (Blast
+or Diamond) and the hAAI implemented in MiGA.
+########################################################################
+"""
+
+################################################################################
+"""---0.0 Import Modules---"""
+import subprocess, argparse, multiprocessing, datetime, shutil
+import textwrap, pickle, gzip
+from random import randint
+from pathlib import Path
+from sys import argv
+from sys import exit
+from functools import partial
+from os.path import realpath
+import numpy
+import tempfile
+
+
+################################################################################
+"""---1.0 Define Functions---"""
+# --- Run prodigal ---
+# ------------------------------------------------------
+def run_prodigal(input_file):
+    """
+    Runs prodigal, compares translation tables and stores faa files
+
+    Arguments:
+        input_file -- Path to genome FastA file
+
+    Returns:
+        output -- Path to amino acid fasta result
+    """
+    # Predict proteins with translation tables 4 and 11
+    file_path = Path(input_file)
+    filename = file_path.name
+    folder = file_path.parent
+    protein_output = folder / (filename + '.faa')
+    output_11 = folder / (filename + '.faa.11')
+    temp_output = folder / (filename + '.temp')
+    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_11),
+                     "-p", "meta", "-q", "-o", str(temp_output)])
+    output_4 = folder / (filename + '.faa.4')
+    temp_output = folder / (filename + '.temp')
+    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_4),
+                     "-p", "meta", "-g", "4", "-q", "-o", str(temp_output)])
+
+    # Compare translation tables
+    length_4 = 0
+    length_11 = 0
+    with open(output_4, 'r') as table_4:
+        for line in table_4:
+            if line.startswith(">"):
+                continue
+            else:
+                length_4 += len(line.strip())
+
+    with open(output_11, 'r') as table_11:
+        for line in table_11:
+            if line.startswith(">"):
+                continue
+            else:
+                length_11 += len(line.strip())
+
+    if (length_4 / length_11) >= 1.1:
+        shutil.copy(output_4, protein_output)
+    else:
+        shutil.copy(str(output_11), str(protein_output))
+
+    # Remove intermediate files
+    output_4.unlink()
+    output_11.unlink()
+    temp_output.unlink()
+
+    # Remove stop '*' codons from protein sequences
+    with open(protein_output, 'r') as final_protein, open(temp_output, 'w') as temporal_file:
+        for line in final_protein:
+            if line.startswith(">"):
+                temporal_file.write("{}".format(line))
+            else:
+                line = line.replace('*', '')
+                temporal_file.write("{}".format(line))
+    shutil.copy(str(temp_output), str(protein_output))
+    temp_output.unlink()
+
+    return str(protein_output)
+# ------------------------------------------------------
+
+# --- Run prodigal for viruses ---
+# ------------------------------------------------------
+def run_prodigal_virus(input_file):
+    """
+    Runs prodigal and stores faa files
+
+    Arguments:
+        input_file -- Path to genome FastA file
+
+    Returns:
+        output -- Path to amino acid fasta result
+    """
+    # Predict proteins with the default translation table
+    file_path = Path(input_file)
+    filename = file_path.name
+    folder = file_path.parent
+    protein_output = folder / (filename + '.faa')
+    temp_output = folder / (filename + '.temp')
+    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(protein_output),
+                     "-p", "meta", "-q", "-o", str(temp_output)])
+
+    # Remove intermediate files
+    temp_output.unlink()
+
+    # Remove stop '*' codons from protein sequences
+    with open(protein_output, 'r') as final_protein, open(temp_output, 'w') as temporal_file:
+        for line in final_protein:
+            if line.startswith(">"):
+                temporal_file.write("{}".format(line))
+            else:
+                line = line.replace('*', '')
+                temporal_file.write("{}".format(line))
+    shutil.copy(str(temp_output), str(protein_output))
+    temp_output.unlink()
+
+    return str(protein_output)
+# ------------------------------------------------------
+
+# --- Run hmmsearch ---
+# ------------------------------------------------------
+def run_hmmsearch(input_file):
+    """
+    Runs hmmsearch on the set of SCGs and selects the
+    best Archaea or Bacterial model
+
+    Arguments:
+        input_file -- Path to protein FastA file
+
+    Returns:
+        output -- Path to hmmsearch hits table
+    """
+    file_path = Path(input_file)
+    folder = file_path.parent
+    name = file_path.name
+    hmm_output = folder / (name + '.hmm')
+    temp_output = folder / (name + '.temp')
+    script_path = Path(realpath(__file__))
+    script_dir = script_path.parent
+    hmm_complete_model = script_dir / "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"
+    subprocess.call(["hmmsearch", "--tblout", str(hmm_output), "-o", str(temp_output), "--cut_tc", "--cpu", "1",
+                     str(hmm_complete_model), str(file_path)])
+    temp_output.unlink()
+    return str(hmm_output)
+# ------------------------------------------------------
+
+# --- Filter HMM results for best matches ---
+# ------------------------------------------------------
+def hmm_filter(scg_hmm_file, keep):
+    """
+    Filters HMM results for best hits per protein
+
+    Arguments:
+        SCG_HMM_file {file path} -- Path to HMM results file
+        keep {bool} -- Keep HMM files
+
+    Returns:
+        outfile -- Path to filtered files
+    """
+    hmm_path = Path(scg_hmm_file)
+    name = hmm_path.name
+    folder = hmm_path.parent
+    outfile = folder / (name + '.filt')
+    hmm_hit_dict = {}
+    with open(scg_hmm_file, 'r') as hit_file:
+        for line in hit_file:
+            if line.startswith("#"):
+                continue
+            else:
+                hit = line.strip().split()
+                protein_name = hit[0]
+                score = float(hit[8])
+                if protein_name in hmm_hit_dict:
+                    if score > hmm_hit_dict[protein_name][0]:
+                        hmm_hit_dict[protein_name] = [score, line]
+                    elif score < hmm_hit_dict[protein_name][0]:
+                        continue
+                    else:
+                        if randint(0, 1) > 0:
+                            hmm_hit_dict[protein_name] = [score, line]
+                        else:
+                            hmm_hit_dict[protein_name] = [score, line]
+    with open(outfile, 'w') as output:
+        for hits in hmm_hit_dict.values():
+            output.write("{}".format(hits[1]))
+    return str(outfile)
+# ------------------------------------------------------
+
+# --- Find Kmers from HMM results ---
+# ------------------------------------------------------
+def kmer_extract(input_files):
+    """
+    Extract kmers from protein files that have hits
+    in the HMM searches.
+
+    Arguments:
+        SCG_HMM_file {file path} -- Path to filtered HMM results.
+
+    Returns:
+        [genome_kmers] -- Dictionary of kmers per gene.
+    """
+    final_filename = input_files[0]
+    protein_file = input_files[1]
+    scg_hmm_file = input_files[2]
+    positive_matches = {}
+    positive_proteins = []
+    with open(scg_hmm_file, 'r') as hmm_input:
+        for line in hmm_input:
+            line = line.strip().split()
+            protein_name = line[0]
+            model_name = line[3]
+            score = float(line[8])
+            if model_name in positive_matches:
+                if score > positive_matches[model_name][1]:
+                    positive_matches[model_name] = [protein_name, score]
+                else:
+                    continue
+            else:
+                positive_matches[model_name] = [protein_name, score]
+    for proteins in positive_matches.values():
+        positive_proteins.append(proteins[0])
+    scg_kmers = read_kmers_from_file(protein_file, positive_proteins, 4)
+    for accession, protein in positive_matches.items():
+        scg_kmers[accession] = scg_kmers.pop(protein[0])
+    genome_kmers = {final_filename : scg_kmers}
+    return genome_kmers
+# ------------------------------------------------------
+
+# --- Extract kmers from protein sequences ---
+# ------------------------------------------------------
+def read_kmers_from_file(filename, positive_hits, ksize):
+    scg_kmers = {}
+    store_sequence = False
+    protein_name = ""
+    protein_sequence = ""
+    with open(filename) as fasta_in:
+        for line in fasta_in:
+            if line.startswith(">"):
+                if store_sequence == True:
+                    kmers = build_kmers(protein_sequence, ksize)
+                    scg_kmers[protein_name] = kmers
+                protein_sequence = ""
+                store_sequence = False
+                line = line.replace(">", "")
+                protein_name = line.strip().split()[0]
+                if protein_name in positive_hits:
+                    store_sequence = True
+            else:
+                if store_sequence == True:
+                    protein_sequence += line.strip()
+                else:
+                    continue
+    if store_sequence == True:
+        kmers = build_kmers(protein_sequence, ksize)
+        scg_kmers[protein_name] = kmers
+    return scg_kmers
+# ------------------------------------------------------
+
+# --- Extract kmers from viral protein sequences ---
+# ------------------------------------------------------
+def read_viral_kmers_from_file(input_information):
+    final_filename = input_information[0]
+    protein_file = input_information[1]
+    kmer_size = input_information[2]
+    scg_kmers = set()
+    protein_sequence = ""
+    store_sequence = False
+    with open(protein_file) as fasta_in:
+        for line in fasta_in:
+            if line.startswith(">"):
+                if store_sequence == True:
+                    kmers = build_kmers(protein_sequence, kmer_size)
+                    kmers = set(kmers.split(","))
+                    scg_kmers.update(kmers)
+                    protein_sequence = ""
+                else:
+                    protein_sequence = ""
+                    store_sequence = True
+            else:
+                protein_sequence += line.strip()
+    genome_kmers = {final_filename : list(scg_kmers)}
+    return genome_kmers
+# ------------------------------------------------------
+
+# --- Build Kmers ---
+# ------------------------------------------------------
+def build_kmers(sequence, ksize):
+    kmers = []
+    n_kmers = len(sequence) - ksize + 1
+
+    for i in range(n_kmers):
+        kmer = sequence[i:i + ksize]
+        kmers.append(kmer)
+    kmers_set = ','.join(set(kmers))
+    return kmers_set
+# ------------------------------------------------------
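As a quick illustration of the extraction step above, here is a minimal editorial sketch (not part of the package) of what `build_kmers` returns for a short peptide with the 4-mer size that `kmer_extract` uses:

```python
# Editorial sketch: a 4-wide window slides over the sequence and the
# unique k-mers are joined into one comma-separated string. Because a
# set() is joined, the order within the string is not guaranteed.
def build_kmers(sequence, ksize):
    kmers = [sequence[i:i + ksize] for i in range(len(sequence) - ksize + 1)]
    return ','.join(set(kmers))

print(sorted(build_kmers("MKVLINGKTL", 4).split(',')))
# ['GKTL', 'INGK', 'KVLI', 'LING', 'MKVL', 'NGKT', 'VLIN']
```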
+
+# --- Parse kAAI when query == reference ---
+#Carlos, This function is not used with the new changes
+# ------------------------------------------------------
+def single_kaai_parser(query_id):
+    """
+    Calculates Jaccard distances on kmers from proteins shared
+
+    Arguments:
+        query_id {str} -- Id of the query genome
+
+    Returns:
+        [Path to output] -- Path to output file
+    """
+    file_path = Path(query_id)
+
+    #Carlos, tempdir for safety
+    tmp_folder = tempfile.TemporaryDirectory()
+    running_folder = tmp_folder.name
+
+
+    temp_output = running_folder / file_path.with_suffix('.aai.temp')
+    # Get number and list of SCG detected in query
+    query_num_scg = len(query_kmer_dictionary[query_id])
+    query_scg_list = query_kmer_dictionary[query_id].keys()
+    # Start comparison with all genomes in the query dictionary
+    with open(temp_output, 'w') as out_file:
+        for target_genome, scg_ids in query_kmer_dictionary.items():
+            jaccard_similarities = []
+            # Get number and list of SCG detected in reference
+            target_num_scg = len(scg_ids)
+            target_scg_list = scg_ids.keys()
+            # Choose the smallest set of proteins
+            if query_num_scg > target_num_scg:
+                final_scg_list = target_scg_list
+            else:
+                final_scg_list = query_scg_list
+            # Compare all the proteins in the final SCG list
+            for accession in final_scg_list:
+                if accession in query_scg_list and accession in target_scg_list:
+                    # Get set and list for each SCG accession
+                    kmers_query = set(query_kmer_dictionary[query_id][accession].split(','))
+                    kmers_target = query_kmer_dictionary[target_genome][accession].split(',')
+                    # Calculate jaccard_similarity
+                    intersection = len(kmers_query.intersection(kmers_target))
+                    union = len(kmers_query.union(kmers_target))
+                    jaccard_similarities.append(intersection / union)
+                else:
+                    continue
+            try:
+                n = len(jaccard_similarities)
+                mean = sum(jaccard_similarities)/n
+                var = sum([ (x - mean)**2 for x in jaccard_similarities ])/(n - 1)
+                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
+                               round(mean, 4), round(var**0.5, 4),
+                               len(jaccard_similarities), len(final_scg_list)))
+            except:
+                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
+                               "NA", "NA", "NA", "NA"))
+
+    return temp_output
+# ------------------------------------------------------
+
+# --- Parse viral kAAI when query == reference ---
+# ------------------------------------------------------
+def single_virus_kaai_parser(query_id):
+    """
+    Calculates Jaccard distances on kmers from viral proteins
+
+    Arguments:
+        query_id {str} -- Id of the query genome
+
+    Returns:
+        [Path to output] -- Path to output file
+    """
+    file_path = Path(query_id)
+
+    #Carlos, tempdir for safety
+    tmp_folder = tempfile.TemporaryDirectory()
+    running_folder = tmp_folder.name
+
+
+    temp_output = running_folder / file_path.with_suffix('.aai.temp')
+    # Start comparison with all genomes in the query dictionary
+    with open(temp_output, 'w') as out_file:
+        for target_genome, kmers_target in query_kmer_dictionary.items():
+            jaccard_index = None
+            kmers_query = set(query_kmer_dictionary[query_id])
+            intersection = len(kmers_query.intersection(kmers_target))
+            union = len(kmers_query.union(kmers_target))
+            try:
+                jaccard_index = intersection / union
+                out_file.write("{}\t{}\t{}\n".format(query_id, target_genome, jaccard_index))
+            except:
+                out_file.write("{}\t{}\tNA\n".format(query_id, target_genome))
+    return temp_output
+# ------------------------------------------------------
+
+# --- Parse kAAI when query != reference ---
+# ------------------------------------------------------
+def double_kaai_parser(query_id):
+    """
+    Calculates Jaccard distances on kmers from proteins shared
+
+    Arguments:
+        query_id {str} -- Id of the query genome
+
+    Returns:
+        [Path to output] -- Path to output file
+    """
+    file_path = Path(query_id)
+
+    #Carlos, tempdir for safety
+    tmp_folder = tempfile.TemporaryDirectory()
+    running_folder = tmp_folder.name
+
+
+    temp_output = running_folder / file_path.with_suffix('.aai.temp')
+    # Get number and list of SCG detected in query
+    query_num_scg = len(query_kmer_dictionary[query_id])
+    query_scg_list = query_kmer_dictionary[query_id].keys()
+    # Start comparison with all genomes in the query dictionary
+    with open(temp_output, 'w') as out_file:
+        for target_genome, scg_ids in ref_kmer_dictionary.items():
+            jaccard_similarities = []
+            # Get number and list of SCG detected in reference
+            target_num_scg = len(scg_ids)
+            target_scg_list = scg_ids.keys()
+            # Choose the smallest set of proteins
+            if query_num_scg > target_num_scg:
+                final_scg_list = target_scg_list
+            else:
+                final_scg_list = query_scg_list
+            # Compare all the proteins in the final SCG list
+            for accession in final_scg_list:
+                if accession in query_scg_list and accession in target_scg_list:
+                    # Get set and list for each SCG accession
+                    kmers_query = set(query_kmer_dictionary[query_id][accession].split(','))
+                    kmers_target = ref_kmer_dictionary[target_genome][accession].split(',')
+                    # Calculate jaccard_similarity
+                    intersection = len(kmers_query.intersection(kmers_target))
+                    union = len(kmers_query.union(kmers_target))
+                    jaccard_similarities.append(intersection / union)
+                else:
+                    continue
+            try:
+                n = len(jaccard_similarities)
+                mean = sum(jaccard_similarities)/n
+                var = sum([ (x - mean)**2 for x in jaccard_similarities ])/(n - 1)
+                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
+                               round(mean, 4), round(var**0.5, 4),
+                               len(jaccard_similarities), len(final_scg_list)))
+            except:
+                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
+                               "NA", "NA", "NA", "NA"))
+    return temp_output
+# ------------------------------------------------------
+
+# --- Parse viral kAAI when query != reference ---
+# ------------------------------------------------------
+def double_viral_kaai_parser(query_id):
+    """
+    Calculates Jaccard distances on kmers from viral proteins
+
+    Arguments:
+        query_id {str} -- Id of the query genome
+
+    Returns:
+        [Path to output] -- Path to output file
+    """
+    file_path = Path(query_id)
+
+    #Carlos, tempdir for safety
+    tmp_folder = tempfile.TemporaryDirectory()
+    running_folder = tmp_folder.name
+
+
+    temp_output = running_folder / file_path.with_suffix('.aai.temp')
+    # Start comparison with all genomes in the query dictionary
+    with open(temp_output, 'w') as out_file:
+        for target_genome, kmers_target in ref_kmer_dictionary.items():
+            jaccard_index = None
+            kmers_query = set(query_kmer_dictionary[query_id])
+            intersection = len(kmers_query.intersection(kmers_target))
+            union = len(kmers_query.union(kmers_target))
+            try:
+                jaccard_index = intersection / union
+                out_file.write("{}\t{}\t{}\n".format(query_id, target_genome, jaccard_index))
+            except:
+                out_file.write("{}\t{}\tNA\n".format(query_id, target_genome))
+    return temp_output
+# ------------------------------------------------------
+
+# --- Query == Reference initializer function ---
+# ------------------------------------------------------
+def single_dictionary_initializer(_dictionary):
+    """
+    Make dictionary available for multiprocessing
+    """
+    global query_kmer_dictionary
+    query_kmer_dictionary = _dictionary
+# ------------------------------------------------------
+
+# --- Query != Reference initializer function ---
+# ------------------------------------------------------
+def two_dictionary_initializer(_query_dictionary, _ref_dictionary):
+    """
+    Make dictionary available for multiprocessing
+    """
+    global query_kmer_dictionary
+    global ref_kmer_dictionary
+    query_kmer_dictionary = _query_dictionary
+    ref_kmer_dictionary = _ref_dictionary
+# ------------------------------------------------------
+
+# --- Merge kmer dictionaries ---
+# ------------------------------------------------------
+def merge_dicts(dictionaries):
+    """
+    Given any number of dicts, shallow copy and merge into a new dict,
+    precedence goes to key value pairs in latter dicts.
+    """
+    result = {}
+    for kmer_dictionary in dictionaries:
+        result.update(kmer_dictionary)
+    return result
+# ------------------------------------------------------
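A tiny editorial usage sketch of the merge behavior the docstring describes: later dictionaries win on key collisions because `dict.update` overwrites existing keys.

```python
# Editorial sketch: the second dict's value for "genome_A" replaces the
# first one's, exactly as dict.update does on each pass of the loop.
def merge_dicts(dictionaries):
    result = {}
    for kmer_dictionary in dictionaries:
        result.update(kmer_dictionary)
    return result

merged = merge_dicts([{"genome_A": 1, "genome_B": 2}, {"genome_A": 3}])
print(merged)  # {'genome_A': 3, 'genome_B': 2}
```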
+
+
+#My version 1 - numpy-ized
+def single_kaai_parser_all_v_all(args):
+    """
+    Calculates Jaccard distances on kmers from proteins shared
+
+    Arguments:
+        query_id {str} -- Id of the query genome
+
+    Returns:
+        [Path to output] -- Path to output file
+    """
+    #Use split as slice if true
+
+    query_id = args[0]
+    skip_first_n = args[1]
+
+    file_path = Path(query_id)
+
+    tmp_folder = tempfile.TemporaryDirectory()
+    running_folder = tmp_folder.name
+
+    #Just for my own testing. Temp dir is definitely the correct choice, here.
+    #running_folder = Path("faster_kaai")
+
+    temp_output = running_folder / file_path.with_suffix('.aai.temp')
+
+
+    #The goal is to numpy-ize the following loop in all possible aspects for a (hopeful) speed increase
+
+
+    #query_num_scg = len(query_kmer_dictionary[query_id])
+
+    query_scg_list = numpy.array(list(query_kmer_dictionary[query_id].keys()))
+
+    with open(temp_output, 'w') as out_file:
+
+        '''
+        Target genomes each control a set of protein family keys
+
+        The goal is to get the jaccard index for the kmers in all cases
+        of shared protein families for the two genomes in question, for
+        each pair of genomes
+
+        From above, we have the number of proteins in the query dict
+        and a list of the IDs
+
+        below we get the number of proteins in the target dict
+        and a list of the IDs
+
+        1 choose the shorter list (each item has to be in both to be used, after all)
+        2 check if each family is in both lists
+        (kind of an unnecessarily big search cost, yeah? O(n) time with very few n = 1 cases; maybe we can make a dict of dicts of IDs, and check with try: [ID] except: ?)
+        3 get all of the jaccard similarities for kmers in shared protein families
+
+        4 calculate the mean and variance for each similarity set
+
+        5 repeat for the remaining genomes.
+
+        '''
+
+        #for target_genome, scg_ids in query_kmer_dictionary.items():
+        for target_genome in list(query_kmer_dictionary.keys())[skip_first_n:]:
+            scg_ids = query_kmer_dictionary[target_genome]
+
+            #If self, 1.0 similarity.
+            if query_id == target_genome:
+                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
+                               1.0, 0.0,
+                               len(query_scg_list), len(query_scg_list)))
+                continue
+
+            jaccard_similarities = []
+            # Get number and list of SCG detected in reference
+            #target_num_scg = len(scg_ids)
+            target_scg_list = numpy.array(list(scg_ids.keys()))
+
+            final_scg_list = numpy.intersect1d(query_scg_list, target_scg_list)
+
+            #I would like to figure out how to vectorize this.
+            for accession in final_scg_list:
+                #Because of the prep work, these are already numpy arrays of numbers keying to the kmers they represent from the old kmer dict..
+                kmers_query = query_kmer_dictionary[query_id][accession]
+                kmers_target = query_kmer_dictionary[target_genome][accession]
+
+                # Calculate jaccard_similarity - intersection is by far the slowest step, so this is by far the best place to optimize.
+                if len(kmers_query) < len(kmers_target):
+                    intersection = len(intersect1d_searchsorted(kmers_query, kmers_target))
+                else:
+                    intersection = len(intersect1d_searchsorted(kmers_target, kmers_query))
+
+                union = len(numpy.union1d(kmers_query, kmers_target))
+                jaccard_similarities.append(intersection / union)
+
+            #Allow for numpy in-builts; they're a little faster.
+            jaccard_similarities = numpy.array(jaccard_similarities, dtype=numpy.float_)
+
+            try:
+                #No longer needed.
+                #n = len(jaccard_similarities)
+                mean = numpy.mean(jaccard_similarities)
+                var = numpy.std(jaccard_similarities)
+                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
+                               round(mean, 4), round(var, 4),
+                               len(jaccard_similarities), len(final_scg_list)))
+            except:
+                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
+                               "NA", "NA", "NA", "NA"))
+    return temp_output
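To make the strategy comment inside the function concrete, here is a small worked editorial example (made-up k-mer indices) of the per-protein Jaccard similarity that the loop accumulates and then averages:

```python
import numpy

# Editorial sketch: two genomes share one protein family whose k-mers
# have already been mapped to sorted integer indices (see the
# numpyize_kmers pipeline below).
kmers_query = numpy.array([1, 4, 7, 9], dtype=numpy.int32)
kmers_target = numpy.array([1, 4, 8, 9, 12], dtype=numpy.int32)

intersection = len(numpy.intersect1d(kmers_query, kmers_target))  # {1, 4, 9} -> 3
union = len(numpy.union1d(kmers_query, kmers_target))             # {1, 4, 7, 8, 9, 12} -> 6
print(intersection / union)  # 0.5; the script averages this over all shared families
```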
+
+
+def initializer_tracker(_dictionary1, _dictionary2):
+    """
+    Make dictionary available for multiprocessing
+    """
+    global kmer_dict
+    global tracker_dict
+    kmer_dict = _dictionary1
+    tracker_dict = _dictionary2
+
+
+def unique_kmers(kmer_dict):
+
+    tracker_dict = {}
+
+    counter = 0
+
+    for file in kmer_dict:
+        for id in kmer_dict[file]:
+            #These are the actual kmers
+            for kmer in kmer_dict[file][id].split(','):
+                #Hash might be fast?
+                try:
+                    tracker_dict[kmer]
+                except:
+                    tracker_dict[kmer] = counter
+                    counter += 1
+
+    return tracker_dict
+
+
+def convert_kmers_to_indices(kmer_dict):
+    for genome in kmer_dict:
+        inner_count = 0
+        cur_tup = string_to_tup(genome)
+        for pf in kmer_dict[genome]:
+            kmer_dict[genome][pf] = cur_tup[inner_count]
+            inner_count += 1
+
+    return kmer_dict
+
+def string_to_tup(genome):
+    sets = []
+    for pf in kmer_dict[genome]:
+        curset = []
+        for kmer in kmer_dict[genome][pf].split(","):
+            curset.append(tracker_dict[kmer])
+
+        #Do all the overhead here, ONCE.
+        sets.append(numpy.sort(numpy.unique(numpy.array(curset, dtype=numpy.int32))))
+
+    return(sets)
+
+def numpyize_kmers(kmer_dict):
+    #make kmer global for tracker
+    single_dictionary_initializer(kmer_dict)
+    #get a list of kmer - index for all unique kmers
+    print("Indexing unique kmers")
+    tracker = unique_kmers(kmer_dict)
+    #Make these global for other functions
+    initializer_tracker(kmer_dict, tracker)
+    #convert comma sep. strings of kmers to ascending sorted lists of unique integers corresponding to the kmers in each protein, for each genome
+    print("Keying kmers")
+    kmer_dict = convert_kmers_to_indices(kmer_dict)
+
+    #Get skip indices
+    smartargs = []
+    genome_ids = list(kmer_dict.keys())
+    for i in range(0, len(genome_ids)):
+        smartargs.append([genome_ids[i], i])
+
+    print("Beginning AAI calculations now.")
+
+    return kmer_dict, smartargs
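The helpers above implement the "keying" step: every distinct k-mer string gets a global integer id, and each protein's comma-separated k-mer string becomes a sorted, de-duplicated int32 array. A self-contained editorial toy run of that idea (real inputs come from the kmer dictionaries built earlier):

```python
import numpy

# Editorial sketch of the keying step on a toy kmer dict.
kmer_dict = {"genome_A": {"PF1": "MKVL,KVLI,VLIN"},
             "genome_B": {"PF1": "KVLI,VLIN,LINT"}}

# Like unique_kmers: assign each distinct k-mer an integer id.
tracker, counter = {}, 0
for genome in kmer_dict:
    for pf in kmer_dict[genome]:
        for kmer in kmer_dict[genome][pf].split(','):
            if kmer not in tracker:
                tracker[kmer] = counter
                counter += 1

# Like convert_kmers_to_indices: replace each string with a sorted unique array.
for genome in kmer_dict:
    for pf in kmer_dict[genome]:
        idx = [tracker[k] for k in kmer_dict[genome][pf].split(',')]
        kmer_dict[genome][pf] = numpy.sort(numpy.unique(numpy.array(idx, dtype=numpy.int32)))

print(tracker)                       # {'MKVL': 0, 'KVLI': 1, 'VLIN': 2, 'LINT': 3}
print(kmer_dict["genome_B"]["PF1"])  # [1 2 3]
```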
+
+#relies on assuming that the values in both of these arrays are unique and sorted, which I do in str_to_tup
+def intersect1d_searchsorted(A,B):
+    idx = numpy.searchsorted(B,A)
+    idx[idx==len(B)] = 0
+    return A[B[idx] == A]
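The `searchsorted` trick above is a faster intersection for arrays that are already sorted and unique: it finds where each element of A would land in B, clamps out-of-range positions to 0 (those elements can never match anyway), and keeps the elements of A that equal the B value at that position. A quick editorial check against `numpy.intersect1d`:

```python
import numpy

def intersect1d_searchsorted(A, B):
    idx = numpy.searchsorted(B, A)
    idx[idx == len(B)] = 0  # clamp out-of-range lookups; they cannot match
    return A[B[idx] == A]

A = numpy.array([1, 4, 7, 9])      # sorted, unique (guaranteed by string_to_tup)
B = numpy.array([1, 4, 8, 9, 12])  # sorted, unique
print(intersect1d_searchsorted(A, B))  # [1 4 9]
print(numpy.intersect1d(A, B))         # [1 4 9] -- same result
```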
+
+
+################################################################################
+"""---2.0 Main Function---"""
+
+def main():
+    # Setup parser for arguments.
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
+        description='''This script calculates the average amino acid identity using k-mers\n'''
+                    '''from single copy genes. It is a faster version of the regular AAI '''
+                    '''(Blast or Diamond) and the hAAI implemented in MiGA.'''
+                    '''Usage: ''' + argv[0] + ''' -p [Protein Files] -t [Threads] -o [Output]\n'''
+                    '''Global mandatory parameters: -g [Genome Files] OR -p [Protein Files] OR -s [SCG HMM Results] -o [AAI Table Output]\n'''
+                    '''Optional Database Parameters: See ''' + argv[0] + ' -h')
+    mandatory_options = parser.add_argument_group('Mandatory i/o options. You must select an option for the queries and one for the references.')
+    mandatory_options.add_argument('--qg', dest='query_genomes', action='store', required=False,
+                                   help='File with list of query genomes.')
+    mandatory_options.add_argument('--qp', dest='query_proteins', action='store', required=False,
+                                   help='File with list of query proteins.')
+    mandatory_options.add_argument('--qh', dest='query_hmms', action='store', required=False,
+                                   help=textwrap.dedent('''
+                                   File with list of pre-computed query hmmsearch results.
+                                   If you select this option you must also provide a file with
+                                   a list of protein files for the queries (with --qp).
+                                   '''))
+    mandatory_options.add_argument('--qd', dest='query_database', action='store', required=False,
+                                   help='File with list of pre-indexed query databases.')
+    mandatory_options.add_argument('--rg', dest='reference_genomes', action='store', required=False,
+                                   help='File with list of reference genomes.')
+    mandatory_options.add_argument('--rp', dest='reference_proteins', action='store', required=False,
+                                   help='File with list of reference proteins.')
+    mandatory_options.add_argument('--rh', dest='reference_hmms', action='store', required=False,
+                                   help=textwrap.dedent('''
+                                   File with list of pre-computed reference hmmsearch results.
+                                   If you select this option you must also provide a file with
+                                   a list of protein files for the references (with --rp).
+                                   '''))
+    mandatory_options.add_argument('--rd', dest='reference_database', action='store', required=False,
+                                   help='File with list of pre-indexed reference databases.')
+    mandatory_options.add_argument('-o', '--output', dest='output', action='store', required=False, help='Output file. By default kaai_comparisons.txt')
+    additional_input_options = parser.add_argument_group('Behavior modification options.')
+    additional_input_options.add_argument('-e', '--ext', dest='extension', action='store', required=False,
+                                          help='Extension to remove from original filename, e.g. ".fasta"')
+    additional_input_options.add_argument('-i', '--index', dest='index_db', action='store_true', required=False,
+                                          help='Only index and store databases, i.e., do not perform comparisons.')
+    misc_options = parser.add_argument_group('Miscellaneous options')
+    misc_options.add_argument('--virus', dest='virus', action='store_true', required=False,
+                              help='Toggle virus-virus comparisons. Use only with viral genomes or proteins.')
+    misc_options.add_argument('-t', '--threads', dest='threads', action='store', default=1, type=int, required=False,
+                              help='Number of threads to use, by default 1')
+    misc_options.add_argument('-k', '--keep', dest='keep', action='store_false', required=False,
+                              help='Keep intermediate files, by default true')
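For orientation, a hypothetical invocation built only from the options defined above (the file names are placeholders):

```bash
# Hypothetical example run; query_list.txt and ref_list.txt are
# plain-text files with one genome FastA path per line.
python kAAI_v1.0_virus.py --qg query_list.txt --rg ref_list.txt -t 4 -o kaai_results.txt
```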
+
+    args = parser.parse_args()
+
+    query_genomes = args.query_genomes
+    reference_genomes = args.reference_genomes
+    query_proteins = args.query_proteins
+    reference_proteins = args.reference_proteins
+    query_hmms = args.query_hmms
+    reference_hmms = args.reference_hmms
+    query_database = args.query_database
+    reference_database = args.reference_database
+    output = args.output
+    if output == None:
+        output = "kaai_comparisons.txt"
+    extension = args.extension
+    index_db = args.index_db
+    threads = args.threads
+    keep = args.keep
+    virus = args.virus
+
+    print("kAAI started on {}".format(datetime.datetime.now()))
+    # Check user input
+    # ------------------------------------------------------
+    # Check if no query was provided
+    if query_genomes == None and query_proteins == None and query_hmms == None and query_database == None:
+        exit('Please provide a file with a list of queries, e.g., --qg, --qp, --qh, or --qd')
+    # Check query inputs
+    query_input = None
+    if query_hmms != None:
+        if virus == True:
+            exit("If you are comparing viruses, please start from the genome or protein files.")
+        query_input = query_hmms
+        if query_proteins != None:
+            print("Starting from query hmmsearch results.")
+            print("You also provided the list of protein files used for hmmsearch.")
+        elif query_proteins == None:
+            print("You chose to start from pre-computed hmmsearch results for your queries (--qh).")
+            print("However, I also need the location of the query proteins used for hmmsearch.")
+            exit("Please provide them with --qp.")
+    elif query_proteins != None:
+        query_input = query_proteins
+        print("Starting from query proteins.")
+    elif query_genomes != None:
+        query_input = query_genomes
+        print("Starting from query genomes.")
+    elif query_database != None:
+        query_input = query_database
+        print("Starting from the pre-indexed query database.")
+    # Check if no reference was provided
+    if reference_genomes == None and reference_proteins == None and reference_hmms == None and reference_database == None:
+        exit('Please provide a file with a list of references, e.g., --rg, --rp, --rh, or --rd')
+    # Check reference inputs
+    reference_input = None
+    if reference_hmms != None:
+        if virus == True:
+            exit("If you are comparing viruses, please start from the genome or protein files.")
+        reference_input = reference_hmms
+        if reference_proteins != None:
+            print("Starting from reference hmmsearch results.")
+            print("You also provided the list of protein files used for hmmsearch.")
+        elif reference_proteins == None:
+            print("You chose to start from pre-computed hmmsearch results for your references (--rh).")
print("However, I also need the location of the query proteins used for hmmsearch.")
|
847
|
+
exit("Please provide them with --rp.")
|
848
|
+
elif reference_proteins != None:
|
849
|
+
reference_input = reference_proteins
|
850
|
+
print("Starting from reference proteins.")
|
851
|
+
elif reference_genomes != None:
|
852
|
+
reference_input = reference_genomes
|
853
|
+
print("Starting from reference genomes.")
|
854
|
+
elif reference_database != None:
|
855
|
+
reference_input = reference_database
|
856
|
+
print("Starting from the pre-indexed reference database.")
|
857
|
+
# ------------------------------------------------------
|
858
|
+
|
859
|
+
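
The two blocks above encode a fixed precedence among starting points: hmmsearch results, then proteins, then genomes, then a pre-indexed database. A condensed restatement of that logic, not part of the script:

# Sketch only: mirrors the precedence of the if/elif chains above.
def pick_start_point(hmms, proteins, genomes, database):
    for label, value in (("hmms", hmms), ("proteins", proteins),
                         ("genomes", genomes), ("database", database)):
        if value is not None:
            return label, value
    raise SystemExit("Please provide at least one input list.")

# Example: proteins win over genomes when both are given.
print(pick_start_point(None, "prots.txt", "genomes.txt", None))
# -> ('proteins', 'prots.txt')
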
    # Check if queries are the same as references (an all-vs-all comparison)
    # ------------------------------------------------------
    same_inputs = False
    if query_input == reference_input:
        same_inputs = True
    if same_inputs == True:
        print('You specified the same query and reference files.')
        print('I will perform an all vs all comparison :)')
    # ------------------------------------------------------
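
Worth noting: `same_inputs` above is decided by comparing the two path strings, not the file contents, so two spellings of the same list do not trigger the all-vs-all mode. Illustration only, not part of the script:

import os

# Two paths that point to the same file still compare unequal as strings.
print("./queries.txt" == "queries.txt")  # False
# os.path.realpath normalizes them before comparing:
print(os.path.realpath("./queries.txt") == os.path.realpath("queries.txt"))  # True
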
    #* Database Parsing is the same regardless of bacterial or viral genomes
    # If using pre-indexed databases, check if they are valid files.
    # ------------------------------------------------------
    # If any of the starting points is a database, then store the
    # kmer structures in the corresponding dictionaries.
    # Otherwise read the file list and get the filenames.
    query_kmer_dict = None
    query_kmer_dict_list = []
    reference_kmer_dict = None
    reference_kmer_dict_list = []
    # If starting from database and query == reference
    if same_inputs == True:
        if query_database != None:
            with open(query_database) as query_database_files:
                for db_location in query_database_files:
                    if Path(db_location.strip()).is_file():
                        with gzip.open(db_location.strip(), 'rb') as database_handle:
                            temp_dict = pickle.load(database_handle)
                            if isinstance(temp_dict, dict):
                                query_kmer_dict_list.append(temp_dict)
                                # Carlos, this line serves no purpose but does take a bunch of time and mem.
                                # print(query_kmer_dict_list)
                            else:
                                exit("One of the database files appears to have the wrong format. Please provide correctly formatted databases.")
            query_kmer_dict = merge_dicts(query_kmer_dict_list)
    else:
        # If the inputs are not the same:
        # If query and ref are provided
        if query_database != None and reference_database != None:
            with open(query_database, 'r') as query_database_files:
                for db_location in query_database_files:
                    if Path(db_location.strip()).is_file():
                        with gzip.open(db_location.strip(), 'rb') as database_handle:
                            temp_dict = pickle.load(database_handle)
                            if isinstance(temp_dict, dict):
                                query_kmer_dict_list.append(temp_dict)
                            else:
                                exit("One of the query database files appears to have the wrong format. Please provide correctly formatted databases.")
            query_kmer_dict = merge_dicts(query_kmer_dict_list)
            with open(reference_database) as reference_database_files:
                for db_location in reference_database_files:
                    if Path(db_location.strip()).is_file():
                        with gzip.open(db_location.strip(), 'rb') as database_handle:
                            temp_dict = pickle.load(database_handle)
                            if isinstance(temp_dict, dict):
                                reference_kmer_dict_list.append(temp_dict)
                            else:
                                exit("One of the reference database files appears to have the wrong format. Please provide correctly formatted databases.")
            reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
        # If only the query has a db
        elif query_database != None and reference_database == None:
            with open(query_database) as query_database_files:
                for db_location in query_database_files:
                    if Path(db_location.strip()).is_file():
                        with gzip.open(db_location.strip(), 'rb') as database_handle:
                            temp_dict = pickle.load(database_handle)
                            if isinstance(temp_dict, dict):
                                query_kmer_dict_list.append(temp_dict)
                            else:
                                exit("One of the query database files appears to have the wrong format. Please provide correctly formatted databases.")
            query_kmer_dict = merge_dicts(query_kmer_dict_list)
        # If only the reference has a db
        elif query_database == None and reference_database != None:
            with open(reference_database) as reference_database_files:
                for db_location in reference_database_files:
                    if Path(db_location.strip()).is_file():
                        with gzip.open(db_location.strip(), 'rb') as database_handle:
                            temp_dict = pickle.load(database_handle)
                            if isinstance(temp_dict, dict):
                                reference_kmer_dict_list.append(temp_dict)
                            else:
                                exit("One of the reference database files appears to have the wrong format. Please provide correctly formatted databases.")
            reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
    # ------------------------------------------------------
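
`merge_dicts` is defined earlier in this script. The sketch below is a hypothetical stand-in, not the script's implementation; it only illustrates the semantics the loading code above relies on, a plain union of the per-file kmer dictionaries, with later files overwriting duplicate genome keys:

# Hypothetical stand-in for merge_dicts, shown for orientation only.
def merge_dicts_sketch(dictionaries):
    merged = {}
    for dictionary in dictionaries:
        merged.update(dictionary)  # later entries overwrite duplicate keys
    return merged

print(merge_dicts_sketch([{"genome_a": 1}, {"genome_b": 2}]))
# -> {'genome_a': 1, 'genome_b': 2}
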
    # Get files from the query and reference lists and then
    # create a dictionary with resulting filenames and a list with dictionary keys.
    # The structure of the dictionary is:
    # original_query, proteins, hmms, filtered_hmms
    # ------------------------------------------------------
    # First parse the query:
    query_list = []
    query_file_names = {}
    # For bacterial genomes
    if virus == False:
        if query_database != None:
            pass
        else:
            with open(query_input, 'r') as query_input_fh:
                for line in query_input_fh:
                    query_list.append(line.strip())
            for index, query in enumerate(query_list):
                query_name = str(Path(query).name)
                if extension != None:
                    query_name = query_name.replace(extension, "")
                if query_hmms != None:
                    query_protein_list = []
                    with open(query_proteins, 'r') as query_protein_fh:
                        for line in query_protein_fh:
                            query_protein_list.append(line.strip())
                    query_file_names[query_name] = [None, query_protein_list[index], query, query + '.filt']
                elif query_proteins != None:
                    query_file_names[query_name] = [None, query, query + '.hmm', query + '.hmm.filt']
                elif query_genomes != None:
                    query_file_names[query_name] = [query, query + '.faa', query + '.faa.hmm', query + '.faa.hmm.filt']
    # For viral genomes
    else:
        if query_database != None:
            pass
        else:
            with open(query_input, 'r') as query_input_fh:
                for line in query_input_fh:
                    query_list.append(line.strip())
            for index, query in enumerate(query_list):
                query_name = str(Path(query).name)
                if extension != None:
                    query_name = query_name.replace(extension, "")
                if query_proteins != None:
                    query_file_names[query_name] = [None, query]
                elif query_genomes != None:
                    query_file_names[query_name] = [query, query + '.faa']

    # Then parse the references:
    reference_list = []
    reference_file_names = {}
    if same_inputs == True:
        pass
    else:
        # For bacterial genomes
        if virus == False:
            if reference_database != None:
                pass
            else:
                with open(reference_input, 'r') as reference_input_fh:
                    for line in reference_input_fh:
                        reference_list.append(line.strip())
                for index, reference in enumerate(reference_list):
                    reference_name = str(Path(reference).name)
                    if extension != None:
                        reference_name = reference_name.replace(extension, "")
                    if reference_hmms != None:
                        reference_protein_list = []
                        with open(reference_proteins, 'r') as reference_protein_fh:
                            for line in reference_protein_fh:
                                reference_protein_list.append(line.strip())
                        reference_file_names[reference_name] = [None, reference_protein_list[index], reference, reference + '.filt']
                    elif reference_proteins != None:
                        reference_file_names[reference_name] = [None, reference, reference + '.hmm', reference + '.hmm.filt']
                    elif reference_genomes != None:
                        reference_file_names[reference_name] = [reference, reference + '.faa', reference + '.faa.hmm', reference + '.faa.hmm.filt']
        # For viral genomes
        else:
            if reference_database != None:
                pass
            else:
                with open(reference_input, 'r') as reference_input_fh:
                    for line in reference_input_fh:
                        reference_list.append(line.strip())
                for index, reference in enumerate(reference_list):
                    reference_name = str(Path(reference).name)
                    if extension != None:
                        reference_name = reference_name.replace(extension, "")
                    if reference_proteins != None:
                        reference_file_names[reference_name] = [None, reference]
                    elif reference_genomes != None:
                        reference_file_names[reference_name] = [reference, reference + '.faa']
    # ------------------------------------------------------
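
To make the bookkeeping above concrete, here is a worked example of one `query_file_names` entry for a hypothetical bacterial genome listed as `strain1.fna` with extension `.fna`:

# Worked example (hypothetical file name) of the structure documented above.
# Layout per entry: [original_query, proteins, hmms, filtered_hmms]
query = "strain1.fna"
query_file_names = {
    "strain1": [
        query,                    # original genome, as listed in the input file
        query + ".faa",           # proteins predicted by prodigal
        query + ".faa.hmm",       # hmmsearch output
        query + ".faa.hmm.filt",  # filtered hmmsearch output
    ]
}
print(query_file_names["strain1"][3])  # -> strain1.fna.faa.hmm.filt
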
    # Pre-index and store databases
    # ------------------------------------------------------
    # Pre-index queries
    if query_kmer_dict == None:
        print("Processing queries...")
        # If using bacterial genomes
        if virus == False:
            if query_hmms != None:
                query_hmm_results = query_list
            elif query_proteins != None:
                query_protein_files = query_list
                print("Searching against HMM models...")
                try:
                    pool = multiprocessing.Pool(threads)
                    query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
                finally:
                    pool.close()
                    pool.join()
            elif query_genomes != None:
                print("Predicting proteins...")
                # Predict query proteins
                try:
                    pool = multiprocessing.Pool(threads)
                    query_protein_files = pool.map(run_prodigal, query_list)
                finally:
                    pool.close()
                    pool.join()
                print("Done!")
                print("Searching against HMM models...")
                # Run hmmsearch against the predicted proteins
                try:
                    pool = multiprocessing.Pool(threads)
                    query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
                finally:
                    pool.close()
                    pool.join()
                print("Done!")
            print("Filtering query hmmsearch results...")
            # Filter query HMM search results
            try:
                pool = multiprocessing.Pool(threads)
                pool.map(partial(hmm_filter, keep=keep), query_hmm_results)
            finally:
                pool.close()
                pool.join()
            print("Extracting kmers from query proteins...")
            # Finding kmers for all queries
            query_information = []
            for name, values in query_file_names.items():
                query_information.append((name, values[1], values[3]))
            try:
                pool = multiprocessing.Pool(threads)
                kmer_results = pool.map(kmer_extract, query_information)
            finally:
                pool.close()
                pool.join()
            query_kmer_dict = merge_dicts(kmer_results)
            del kmer_results
        # If using viral genomes
        else:
            if query_genomes != None:
                print("Predicting proteins...")
                # Predict query proteins
                try:
                    pool = multiprocessing.Pool(threads)
                    query_protein_files = pool.map(run_prodigal_virus, query_list)
                finally:
                    pool.close()
                    pool.join()
                print("Done!")
            elif query_proteins != None:
                query_protein_files = query_list
            print("Extracting kmers from query proteins...")
            query_information = []
            for name, values in query_file_names.items():
                query_information.append((name, values[1], 4))
            try:
                pool = multiprocessing.Pool(threads)
                kmer_results = pool.map(read_viral_kmers_from_file, query_information)
            finally:
                pool.close()
                pool.join()
            query_kmer_dict = merge_dicts(kmer_results)
            del kmer_results
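
The `partial(hmm_filter, keep=keep)` call above uses the standard trick for mapping a multi-argument function over a process pool: pin the extra arguments with functools.partial so map only has to supply one item at a time. A self-contained sketch of the pattern, with an invented worker function:

from functools import partial
from multiprocessing import Pool

def scale(value, factor):
    return value * factor

if __name__ == "__main__":
    with Pool(2) as pool:
        # partial pins factor=10, so map only supplies `value`.
        print(pool.map(partial(scale, factor=10), [1, 2, 3]))  # [10, 20, 30]
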
    # Pre-index references (if different from queries)
    if same_inputs == False and reference_kmer_dict == None:
        print("Processing references...")
        # If using bacterial genomes
        if virus == False:
            if reference_hmms != None:
                reference_hmm_results = reference_list
            elif reference_proteins != None:
                reference_protein_files = reference_list
                print("Searching against HMM models...")
                try:
                    pool = multiprocessing.Pool(threads)
                    reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
                finally:
                    pool.close()
                    pool.join()
            elif reference_genomes != None:
                print("Predicting proteins...")
                # Predict reference proteins
                try:
                    pool = multiprocessing.Pool(threads)
                    reference_protein_files = pool.map(run_prodigal, reference_list)
                finally:
                    pool.close()
                    pool.join()
                print("Done!")
                print("Searching against HMM models...")
                # Run hmmsearch against the predicted proteins
                try:
                    pool = multiprocessing.Pool(threads)
                    reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
                finally:
                    pool.close()
                    pool.join()
                print("Done!")
            print("Filtering reference hmmsearch results...")
            # Filter reference HMM search results
            try:
                pool = multiprocessing.Pool(threads)
                pool.map(partial(hmm_filter, keep=keep), reference_hmm_results)
            finally:
                pool.close()
                pool.join()
            print("Extracting kmers from reference proteins...")
            # Finding kmers for all references
            reference_information = []
            for name, values in reference_file_names.items():
                reference_information.append((name, values[1], values[3]))
            try:
                pool = multiprocessing.Pool(threads)
                kmer_results = pool.map(kmer_extract, reference_information)
            finally:
                pool.close()
                pool.join()
            reference_kmer_dict = merge_dicts(kmer_results)
            del kmer_results
        # If using viral genomes
        else:
            if reference_genomes != None:
                print("Predicting proteins...")
                # Predict reference proteins
                try:
                    pool = multiprocessing.Pool(threads)
                    reference_protein_files = pool.map(run_prodigal_virus, reference_list)
                finally:
                    pool.close()
                    pool.join()
                print("Done!")
            elif reference_proteins != None:
                reference_protein_files = reference_list
            print("Extracting kmers from reference proteins...")
            reference_information = []
            for name, values in reference_file_names.items():
                reference_information.append((name, values[1], 4))
            try:
                pool = multiprocessing.Pool(threads)
                kmer_results = pool.map(read_viral_kmers_from_file, reference_information)
            finally:
                pool.close()
                pool.join()
            reference_kmer_dict = merge_dicts(kmer_results)
            del kmer_results
    # ------------------------------------------------------
    # Create our database(s) and compress it (them)
    # ------------------------------------------------------
    if same_inputs == True and query_database == None:
        print("Saving pre-indexed database...")
        query_database_name = query_input + '.db.gz'
        with gzip.open(query_database_name, 'wb') as database_handle:
            pickle.dump(query_kmer_dict, database_handle, protocol=4)
    if same_inputs == False and query_database == None and reference_database == None:
        print("Saving pre-indexed databases...")
        query_database_name = query_input + '.db.gz'
        reference_database_name = reference_input + '.db.gz'
        with gzip.open(query_database_name, 'wb') as database_handle:
            pickle.dump(query_kmer_dict, database_handle, protocol=4)
        with gzip.open(reference_database_name, 'wb') as database_handle:
            pickle.dump(reference_kmer_dict, database_handle, protocol=4)
    elif same_inputs == False and query_database == None:
        print("Saving pre-indexed query database...")
        query_database_name = query_input + '.db.gz'
        with gzip.open(query_database_name, 'wb') as database_handle:
            pickle.dump(query_kmer_dict, database_handle, protocol=4)
    elif same_inputs == False and reference_database == None:
        print("Saving pre-indexed reference database...")
        reference_database_name = reference_input + '.db.gz'
        with gzip.open(reference_database_name, 'wb') as database_handle:
            pickle.dump(reference_kmer_dict, database_handle, protocol=4)
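
The `.db.gz` files written above are gzip-compressed pickles, so reading one back simply mirrors the loading code near the top of this section. A sketch with a hypothetical file name:

import gzip
import pickle

# Load a pre-indexed database produced by the pickle.dump calls above.
with gzip.open("queries.txt.db.gz", "rb") as database_handle:  # hypothetical name
    kmer_dict = pickle.load(database_handle)
print(type(kmer_dict))  # should be a dict keyed by genome name
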
    # Calculate Jaccard distances
    # ------------------------------------------------------
    if index_db == True:
        print("Finished pre-indexing databases.")
        print("Next time you can run the program using only these files with --qd and/or --rd.")
    else:
        print("Calculating shared kmer fraction...")
        if virus == False:
            if same_inputs == True:
                query_id_list = query_kmer_dict.keys()
                fixed_dict, smart_args = numpyize_kmers(query_kmer_dict)
                # single_dictionary_initializer(fixed_dict)
                try:
                    pool = multiprocessing.Pool(threads, initializer=single_dictionary_initializer, initargs=(fixed_dict,))
                    Fraction_Results = pool.map(single_kaai_parser_all_v_all, smart_args)
                finally:
                    pool.close()
                    pool.join()
            else:
                query_id_list = query_kmer_dict.keys()
                try:
                    pool = multiprocessing.Pool(threads, initializer=two_dictionary_initializer, initargs=(query_kmer_dict, reference_kmer_dict))
                    Fraction_Results = pool.map(double_kaai_parser, query_id_list)
                finally:
                    pool.close()
                    pool.join()
        else:
            if same_inputs == True:
                query_id_list = query_kmer_dict.keys()
                try:
                    pool = multiprocessing.Pool(threads, initializer=single_dictionary_initializer, initargs=(query_kmer_dict,))
                    Fraction_Results = pool.map(single_virus_kaai_parser, query_id_list)
                finally:
                    pool.close()
                    pool.join()
            else:
                query_id_list = query_kmer_dict.keys()
                try:
                    pool = multiprocessing.Pool(threads, initializer=two_dictionary_initializer, initargs=(query_kmer_dict, reference_kmer_dict))
                    Fraction_Results = pool.map(double_viral_kaai_parser, query_id_list)
                finally:
                    pool.close()
                    pool.join()
        # ------------------------------------------------------
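        # Aside (illustration only, not part of the script): the "shared kmer
        # fraction" computed by the parser workers above is a Jaccard-style
        # overlap between kmer sets. The exact per-gene bookkeeping lives in
        # the *_kaai_parser functions defined earlier in this file, and they
        # may normalize differently per single-copy gene, but the core
        # quantity is, in essence:
        #
        #     def jaccard_index(kmers_a, kmers_b):
        #         a, b = set(kmers_a), set(kmers_b)
        #         return len(a & b) / len(a | b) if (a | b) else 0.0
        #
        # e.g. jaccard_index(["AAAA", "AAAC"], ["AAAA", "AAAG"]) == 1/3.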
        # Merge results into a single output
        # ------------------------------------------------------
        print("Merging results...")
        with open(output, 'w') as outfile:
            for file in Fraction_Results:
                with open(file) as Temp:
                    shutil.copyfileobj(Temp, outfile)
                file.unlink()  # each worker returns a Path to its temporary file
        print("kAAI finished correctly on {}".format(datetime.datetime.now()))
    # ------------------------------------------------------
    # If comparing viral genomes


if __name__ == "__main__":
    main()