miga-base 0.7.26.0 → 1.0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +906 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +165 -0
- data/utils/enveomics/Manifest/examples.json +154 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
- data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
- data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +55 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +419 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +100 -0
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
- data/utils/enveomics/enveomics.R/R/utils.R +80 -0
- data/utils/enveomics/enveomics.R/README.md +81 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +304 -3
@@ -0,0 +1,1336 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
"""
|
4
|
+
########################################################################
|
5
|
+
# Author: Carlos Ruiz
|
6
|
+
# Institution: Georgia Institute of Technology
|
7
|
+
# Version: 1.0
|
8
|
+
# Date: Dec 10, 2020
|
9
|
+
|
10
|
+
# Description: Calculates the average amino acid identity using k-mers
|
11
|
+
from single copy genes. It is a faster version of the regular AAI (Blast
|
12
|
+
or Diamond) and the hAAI implemented in MiGA.
|
13
|
+
########################################################################
|
14
|
+
"""
|
15
|
+
|
16
|
+
################################################################################
|
17
|
+
"""---0.0 Import Modules---"""
|
18
|
+
import subprocess, argparse, multiprocessing, datetime, shutil
|
19
|
+
import textwrap, pickle, gzip
|
20
|
+
import numpy as np
|
21
|
+
from tempfile import TemporaryDirectory
|
22
|
+
from random import randint
|
23
|
+
from pathlib import Path
|
24
|
+
from sys import argv
|
25
|
+
from sys import exit
|
26
|
+
from functools import partial
|
27
|
+
import time
|
28
|
+
|
29
|
+
|
30
|
+
################################################################################
|
31
|
+
"""---1.0 Define Functions---"""
|
32
|
+
# --- Run prodigal ---
|
33
|
+
# ------------------------------------------------------
|
34
|
+
def run_prodigal(input_file):
    """
    Runs prodigal twice, compares the two predictions and stores the
    selected one as '<input_file>.faa'

    The first run does not pass '-g' (its output is named '.faa.11'),
    the second run passes '-g 4' (output named '.faa.4'). The table 4
    prediction is kept only when its total protein length is at least
    1.1 times the total protein length of the table 11 prediction.
    Stop symbols ('*') are removed from the stored sequences.

    Arguments:
        input_file -- Path to genome FastA file

    Returns:
        output -- Path to amino acid fasta result
    """
    file_path = Path(input_file)
    filename = file_path.name
    folder = file_path.parent
    protein_output = folder / (filename + '.faa')
    output_11 = folder / (filename + '.faa.11')
    output_4 = folder / (filename + '.faa.4')
    temp_output = folder / (filename + '.temp')

    # Predict proteins with translation tables 11 and 4
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_11),
                     "-p", "meta", "-q", "-o", str(temp_output)])
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_4),
                     "-p", "meta", "-g", "4", "-q", "-o", str(temp_output)])

    # Compare translation tables by total predicted protein length
    total_length = {}
    for table, faa_path in ((4, output_4), (11, output_11)):
        with open(faa_path, 'r') as faa_file:
            total_length[table] = sum(len(line.strip()) for line in faa_file
                                      if not line.startswith(">"))

    if total_length[11] == 0:
        # Avoid dividing by zero when table 11 predicted nothing: table 4
        # wins only if it predicted at least something.
        use_table_4 = total_length[4] > 0
    else:
        use_table_4 = (total_length[4] / total_length[11]) >= 1.1
    selected = output_4 if use_table_4 else output_11

    # Store the selected prediction without stop '*' codons
    with open(selected, 'r') as raw_protein, open(protein_output, 'w') as final_protein:
        for line in raw_protein:
            if not line.startswith(">"):
                line = line.replace('*', '')
            final_protein.write(line)

    # Remove intermediate files
    output_4.unlink()
    output_11.unlink()
    temp_output.unlink()

    return str(protein_output)
|
97
|
+
# ------------------------------------------------------
|
98
|
+
|
99
|
+
# --- Run prodigal for viruses ---
|
100
|
+
# ------------------------------------------------------
|
101
|
+
def run_prodigal_virus(input_file):
    """
    Runs prodigal once on a genome and stores the predicted proteins,
    with stop symbols ('*') removed, as '<input_file>.faa'

    Unlike run_prodigal, no '-g' option is passed and no comparison
    between translation tables is made.

    Arguments:
        input_file -- Path to genome FastA file

    Returns:
        output -- Path to amino acid fasta result
    """
    genome = Path(input_file)
    faa_path = genome.parent / (genome.name + '.faa')
    scratch = genome.parent / (genome.name + '.temp')

    # Predict proteins; prodigal's main output goes to the scratch file
    prodigal_cmd = ["prodigal", "-i", str(genome), "-a", str(faa_path),
                    "-p", "meta", "-q", "-o", str(scratch)]
    subprocess.call(prodigal_cmd)

    # The main prodigal output is not needed, only the protein file
    scratch.unlink()

    # Rewrite the protein file without '*' in the sequence lines, using
    # the scratch path as the intermediate copy
    with open(faa_path, 'r') as raw, open(scratch, 'w') as cleaned:
        cleaned.writelines(
            entry if entry.startswith(">") else entry.replace('*', '')
            for entry in raw
        )
    shutil.copy(str(scratch), str(faa_path))
    scratch.unlink()

    return str(faa_path)
|
135
|
+
# ------------------------------------------------------
|
136
|
+
|
137
|
+
# --- Run hmmsearch ---
|
138
|
+
# ------------------------------------------------------
|
139
|
+
def run_hmmsearch(input_file):
    """
    Runs hmmsearch with the complete SCG model collection against a
    protein file and stores the tabular hits as '<input_file>.hmm'

    The model file is located relative to this script
    ('../00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm').

    Arguments:
        input_file -- Path to protein FastA file

    Returns:
        output -- Path to hmmsearch hits table
    """
    proteins = Path(input_file)
    hits_table = proteins.parent / (proteins.name + '.hmm')
    scratch = proteins.parent / (proteins.name + '.temp')
    model_db = Path(__file__).parent / "../00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"

    command = ["hmmsearch", "--tblout", str(hits_table), "-o", str(scratch),
               "--cut_tc", "--cpu", "1", str(model_db), str(proteins)]
    subprocess.call(command)

    # Only the '--tblout' table is kept; the '-o' output is discarded
    scratch.unlink()
    return str(hits_table)
|
162
|
+
# ------------------------------------------------------
|
163
|
+
|
164
|
+
# --- Filter HMM results for best matches ---
|
165
|
+
# ------------------------------------------------------
|
166
|
+
def hmm_filter(scg_hmm_file, keep):
    """
    Filters HMM results for best hits per protein

    For every protein (first column) only the line with the highest
    score (ninth column) is written to '<scg_hmm_file>.filt'. Ties are
    broken at random. Comment lines ('#') and blank lines are skipped.

    Arguments:
        SCG_HMM_file {file path} -- Path to HMM results file
        keep {bool} -- Keep HMM files (not used by this function)

    Returns:
        outfile -- Path to filtered files
    """
    hmm_path = Path(scg_hmm_file)
    outfile = hmm_path.parent / (hmm_path.name + '.filt')

    # protein name -> [best score, original line]
    hmm_hit_dict = {}
    with open(scg_hmm_file, 'r') as hit_file:
        for line in hit_file:
            if line.startswith("#"):
                continue
            hit = line.split()
            if not hit:
                continue
            protein_name = hit[0]
            score = float(hit[8])
            best = hmm_hit_dict.get(protein_name)
            if best is None or score > best[0]:
                hmm_hit_dict[protein_name] = [score, line]
            elif score == best[0] and randint(0, 1) > 0:
                # Coin flip on ties; random.randint needs both bounds
                hmm_hit_dict[protein_name] = [score, line]

    with open(outfile, 'w') as output:
        for _, best_line in hmm_hit_dict.values():
            output.write(best_line)
    return str(outfile)
|
204
|
+
# ------------------------------------------------------
|
205
|
+
|
206
|
+
# --- Find Kmers from HMM results ---
|
207
|
+
# ------------------------------------------------------
|
208
|
+
def kmer_extract(input_files):
    """
    Extract kmers from protein files that have hits
    in the HMM searches.

    For every model accession (fourth column of the HMM table) the
    protein with the highest score (ninth column, compared as a number)
    is selected, and the 4-mers of that protein are stored under the
    accession.

    Arguments:
        input_files -- Sequence with, in order: the name used as key in
                       the result, the path to the protein FastA file,
                       and the path to the filtered HMM results.

    Returns:
        [genome_kmers] -- Dictionary of kmers per gene.
    """
    final_filename = input_files[0]
    protein_file = input_files[1]
    scg_hmm_file = input_files[2]

    # model accession -> [protein name, best score]
    positive_matches = {}
    with open(scg_hmm_file, 'r') as hmm_input:
        for line in hmm_input:
            fields = line.split()
            if not fields or line.startswith("#"):
                continue
            protein_name = fields[0]
            model_name = fields[3]
            # Scores must be compared numerically; as strings "9.5"
            # would sort above "100.2"
            score = float(fields[8])
            best = positive_matches.get(model_name)
            if best is None or score > best[1]:
                positive_matches[model_name] = [protein_name, score]

    positive_proteins = [match[0] for match in positive_matches.values()]
    protein_kmers = read_kmers_from_file(protein_file, positive_proteins, 4)

    # Re-key by model accession. Building a new dictionary (instead of
    # popping from the old one) also works when the same protein is the
    # best hit of more than one model.
    scg_kmers = {accession: protein_kmers[match[0]]
                 for accession, match in positive_matches.items()}
    genome_kmers = {final_filename: scg_kmers}
    return genome_kmers
|
244
|
+
# ------------------------------------------------------
|
245
|
+
|
246
|
+
# --- Extract kmers from protein sequences ---
|
247
|
+
# ------------------------------------------------------
|
248
|
+
def read_kmers_from_file(filename, positive_hits, ksize):
    """
    Reads a protein FastA file and builds the kmers of selected proteins

    A protein is selected when the first whitespace-delimited word of
    its header (without the leading '>') is in positive_hits.

    Arguments:
        filename -- Path to protein FastA file
        positive_hits -- Names of the proteins to process
        ksize -- Kmer size passed to build_kmers

    Returns:
        scg_kmers -- Dictionary mapping each selected protein name to
                     the value returned by build_kmers for its sequence
    """
    # Set membership keeps the per-header lookup constant time
    wanted = set(positive_hits)
    scg_kmers = {}
    current_name = None  # None while the current record is not selected
    sequence_parts = []
    with open(filename) as fasta_in:
        for line in fasta_in:
            if line.startswith(">"):
                # A new header closes the previous selected record
                if current_name is not None:
                    scg_kmers[current_name] = build_kmers("".join(sequence_parts), ksize)
                # Drop only the leading '>' so the rest of the name is intact
                header_words = line[1:].split()
                protein_name = header_words[0] if header_words else ""
                if protein_name in wanted:
                    current_name = protein_name
                    sequence_parts = []
                else:
                    current_name = None
            elif current_name is not None:
                sequence_parts.append(line.strip())
    # The last record has no following header to close it
    if current_name is not None:
        scg_kmers[current_name] = build_kmers("".join(sequence_parts), ksize)
    return scg_kmers
|
274
|
+
# ------------------------------------------------------
|
275
|
+
|
276
|
+
# --- Extract kmers from viral protein sequences ---
|
277
|
+
# ------------------------------------------------------
|
278
|
+
def read_viral_kmers_from_file(input_information):
    """
    Collects the kmers of every protein in a FastA file

    Arguments:
        input_information -- Sequence with, in order: the name used as
                             key in the result, the path to the protein
                             FastA file, and the kmer size.

    Returns:
        genome_kmers -- Dictionary mapping the given name to a list with
                        the number of FastA headers found and a
                        comma-joined string of the distinct kmers.
    """
    result_key, fasta_path, k = input_information[0], input_information[1], input_information[2]

    all_kmers = set()
    header_count = 0
    pieces = None  # stays None until the first header is seen
    with open(fasta_path) as handle:
        for row in handle:
            if not row.startswith(">"):
                # Text before the first header never contributes kmers
                if pieces is not None:
                    pieces.append(row.strip())
                continue
            header_count += 1
            if pieces is not None:
                all_kmers.update(build_viral_kmers("".join(pieces), k))
            pieces = []
    # Flush the final record, which no header follows
    if pieces is not None:
        all_kmers.update(build_viral_kmers("".join(pieces), k))

    return {result_key: [header_count, ','.join(list(all_kmers))]}
|
305
|
+
# ------------------------------------------------------
|
306
|
+
|
307
|
+
# --- Build Kmers ---
|
308
|
+
# ------------------------------------------------------
|
309
|
+
def build_kmers(sequence, ksize):
    """
    Break a sequence into its distinct overlapping kmers

    Arguments:
        sequence {str} -- Sequence to decompose
        ksize {int} -- Length of each kmer

    Returns:
        [str] -- Distinct kmers joined by commas, in no particular order
                 (empty string if the sequence is shorter than ksize)
    """
    last_start = len(sequence) - ksize
    distinct_kmers = {sequence[start:start + ksize] for start in range(last_start + 1)}
    return ','.join(distinct_kmers)
|
318
|
+
# ------------------------------------------------------
|
319
|
+
|
320
|
+
# --- Build Viral Kmers ---
|
321
|
+
# ------------------------------------------------------
|
322
|
+
def build_viral_kmers(sequence, ksize):
    """
    Break a sequence into the set of its overlapping kmers

    Arguments:
        sequence {str} -- Sequence to decompose
        ksize {int} -- Length of each kmer

    Returns:
        [set] -- Distinct kmers of the sequence (empty if it is shorter than ksize)
    """
    distinct_kmers = set()
    for start in range(len(sequence) - ksize + 1):
        distinct_kmers.add(sequence[start:start + ksize])
    return distinct_kmers
|
331
|
+
# ------------------------------------------------------
|
332
|
+
|
333
|
+
# --- Create global dictionary with unique kmers and indices for each one ---
|
334
|
+
# ------------------------------------------------------
|
335
|
+
def global_unique_kmers(kmer_dictionaries):
    """
    Extract every kmer in the whole dataset
    Create global dictionary with unique kmers and indices for each one

    The result is stored in the module-level global_kmer_index_dictionary,
    replacing any previous content, so the same index is used regardless of
    query == reference or not.

    Arguments:
        kmer_dictionaries {iterable} -- Dictionaries of the form
                                        {genome: {marker protein: comma-separated kmers}}

    Returns:
        [global_kmer_index_dictionary] -- Dictionary with a unique index per kmer,
                                          numbered from 0 in order of first appearance
    """
    print("Indexing unique kmers")
    global global_kmer_index_dictionary
    global_kmer_index_dictionary = {}
    for kmer_dict in kmer_dictionaries:
        for marker_proteins in kmer_dict.values():
            for kmer_string in marker_proteins.values():
                for kmer in kmer_string.split(','):
                    # Indices are handed out consecutively, so the next free one
                    # is always the current size; setdefault leaves known kmers
                    # untouched (no bare except needed for the lookup).
                    global_kmer_index_dictionary.setdefault(
                        kmer, len(global_kmer_index_dictionary))
    return global_kmer_index_dictionary
|
361
|
+
# ------------------------------------------------------
|
362
|
+
|
363
|
+
# --- Create global viral dictionary with unique kmers and indices for each one ---
|
364
|
+
# ------------------------------------------------------
|
365
|
+
def global_unique_viral_kmers(kmer_dictionaries):
    """
    Extract every kmer in the whole dataset
    Create global dictionary with unique kmers and indices for each one

    The result is stored in the module-level global_kmer_index_dictionary,
    replacing any previous content, so the same index is used regardless of
    query == reference or not.

    Arguments:
        kmer_dictionaries {iterable} -- Dictionaries of the form
                                        {genome: [protein count, comma-separated kmers]}

    Returns:
        [global_kmer_index_dictionary] -- Dictionary with a unique index per kmer,
                                          numbered from 0 in order of first appearance
    """
    print("Indexing unique kmers")
    global global_kmer_index_dictionary
    global_kmer_index_dictionary = {}
    for kmer_dict in kmer_dictionaries:
        for genome_entry in kmer_dict.values():
            # Position 1 of each entry holds the comma-separated kmers
            for kmer in genome_entry[1].split(','):
                # Indices are handed out consecutively, so the next free one is
                # always the current size; setdefault leaves known kmers
                # untouched (no bare except needed for the lookup).
                global_kmer_index_dictionary.setdefault(
                    kmer, len(global_kmer_index_dictionary))
    return global_kmer_index_dictionary
|
389
|
+
# ------------------------------------------------------
|
390
|
+
|
391
|
+
# --- Convert kmers to indices ---
|
392
|
+
# ------------------------------------------------------
|
393
|
+
def convert_kmers_to_indices(kmer_dict):
    """
    Replace every comma-separated kmer string by an array of kmer indices

    Indices are looked up in the module-level global_kmer_index_dictionary.
    The dictionary is modified in place.

    Arguments:
        kmer_dict {dict} -- {genome: {marker protein: comma-separated kmers}}

    Returns:
        [kmer_dict] -- The same dictionary, now holding for each marker protein
                       a sorted int32 array of unique kmer indices
    """
    print("Converting kmers to indices")
    for marker_proteins in kmer_dict.values():
        for marker_name, kmer_string in marker_proteins.items():
            indices = [global_kmer_index_dictionary[kmer] for kmer in kmer_string.split(',')]
            # np.unique already returns its values sorted
            marker_proteins[marker_name] = np.unique(np.array(indices, dtype=np.int32))

    return kmer_dict
|
404
|
+
# ------------------------------------------------------
|
405
|
+
|
406
|
+
# --- Convert viral kmers to indices ---
|
407
|
+
# ------------------------------------------------------
|
408
|
+
def convert_viral_kmers_to_indices(kmer_dict):
    """
    Replace the comma-separated kmer string of every genome by an array of kmer indices

    Indices are looked up in the module-level global_kmer_index_dictionary.
    The entries of the dictionary are modified in place.

    Arguments:
        kmer_dict {dict} -- {genome: [protein count, comma-separated kmers]}

    Returns:
        [kmer_dict] -- The same dictionary, with position 1 of each entry replaced
                       by a sorted int32 array of unique kmer indices
    """
    print("Converting kmers to indices")
    for genome_entry in kmer_dict.values():
        indices = [global_kmer_index_dictionary[kmer] for kmer in genome_entry[1].split(',')]
        # np.unique already returns its values sorted
        genome_entry[1] = np.unique(np.array(indices, dtype=np.int32))

    return kmer_dict
|
418
|
+
# ------------------------------------------------------
|
419
|
+
|
420
|
+
# --- Transform kmer dictionaries to index dictionaries ---
|
421
|
+
# ------------------------------------------------------
|
422
|
+
def transform_kmer_dicts_to_arrays(kmer_dict, temporal_working_directory, single_dataset):
    """
    Convert the kmers of a dictionary to indices and build one argument tuple per genome

    Arguments:
        kmer_dict {dict} -- Dictionary with kmers for each marker protein per genome
        temporal_working_directory -- Value placed first in every argument tuple
        single_dataset {bool} -- If True, each tuple also carries the position of
                                 the genome in the dictionary (the skip index)

    Returns:
        [kmer_dict, smartargs] -- The converted dictionary and the list of argument tuples
    """
    kmer_dict = convert_kmers_to_indices(kmer_dict)
    # Get skip indices
    with_skip_index = single_dataset == True
    smartargs = [
        (temporal_working_directory, genome_id, position) if with_skip_index
        else (temporal_working_directory, genome_id)
        for position, genome_id in enumerate(kmer_dict.keys())
    ]

    return kmer_dict, smartargs
|
434
|
+
# ------------------------------------------------------
|
435
|
+
|
436
|
+
# --- Transform viral kmer dictionaries to index dictionaries ---
|
437
|
+
# ------------------------------------------------------
|
438
|
+
def transform_viral_kmer_dicts_to_arrays(kmer_dict, temporal_working_directory, single_dataset):
    """
    Convert the viral kmers of a dictionary to indices and build one argument tuple per genome

    Arguments:
        kmer_dict {dict} -- Dictionary with the protein count and kmers of each genome
        temporal_working_directory -- Value placed first in every argument tuple
        single_dataset {bool} -- If True, each tuple also carries the position of
                                 the genome in the dictionary (the skip index)

    Returns:
        [kmer_dict, smartargs] -- The converted dictionary and the list of argument tuples
    """
    kmer_dict = convert_viral_kmers_to_indices(kmer_dict)
    # Get skip indices
    with_skip_index = single_dataset == True
    smartargs = [
        (temporal_working_directory, genome_id, position) if with_skip_index
        else (temporal_working_directory, genome_id)
        for position, genome_id in enumerate(kmer_dict.keys())
    ]

    return kmer_dict, smartargs
|
450
|
+
# ------------------------------------------------------
|
451
|
+
|
452
|
+
# --- Parse kAAI when query == reference ---
|
453
|
+
# ------------------------------------------------------
|
454
|
+
def single_kaai_parser(arguments):
    """
    Calculates the Jaccard similarities using single protein markers shared by two genomes

    The query is compared against the genomes of the module-level
    query_kmer_dictionary (query == reference), and one tab-separated row per
    comparison is written: query, target, mean Jaccard, standard deviation,
    number of shared markers, marker count of the smaller genome and AAI estimate.
    Comparisons without shared markers get "NA" in the last five columns.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (object whose .name is
                             the folder path), the query id and the index of said
                             query_id (genomes before that position are skipped)

    Returns:
        [Path to output] -- Path to output file
    """
    temporal_folder = arguments[0]
    query_id = arguments[1]
    skip_first_n = arguments[2]

    temporal_output = Path(str(temporal_folder.name)) / (Path(query_id).name + '.faai.temp')
    row_template = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
    not_available = ("NA", "NA", "NA", "NA", "NA")

    query_markers = query_kmer_dictionary[query_id]
    query_scg_list = np.array(list(query_markers.keys()))
    with open(temporal_output, 'w') as out_file:
        for target_genome in list(query_kmer_dictionary.keys())[skip_first_n:]:
            target_markers = query_kmer_dictionary[target_genome]
            # Get number and list of SCG detected in the target
            target_scg_list = np.array(list(target_markers.keys()))
            shorter_genome = min(len(query_scg_list), len(target_scg_list))
            # If self, 1.0 similarity.
            if query_id == target_genome:
                out_file.write(row_template.format(query_id, target_genome,
                    1.0, 0.0, len(query_scg_list), len(target_scg_list), 100))
                continue

            # Jaccard index of the kmer sets of every SCG shared by both genomes
            jaccard_similarities = []
            for scg in np.intersect1d(query_scg_list, target_scg_list):
                query_kmers = query_markers.get(scg)
                target_kmers = target_markers.get(scg)
                union = len(np.union1d(query_kmers, target_kmers))
                # |A n B| = |A| + |B| - |A u B| (kmer arrays hold unique indices)
                intersection = len(query_kmers) + len(target_kmers) - union
                jaccard_similarities.append(intersection / union)

            summary = not_available
            if len(jaccard_similarities) > 0:
                # np.float64 rather than the np.float_ alias, which NumPy 2.0 removed
                jaccard_similarities = np.array(jaccard_similarities, dtype=np.float64)
                try:
                    mean = np.mean(jaccard_similarities)
                    deviation = np.std(jaccard_similarities)
                    # The estimate is only computed between the two saturation limits
                    if mean >= 0.9:
                        aai_est = ">90%"
                    elif mean == 0:
                        aai_est = "<30%"
                    else:
                        aai_est = kaai_to_aai(mean)
                    summary = (round(mean, 4), round(deviation, 4),
                               len(jaccard_similarities), shorter_genome, aai_est)
                except Exception:
                    # Best effort: a failed estimate is reported as NA for this pair
                    summary = not_available
            out_file.write(row_template.format(query_id, target_genome, *summary))
    return temporal_output
|
519
|
+
# ------------------------------------------------------
|
520
|
+
|
521
|
+
# --- Parse viral kAAI when query == reference ---
|
522
|
+
# ------------------------------------------------------
|
523
|
+
def single_virus_kaai_parser(arguments):
    """
    Calculates the Jaccard index on kmers from viral proteins

    The query is compared against the genomes of the module-level
    query_kmer_dictionary (query == reference), and one tab-separated row per
    comparison is written: query, target, Jaccard index, protein count of the
    query and protein count of the target.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (object whose .name is
                             the folder path), the query id and the index of said
                             query_id (genomes before that position are skipped)

    Returns:
        [Path to output] -- Path to output file
    """
    temporal_folder, query_id, skip_first_n = arguments[0], arguments[1], arguments[2]

    temporal_output = Path(str(temporal_folder.name)) / (Path(query_id).name + '.faai.temp')
    # Query entry: [number of proteins, kmers]
    query_entry = query_kmer_dictionary[query_id]
    proteins_query = query_entry[0]
    kmers_query = query_entry[1]

    with open(temporal_output, 'w') as out_file:
        remaining_genomes = list(query_kmer_dictionary.keys())[skip_first_n:]
        for target_genome in remaining_genomes:
            if query_id != target_genome:
                target_entry = query_kmer_dictionary[target_genome]
                proteins_reference = target_entry[0]
                kmers_reference = target_entry[1]
                # |A n B| = |A| + |B| - |A u B|
                union = len(np.union1d(kmers_query, kmers_reference))
                intersection = len(kmers_query) + len(kmers_reference) - union
                jaccard_index = intersection/union
            else:
                # A genome against itself: similarity is 1.0 by definition
                proteins_reference = proteins_query
                jaccard_index = 1.0
            out_file.write("{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
                           jaccard_index, proteins_query, proteins_reference))
    return temporal_output
|
564
|
+
# ------------------------------------------------------
|
565
|
+
|
566
|
+
# --- Parse kAAI when query != reference ---
|
567
|
+
# ------------------------------------------------------
|
568
|
+
def double_kaai_parser(arguments):
    """
    Calculates the Jaccard similarities using single protein markers shared by two genomes

    The query (taken from the module-level query_kmer_dictionary) is compared
    against every genome of the module-level reference_kmer_dictionary
    (query != reference), and one tab-separated row per comparison is written:
    query, target, mean Jaccard, standard deviation, number of shared markers,
    marker count of the smaller genome and AAI estimate.
    Comparisons without shared markers get "NA" in the last five columns.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (object whose .name is
                             the folder path) and the query id

    Returns:
        [Path to output] -- Path to output file
    """
    temporal_folder = arguments[0]
    query_id = arguments[1]

    temporal_output = Path(str(temporal_folder.name)) / (Path(query_id).name + '.faai.temp')
    row_template = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
    not_available = ("NA", "NA", "NA", "NA", "NA")

    query_markers = query_kmer_dictionary[query_id]
    query_scg_list = np.array(list(query_markers.keys()))

    with open(temporal_output, 'w') as out_file:
        for target_genome in list(reference_kmer_dictionary.keys()):
            target_markers = reference_kmer_dictionary[target_genome]
            # Get number and list of SCG detected in reference
            target_scg_list = np.array(list(target_markers.keys()))
            shorter_genome = min(len(query_scg_list), len(target_scg_list))
            # If self, 1.0 similarity.
            if query_id == target_genome:
                out_file.write(row_template.format(query_id, target_genome,
                    1.0, 0.0, len(query_scg_list), len(target_scg_list), 100))
                continue

            # Jaccard index of the kmer sets of every SCG shared by both genomes
            jaccard_similarities = []
            for scg in np.intersect1d(query_scg_list, target_scg_list):
                query_kmers = query_markers.get(scg)
                reference_kmers = target_markers.get(scg)
                union = len(np.union1d(query_kmers, reference_kmers))
                # |A n B| = |A| + |B| - |A u B| (kmer arrays hold unique indices)
                intersection = len(query_kmers) + len(reference_kmers) - union
                jaccard_similarities.append(intersection / union)

            summary = not_available
            if len(jaccard_similarities) > 0:
                # np.float64 rather than the np.float_ alias, which NumPy 2.0 removed
                jaccard_similarities = np.array(jaccard_similarities, dtype=np.float64)
                try:
                    mean = np.mean(jaccard_similarities)
                    deviation = np.std(jaccard_similarities)
                    # The estimate is only computed between the two saturation limits
                    if mean >= 0.9:
                        aai_est = ">90%"
                    elif mean == 0:
                        aai_est = "<30%"
                    else:
                        aai_est = kaai_to_aai(mean)
                    summary = (round(mean, 4), round(deviation, 4),
                               len(jaccard_similarities), shorter_genome, aai_est)
                except Exception:
                    # Best effort: a failed estimate is reported as NA for this pair
                    summary = not_available
            out_file.write(row_template.format(query_id, target_genome, *summary))
    return temporal_output
|
632
|
+
# ------------------------------------------------------
|
633
|
+
|
634
|
+
# --- Parse viral kAAI when query != reference ---
|
635
|
+
# ------------------------------------------------------
|
636
|
+
def double_viral_kaai_parser(arguments):
    """
    Calculates the Jaccard index on kmers from viral proteins

    The query (taken from the module-level query_kmer_dictionary) is compared
    against every genome of the module-level reference_kmer_dictionary
    (query != reference), and one tab-separated row per comparison is written:
    query, target, Jaccard index, protein count of the query and protein count
    of the target.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (object whose .name is
                             the folder path) and the query id

    Returns:
        [Path to output] -- Path to output file
    """
    temporal_folder, query_id = arguments[0], arguments[1]

    temporal_output = Path(str(temporal_folder.name)) / (Path(query_id).name + '.faai.temp')
    # Query entry: [number of proteins, kmers]
    query_entry = query_kmer_dictionary[query_id]
    proteins_query = query_entry[0]
    kmers_query = query_entry[1]

    with open(temporal_output, 'w') as out_file:
        for target_genome in reference_kmer_dictionary.keys():
            if query_id != target_genome:
                target_entry = reference_kmer_dictionary[target_genome]
                proteins_reference = target_entry[0]
                kmers_reference = target_entry[1]
                # |A n B| = |A| + |B| - |A u B|
                union = len(np.union1d(kmers_query, kmers_reference))
                intersection = len(kmers_query) + len(kmers_reference) - union
                jaccard_index = intersection/union
            else:
                # Same id on both sides: similarity is 1.0 by definition
                proteins_reference = proteins_query
                jaccard_index = 1.0
            out_file.write("{}\t{}\t{}\t{}\t{}\n".format(query_id, target_genome,
                           jaccard_index, proteins_query, proteins_reference))
    return temporal_output
|
675
|
+
# ------------------------------------------------------
|
676
|
+
|
677
|
+
# --- Query == Reference initializer function ---
|
678
|
+
# ------------------------------------------------------
|
679
|
+
def single_dictionary_initializer(_dictionary):
    """
    Publish the kmer dictionary as the module-level query_kmer_dictionary,
    which is where the parser functions read it from (makes the dictionary
    available for multiprocessing)

    Arguments:
        _dictionary {dict} -- Kmer dictionary of the queries
    """
    global query_kmer_dictionary
    query_kmer_dictionary = _dictionary
|
685
|
+
# ------------------------------------------------------
|
686
|
+
|
687
|
+
# --- Query != Reference initializer function ---
|
688
|
+
# ------------------------------------------------------
|
689
|
+
def two_dictionary_initializer(_query_dictionary, _reference_dictionary):
    """
    Publish both kmer dictionaries as the module-level query_kmer_dictionary
    and reference_kmer_dictionary, which is where the parser functions read
    them from (makes the dictionaries available for multiprocessing)

    Arguments:
        _query_dictionary {dict} -- Kmer dictionary of the queries
        _reference_dictionary {dict} -- Kmer dictionary of the references
    """
    global query_kmer_dictionary, reference_kmer_dictionary
    query_kmer_dictionary, reference_kmer_dictionary = _query_dictionary, _reference_dictionary
|
697
|
+
# ------------------------------------------------------
|
698
|
+
|
699
|
+
# --- Merge kmer dictionaries ---
|
700
|
+
# ------------------------------------------------------
|
701
|
+
def merge_dicts(dictionaries):
    """
    Combine any number of dicts into one new dict (shallow copy).

    When a key appears in several dicts, the value of the latest dict wins.

    Arguments:
        dictionaries {iterable} -- Dictionaries to merge

    Returns:
        [dict] -- New dictionary with the key value pairs of all inputs
    """
    return {
        key: value
        for kmer_dictionary in dictionaries
        for key, value in kmer_dictionary.items()
    }
|
710
|
+
# ------------------------------------------------------
|
711
|
+
|
712
|
+
# --- Transform kAAI into estimated AAI ---
|
713
|
+
# ------------------------------------------------------
|
714
|
+
def kaai_to_aai(kaai):
    """
    Transform the kAAI into estimated AAI values

    Arguments:
        kaai {float} -- kAAI value; the callers in this file pass the mean
                        Jaccard index of a genome pair, only when it is
                        above 0 and below 0.9

    Returns:
        [aai_hat] -- Estimated AAI: the fitted expression below multiplied by 100
    """
    # Same expression as a single formula, split into named steps:
    # aai_hat = (-0.3087057 + 1.810741 * exp(-(-0.2607023 * ln(kaai)) ** (1/3.435))) * 100
    scaled_log = -0.2607023 * np.log(kaai)
    decay = np.exp(-(scaled_log ** (1 / 3.435)))
    return (-0.3087057 + 1.810741 * decay) * 100
|
718
|
+
# ------------------------------------------------------
|
719
|
+
|
720
|
+
|
721
|
+
################################################################################
|
722
|
+
"""---2.0 Main Function---"""
|
723
|
+
|
724
|
+
def main():
|
725
|
+
# Setup parser for arguments.
|
726
|
+
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
|
727
|
+
description='''This script calculates the average amino acid identity using k-mers\n'''
|
728
|
+
'''from single copy genes. It is a faster version of the regular AAI '''
|
729
|
+
'''(Blast or Diamond) and the hAAI implemented in MiGA.'''
|
730
|
+
'''Usage: ''' + argv[0] + ''' -p [Protein Files] -t [Threads] -o [Output]\n'''
|
731
|
+
'''Global mandatory parameters: -g [Genome Files] OR -p [Protein Files] OR -s [SCG HMM Results] -o [AAI Table Output]\n'''
|
732
|
+
'''Optional Database Parameters: See ''' + argv[0] + ' -h')
|
733
|
+
mandatory_options = parser.add_argument_group('Mandatory i/o options. You must select an option for the queries and one for the references.')
|
734
|
+
mandatory_options.add_argument('--qg', dest='query_genomes', action='store', required=False,
|
735
|
+
help='File with list of query genomes.')
|
736
|
+
mandatory_options.add_argument('--qp', dest='query_proteins', action='store', required=False,
|
737
|
+
help='File with list of query proteins.')
|
738
|
+
mandatory_options.add_argument('--qh', dest='query_hmms', action='store', required=False,
|
739
|
+
help=textwrap.dedent('''
|
740
|
+
File with list of pre-computed query hmmsearch results.
|
741
|
+
If you select this option you must also provide a file with
|
742
|
+
a list of protein files for the queries (with --qp).
|
743
|
+
'''))
|
744
|
+
mandatory_options.add_argument('--qd', dest='query_database', action='store', required=False,
|
745
|
+
help='File with list of pre-indexed query databases.')
|
746
|
+
mandatory_options.add_argument('--rg', dest='reference_genomes', action='store', required=False,
|
747
|
+
help='File with list of reference genomes.')
|
748
|
+
mandatory_options.add_argument('--rp', dest='reference_proteins', action='store', required=False,
|
749
|
+
help='File with list of reference proteins.')
|
750
|
+
mandatory_options.add_argument('--rh', dest='reference_hmms', action='store', required=False,
|
751
|
+
help=textwrap.dedent('''
|
752
|
+
File with list of pre-computed reference hmmsearch results.
|
753
|
+
If you select this option you must also provide a file with
|
754
|
+
a list of protein files for the references (with --qp).
|
755
|
+
'''))
|
756
|
+
mandatory_options.add_argument('--rd', dest='reference_database', action='store', required=False,
|
757
|
+
help='File with list of pre-indexed reference databases.')
|
758
|
+
mandatory_options.add_argument('-o', '--output', dest='output', action='store', required=False, help='Output file. By default kaai_comparisons.txt')
|
759
|
+
additional_input_options = parser.add_argument_group('Behavior modification options.')
|
760
|
+
additional_input_options.add_argument('-e', '--ext', dest='extension', action='store', required=False,
|
761
|
+
help='Extension to remove from original filename, e.g. ".fasta"')
|
762
|
+
additional_input_options.add_argument('-i', '--index', dest='index_db', action='store_true', required=False,
|
763
|
+
help='Only index and store databases, i.e., do not perform comparisons.')
|
764
|
+
additional_input_options.add_argument('-a', '--all-vs-all', dest='all_vs_all',
|
765
|
+
action='store_true', required=False,
|
766
|
+
help='Perform all-vs-all comparison, using only query input.')
|
767
|
+
additional_input_options.add_argument('--input-paths', dest='input_paths',
|
768
|
+
action='store_true', required=False,
|
769
|
+
help='The input files are direct paths to the data, not lists of files.')
|
770
|
+
misc_options = parser.add_argument_group('Miscellaneous options')
|
771
|
+
misc_options.add_argument('--virus', dest='virus', action='store_true', required=False,
|
772
|
+
help='Toggle virus-virus comparisons. Use only with viral genomes or proteins.')
|
773
|
+
misc_options.add_argument('-t', '--threads', dest='threads', action='store', default=1, type=int, required=False,
|
774
|
+
help='Number of threads to use, by default 1')
|
775
|
+
misc_options.add_argument('-k', '--keep', dest='keep', action='store_false', required=False,
|
776
|
+
help='Keep intermediate files, by default true')
|
777
|
+
|
778
|
+
args = parser.parse_args()
|
779
|
+
|
780
|
+
query_genomes = args.query_genomes
|
781
|
+
query_proteins = args.query_proteins
|
782
|
+
query_hmms = args.query_hmms
|
783
|
+
query_database = args.query_database
|
784
|
+
if args.all_vs_all:
|
785
|
+
reference_genomes = query_genomes
|
786
|
+
reference_proteins = query_proteins
|
787
|
+
reference_hmms = query_hmms
|
788
|
+
reference_database = query_database
|
789
|
+
else:
|
790
|
+
reference_genomes = args.reference_genomes
|
791
|
+
reference_proteins = args.reference_proteins
|
792
|
+
reference_hmms = args.reference_hmms
|
793
|
+
reference_database = args.reference_database
|
794
|
+
output = args.output
|
795
|
+
if output == None:
|
796
|
+
output == "kaai_comparisons.txt"
|
797
|
+
extension = args.extension
|
798
|
+
index_db = args.index_db
|
799
|
+
threads = args.threads
|
800
|
+
keep = args.keep
|
801
|
+
virus = args.virus
|
802
|
+
input_paths = args.input_paths
|
803
|
+
|
804
|
+
print("FastAAI started on {}".format(datetime.datetime.now()))
|
805
|
+
# Check user input
|
806
|
+
# ------------------------------------------------------
|
807
|
+
# Check if no query was provided
|
808
|
+
if query_genomes == None and query_proteins == None and query_hmms == None and query_database == None:
|
809
|
+
exit('Please prove a file with a list of queries, e.g., --qg, --qp, --qh, or --qd)')
|
810
|
+
# Check query inputs
|
811
|
+
query_input = None
|
812
|
+
if query_hmms != None:
|
813
|
+
if virus == True:
|
814
|
+
exit("If you are comparing viruses, please start from the genome or protein files.")
|
815
|
+
query_input = query_hmms
|
816
|
+
if query_proteins != None:
|
817
|
+
print("Starting from query hmmsearch results.")
|
818
|
+
print("You also provided the list of protein files used for hmmsearch.")
|
819
|
+
elif query_proteins == None:
|
820
|
+
print("You chose to start from pre-computed hmmsearch results for your queries (--qh).")
|
821
|
+
print("However, I also need the location of the query proteins used for hmmsearch.")
|
822
|
+
exit("Please provide them with --qp.")
|
823
|
+
elif query_proteins != None:
|
824
|
+
query_input = query_proteins
|
825
|
+
print("Starting from query proteins.")
|
826
|
+
elif query_genomes != None:
|
827
|
+
query_input = query_genomes
|
828
|
+
print("Starting from query genomes.")
|
829
|
+
elif query_database != None:
|
830
|
+
query_input = query_database
|
831
|
+
print("Starting from the pre-indexed query database.")
|
832
|
+
# Check if no reference was provided
|
833
|
+
if reference_genomes == None and reference_proteins == None and reference_hmms == None and reference_database == None:
|
834
|
+
exit('Please prove a file with a list of references, e.g., --rg, --rp, --rh, or --rd)')
|
835
|
+
# Check reference inputs
|
836
|
+
reference_input = None
|
837
|
+
if reference_hmms != None:
|
838
|
+
if virus == True:
|
839
|
+
exit("If you are comparing viruses, please start from the genome or protein files.")
|
840
|
+
reference_input = reference_hmms
|
841
|
+
if reference_proteins != None:
|
842
|
+
print("Starting from reference hmmsearch results.")
|
843
|
+
print("You also provided the list of protein files used for hmmsearch.")
|
844
|
+
elif reference_proteins == None:
|
845
|
+
print("You chose to start from pre-computed hmmsearch results for your references (--rh).")
|
846
|
+
print("However, I also need the location of the query proteins used for hmmsearch.")
|
847
|
+
exit("Please provide them with --rp.")
|
848
|
+
elif reference_proteins != None:
|
849
|
+
reference_input = reference_proteins
|
850
|
+
print("Starting from reference proteins.")
|
851
|
+
elif reference_genomes != None:
|
852
|
+
reference_input = reference_genomes
|
853
|
+
print("Starting from reference genomes.")
|
854
|
+
elif reference_database != None:
|
855
|
+
reference_input = reference_database
|
856
|
+
print("Starting from the pre-indexed reference database.")
|
857
|
+
# ------------------------------------------------------
|
858
|
+
|
859
|
+
# Create temporal working directory
|
860
|
+
temporal_working_directory = TemporaryDirectory()
|
861
|
+
# ------------------------------------------------------
|
862
|
+
|
863
|
+
# Check if queries are the same as references (an all-vs-all comparison)
|
864
|
+
# ------------------------------------------------------
|
865
|
+
same_inputs = False
|
866
|
+
if query_input == reference_input:
|
867
|
+
same_inputs = True
|
868
|
+
if same_inputs == True:
|
869
|
+
print('You specified the same query and reference files.')
|
870
|
+
print('I will perform an all vs all comparison :)')
|
871
|
+
# ------------------------------------------------------
|
872
|
+
|
873
|
+
#* Database Parsing is the same regardless of bacterial or viral genomes
|
874
|
+
# If using pre-indexed databases, check if they are valid files.
|
875
|
+
# ------------------------------------------------------
|
876
|
+
# If any of the starting points is from database, then store the
|
877
|
+
# kmer structures in the corresponding dictionaries.
|
878
|
+
# Otherwise read the file list and get the filenames
|
879
|
+
query_kmer_dict = None
|
880
|
+
query_kmer_dict_list = []
|
881
|
+
reference_kmer_dict = None
|
882
|
+
reference_kmer_dict_list = []
|
883
|
+
query_database_files = []
|
884
|
+
reference_database_files = []
|
885
|
+
if query_database != None:
|
886
|
+
if input_paths == True:
|
887
|
+
query_database_files.append(query_database)
|
888
|
+
else:
|
889
|
+
with open(query_database) as database_files:
|
890
|
+
for db_location in database_files:
|
891
|
+
query_database_files.append(db_location)
|
892
|
+
if reference_database != None:
|
893
|
+
if input_paths == True:
|
894
|
+
reference_database_files.append(reference_database)
|
895
|
+
else:
|
896
|
+
with open(reference_database) as database_files:
|
897
|
+
for db_location in database_files:
|
898
|
+
reference_database_files.append(db_location)
|
899
|
+
|
900
|
+
# If starting from database and query == reference
|
901
|
+
if same_inputs == True:
|
902
|
+
if query_database != None:
|
903
|
+
for db_location in query_database_files:
|
904
|
+
if Path(db_location.strip()).is_file():
|
905
|
+
with gzip.open(db_location.strip(), 'rb') as database_handle:
|
906
|
+
temp_dict = pickle.load(database_handle)
|
907
|
+
if isinstance(temp_dict,dict):
|
908
|
+
query_kmer_dict_list.append(temp_dict)
|
909
|
+
#Carlos, this line serves no purpose but does take a bunch of time and mem.
|
910
|
+
#print(query_kmer_dict_list)
|
911
|
+
else:
|
912
|
+
exit("One of the database files appear to have the wrong format. Please provide a correctly formated database.")
|
913
|
+
query_kmer_dict = merge_dicts(query_kmer_dict_list)
|
914
|
+
else:
|
915
|
+
# If the inputs are not the same:
|
916
|
+
# If query and ref are provided
|
917
|
+
if query_database != None and reference_database != None:
|
918
|
+
for db_location in query_database_files:
|
919
|
+
if Path(db_location.strip()).is_file():
|
920
|
+
with gzip.open(db_location.strip(), 'rb') as database_handle:
|
921
|
+
temp_dict = pickle.load(database_handle)
|
922
|
+
if isinstance(temp_dict,dict):
|
923
|
+
query_kmer_dict_list.append(temp_dict)
|
924
|
+
else:
|
925
|
+
exit("One of the query database files appear to have the wrong format. Please provide a correctly formated database.")
|
926
|
+
query_kmer_dict = merge_dicts(query_kmer_dict_list)
|
927
|
+
for db_location in reference_database_files:
|
928
|
+
if Path(db_location.strip()).is_file():
|
929
|
+
with gzip.open(db_location.strip(), 'rb') as database_handle:
|
930
|
+
temp_dict = pickle.load(database_handle)
|
931
|
+
if isinstance(temp_dict,dict):
|
932
|
+
reference_kmer_dict_list.append(temp_dict)
|
933
|
+
else:
|
934
|
+
exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated database.")
|
935
|
+
reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
|
936
|
+
# If only the query has a db
|
937
|
+
elif query_database != None and reference_database == None:
|
938
|
+
for db_location in query_database_files:
|
939
|
+
if Path(db_location.strip()).is_file():
|
940
|
+
with gzip.open(db_location.strip(), 'rb') as database_handle:
|
941
|
+
temp_dict = pickle.load(database_handle)
|
942
|
+
if isinstance(temp_dict,dict):
|
943
|
+
query_kmer_dict_list.append(temp_dict)
|
944
|
+
else:
|
945
|
+
exit("One of the query database files appear to have the wrong format. Please provide a correctly formated database.")
|
946
|
+
query_kmer_dict = merge_dicts(query_kmer_dict_list)
|
947
|
+
# If only the reference has a db
|
948
|
+
elif query_database == None and reference_database != None:
|
949
|
+
for db_location in reference_database_files:
|
950
|
+
if Path(db_location.strip()).is_file():
|
951
|
+
with gzip.open(db_location.strip(), 'rb') as database_handle:
|
952
|
+
temp_dict = pickle.load(database_handle)
|
953
|
+
if isinstance(temp_dict,dict):
|
954
|
+
reference_kmer_dict_list.append(temp_dict)
|
955
|
+
else:
|
956
|
+
exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated database.")
|
957
|
+
reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
|
958
|
+
# ------------------------------------------------------
|
959
|
+
|
960
|
+
# Get files from the query and reference lists and then
|
961
|
+
# create a dictionary with resulting filenames and a list with dictionary keys
|
962
|
+
# The structure of the dictionary is:
|
963
|
+
# original_query, proteins, hmms, filtered_hmms
|
964
|
+
# ------------------------------------------------------
|
965
|
+
# First parse the query:
|
966
|
+
query_list = []
|
967
|
+
query_file_names = {}
|
968
|
+
# For bacterial genomes
|
969
|
+
if virus == False:
|
970
|
+
if query_database != None:
|
971
|
+
pass
|
972
|
+
else:
|
973
|
+
if input_paths == True:
|
974
|
+
query_list.append(query_input)
|
975
|
+
else:
|
976
|
+
with open(query_input, 'r') as query_input_fh:
|
977
|
+
for line in query_input_fh:
|
978
|
+
query_list.append(line.strip())
|
979
|
+
for index, query in enumerate(query_list):
|
980
|
+
query_name = str(Path(query).name)
|
981
|
+
if extension != None:
|
982
|
+
query_name = query_name.replace(extension, "")
|
983
|
+
if query_hmms != None:
|
984
|
+
query_protein_list = []
|
985
|
+
with open(query_proteins, 'r') as query_protein_fh:
|
986
|
+
for line in query_protein_fh:
|
987
|
+
query_protein_list.append(line.strip())
|
988
|
+
query_file_names[query_name] = [None, query_protein_list[index], query, query + '.filt']
|
989
|
+
elif query_proteins != None:
|
990
|
+
query_file_names[query_name] = [None, query, query + '.hmm', query + '.hmm.filt']
|
991
|
+
elif query_genomes != None:
|
992
|
+
query_file_names[query_name] = [query, query + '.faa', query + '.faa.hmm', query + '.faa.hmm.filt']
|
993
|
+
# For viral genomes
|
994
|
+
else:
|
995
|
+
if query_database != None:
|
996
|
+
pass
|
997
|
+
else:
|
998
|
+
if input_paths == True:
|
999
|
+
query_list.append(query_input)
|
1000
|
+
else:
|
1001
|
+
with open(query_input, 'r') as query_input_fh:
|
1002
|
+
for line in query_input_fh:
|
1003
|
+
query_list.append(line.strip())
|
1004
|
+
for index, query in enumerate(query_list):
|
1005
|
+
query_name = str(Path(query).name)
|
1006
|
+
if extension != None:
|
1007
|
+
query_name = query_name.replace(extension, "")
|
1008
|
+
if query_proteins != None:
|
1009
|
+
query_file_names[query_name] = [None, query]
|
1010
|
+
elif query_genomes != None:
|
1011
|
+
query_file_names[query_name] = [query, query + '.faa']
|
1012
|
+
|
1013
|
+
# Then parse the references:
|
1014
|
+
reference_list = []
|
1015
|
+
reference_file_names = {}
|
1016
|
+
if same_inputs == True:
|
1017
|
+
pass
|
1018
|
+
else:
|
1019
|
+
# For bacterial genomes
|
1020
|
+
if virus == False:
|
1021
|
+
if reference_database != None:
|
1022
|
+
pass
|
1023
|
+
else:
|
1024
|
+
if input_paths == True:
|
1025
|
+
reference_list.append(reference_input)
|
1026
|
+
else:
|
1027
|
+
with open(reference_input, 'r') as reference_input_fh:
|
1028
|
+
for line in reference_input_fh:
|
1029
|
+
reference_list.append(line.strip())
|
1030
|
+
for index, reference in enumerate(reference_list):
|
1031
|
+
reference_name = str(Path(reference).name)
|
1032
|
+
if extension != None:
|
1033
|
+
reference_name = reference_name.replace(extension, "")
|
1034
|
+
if reference_hmms != None:
|
1035
|
+
reference_protein_list = []
|
1036
|
+
with open(reference_proteins, 'r') as reference_protein_fh:
|
1037
|
+
for line in reference_protein_fh:
|
1038
|
+
reference_protein_list.append(line.strip())
|
1039
|
+
reference_file_names[reference_name] = [None, reference_protein_list[index], reference, reference + '.filt']
|
1040
|
+
elif reference_proteins != None:
|
1041
|
+
reference_file_names[reference_name] = [None, reference, reference + '.hmm', reference + '.hmm.filt']
|
1042
|
+
elif query_genomes != None:
|
1043
|
+
reference_file_names[reference_name] = [reference, reference + '.faa', reference + '.faa.hmm', reference + '.faa.hmm.filt']
|
1044
|
+
# For viral genomes
|
1045
|
+
else:
|
1046
|
+
if reference_database != None:
|
1047
|
+
pass
|
1048
|
+
else:
|
1049
|
+
if input_paths == True:
|
1050
|
+
reference_list.append(reference_input)
|
1051
|
+
else:
|
1052
|
+
with open(reference_input, 'r') as reference_input_fh:
|
1053
|
+
for line in reference_input_fh:
|
1054
|
+
reference_list.append(line.strip())
|
1055
|
+
for index, reference in enumerate(reference_list):
|
1056
|
+
reference_name = str(Path(reference).name)
|
1057
|
+
if extension != None:
|
1058
|
+
reference_name = reference_name.replace(extension, "")
|
1059
|
+
if reference_proteins != None:
|
1060
|
+
reference_file_names[reference_name] = [None, reference]
|
1061
|
+
elif query_genomes != None:
|
1062
|
+
reference_file_names[reference_name] = [reference, reference + '.faa']
|
1063
|
+
# ------------------------------------------------------
|
1064
|
+
|
1065
|
+
# Pre-index and store databases
|
1066
|
+
# ------------------------------------------------------
|
1067
|
+
# Pre-index queries
|
1068
|
+
if query_kmer_dict == None:
|
1069
|
+
print("Processing queries...")
|
1070
|
+
# If using bacterial genomes
|
1071
|
+
if virus == False:
|
1072
|
+
if query_hmms != None:
|
1073
|
+
query_hmm_results = query_list
|
1074
|
+
elif query_proteins != None:
|
1075
|
+
query_protein_files = query_list
|
1076
|
+
print("Searching against HMM models...")
|
1077
|
+
try:
|
1078
|
+
pool = multiprocessing.Pool(threads)
|
1079
|
+
query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
|
1080
|
+
finally:
|
1081
|
+
pool.close()
|
1082
|
+
pool.join()
|
1083
|
+
elif query_genomes != None:
|
1084
|
+
print("Predicting proteins...")
|
1085
|
+
# Predict query proteins
|
1086
|
+
try:
|
1087
|
+
pool = multiprocessing.Pool(threads)
|
1088
|
+
query_protein_files = pool.map(run_prodigal, query_list)
|
1089
|
+
finally:
|
1090
|
+
pool.close()
|
1091
|
+
pool.join()
|
1092
|
+
print("Done!")
|
1093
|
+
print("Searching against HMM models...")
|
1094
|
+
# Run hmmsearch against proteins predicted
|
1095
|
+
try:
|
1096
|
+
pool = multiprocessing.Pool(threads)
|
1097
|
+
query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
|
1098
|
+
finally:
|
1099
|
+
pool.close()
|
1100
|
+
pool.join()
|
1101
|
+
print("Done!")
|
1102
|
+
print("Filtering query hmmsearch results...")
|
1103
|
+
# Filter query HMM search results
|
1104
|
+
try:
|
1105
|
+
pool = multiprocessing.Pool(threads)
|
1106
|
+
pool.map(partial(hmm_filter, keep=keep), query_hmm_results)
|
1107
|
+
finally:
|
1108
|
+
pool.close()
|
1109
|
+
pool.join()
|
1110
|
+
print("Extracting kmers from query proteins...")
|
1111
|
+
# Finding kmers for all queries
|
1112
|
+
query_information = []
|
1113
|
+
for name, values in query_file_names.items():
|
1114
|
+
query_information.append((name, values[1], values[3]))
|
1115
|
+
try:
|
1116
|
+
pool = multiprocessing.Pool(threads)
|
1117
|
+
kmer_results = pool.map(kmer_extract, query_information)
|
1118
|
+
finally:
|
1119
|
+
pool.close()
|
1120
|
+
pool.join()
|
1121
|
+
query_kmer_dict = merge_dicts(kmer_results)
|
1122
|
+
del kmer_results
|
1123
|
+
# If using viral genomes
|
1124
|
+
else:
|
1125
|
+
if query_genomes != None:
|
1126
|
+
print("Predicting proteins...")
|
1127
|
+
# Predict query proteins
|
1128
|
+
try:
|
1129
|
+
pool = multiprocessing.Pool(threads)
|
1130
|
+
query_protein_files = pool.map(run_prodigal_virus, query_list)
|
1131
|
+
finally:
|
1132
|
+
pool.close()
|
1133
|
+
pool.join()
|
1134
|
+
print("Done!")
|
1135
|
+
elif query_proteins != None:
|
1136
|
+
query_protein_files = query_list
|
1137
|
+
print("Extracting kmers from query proteins...")
|
1138
|
+
query_information = []
|
1139
|
+
for name, values in query_file_names.items():
|
1140
|
+
query_information.append((name, values[1], 4))
|
1141
|
+
try:
|
1142
|
+
pool = multiprocessing.Pool(threads)
|
1143
|
+
kmer_results = pool.map(read_viral_kmers_from_file, query_information)
|
1144
|
+
finally:
|
1145
|
+
pool.close()
|
1146
|
+
pool.join()
|
1147
|
+
query_kmer_dict = merge_dicts(kmer_results)
|
1148
|
+
del kmer_results
|
1149
|
+
|
1150
|
+
# Pre-index references (if different from queries)
|
1151
|
+
if same_inputs == False and reference_kmer_dict == None:
|
1152
|
+
print("Processing references...")
|
1153
|
+
# If using bacterial genomes
|
1154
|
+
if virus == False:
|
1155
|
+
if reference_hmms != None:
|
1156
|
+
reference_hmm_results = reference_list
|
1157
|
+
elif reference_proteins != None:
|
1158
|
+
reference_protein_files = reference_list
|
1159
|
+
print("Searching against HMM models... ")
|
1160
|
+
try:
|
1161
|
+
pool = multiprocessing.Pool(threads)
|
1162
|
+
reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
|
1163
|
+
finally:
|
1164
|
+
pool.close()
|
1165
|
+
pool.join()
|
1166
|
+
if reference_genomes != None:
|
1167
|
+
print("Predicting proteins...")
|
1168
|
+
# Predict reference proteins
|
1169
|
+
try:
|
1170
|
+
pool = multiprocessing.Pool(threads)
|
1171
|
+
reference_protein_files = pool.map(run_prodigal, reference_list)
|
1172
|
+
finally:
|
1173
|
+
pool.close()
|
1174
|
+
pool.join()
|
1175
|
+
print("Done!")
|
1176
|
+
print("Searching against HMM models...")
|
1177
|
+
# Run hmmsearch against proteins predicted
|
1178
|
+
try:
|
1179
|
+
pool = multiprocessing.Pool(threads)
|
1180
|
+
reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
|
1181
|
+
finally:
|
1182
|
+
pool.close()
|
1183
|
+
pool.join()
|
1184
|
+
print("Done!")
|
1185
|
+
print("Filtering reference hmmsearch results...")
|
1186
|
+
# Filter reference HMM search results
|
1187
|
+
try:
|
1188
|
+
pool = multiprocessing.Pool(threads)
|
1189
|
+
pool.map(partial(hmm_filter, keep=keep), reference_hmm_results)
|
1190
|
+
finally:
|
1191
|
+
pool.close()
|
1192
|
+
pool.join()
|
1193
|
+
print("Extracting kmers from reference proteins...")
|
1194
|
+
# Finding kmers for all references
|
1195
|
+
reference_information = []
|
1196
|
+
for name, values in reference_file_names.items():
|
1197
|
+
reference_information.append((name, values[1], values[3]))
|
1198
|
+
try:
|
1199
|
+
pool = multiprocessing.Pool(threads)
|
1200
|
+
kmer_results = pool.map(kmer_extract, reference_information)
|
1201
|
+
finally:
|
1202
|
+
pool.close()
|
1203
|
+
pool.join()
|
1204
|
+
reference_kmer_dict = merge_dicts(kmer_results)
|
1205
|
+
del kmer_results
|
1206
|
+
# If using viral genomes
|
1207
|
+
else:
|
1208
|
+
if query_genomes != None:
|
1209
|
+
print("Predicting proteins...")
|
1210
|
+
# Predict query proteins
|
1211
|
+
try:
|
1212
|
+
pool = multiprocessing.Pool(threads)
|
1213
|
+
query_protein_files = pool.map(run_prodigal, query_list)
|
1214
|
+
finally:
|
1215
|
+
pool.close()
|
1216
|
+
pool.join()
|
1217
|
+
print("Done!")
|
1218
|
+
elif query_proteins != None:
|
1219
|
+
query_protein_files = query_list
|
1220
|
+
print("Extracting kmers from query proteins...")
|
1221
|
+
reference_information = []
|
1222
|
+
for name, values in reference_file_names.items():
|
1223
|
+
reference_information.append((name, values[1], 4))
|
1224
|
+
try:
|
1225
|
+
pool = multiprocessing.Pool(threads)
|
1226
|
+
kmer_results = pool.map(read_viral_kmers_from_file, reference_information)
|
1227
|
+
finally:
|
1228
|
+
pool.close()
|
1229
|
+
pool.join()
|
1230
|
+
reference_kmer_dict = merge_dicts(kmer_results)
|
1231
|
+
del kmer_results
|
1232
|
+
# ------------------------------------------------------
|
1233
|
+
|
1234
|
+
# Create our database(s) and compress it (them)
|
1235
|
+
# ------------------------------------------------------
|
1236
|
+
if same_inputs == True and query_database == None:
|
1237
|
+
print("Saving pre-indexed database...")
|
1238
|
+
query_database_name = query_input + '.db.gz'
|
1239
|
+
with gzip.open(query_database_name, 'wb') as database_handle:
|
1240
|
+
pickle.dump(query_kmer_dict, database_handle, protocol=4)
|
1241
|
+
if same_inputs == False and query_database == None and reference_database == None:
|
1242
|
+
print("Saving pre-indexed databases...")
|
1243
|
+
query_database_name = query_input + '.db.gz'
|
1244
|
+
reference_database_name = reference_input + '.db.gz'
|
1245
|
+
with gzip.open(query_database_name, 'wb') as database_handle:
|
1246
|
+
pickle.dump(query_kmer_dict, database_handle, protocol=4)
|
1247
|
+
with gzip.open(reference_database_name, 'wb') as database_handle:
|
1248
|
+
pickle.dump(reference_kmer_dict, database_handle, protocol=4)
|
1249
|
+
elif same_inputs == False and query_database == None:
|
1250
|
+
print("Saving pre-indexed query database...")
|
1251
|
+
query_database_name = query_input + '.db.gz'
|
1252
|
+
with gzip.open(query_database_name, 'wb') as database_handle:
|
1253
|
+
pickle.dump(query_kmer_dict, database_handle, protocol=4)
|
1254
|
+
elif same_inputs == False and reference_database == None:
|
1255
|
+
print("Saving pre-indexed reference database...")
|
1256
|
+
reference_database_name = reference_input + '.db.gz'
|
1257
|
+
with gzip.open(reference_database_name, 'wb') as database_handle:
|
1258
|
+
pickle.dump(reference_kmer_dict, database_handle, protocol=4)
|
1259
|
+
# ------------------------------------------------------
|
1260
|
+
# Calculate Jaccard distances
|
1261
|
+
# ------------------------------------------------------
|
1262
|
+
if index_db == True:
|
1263
|
+
print("Finished pre-indexing databases.")
|
1264
|
+
print("Next time you can run the program using only these files with --qd and(or) --rd.")
|
1265
|
+
else:
|
1266
|
+
print("Calculating shared kmer fraction...")
|
1267
|
+
if virus == False:
|
1268
|
+
if same_inputs == True:
|
1269
|
+
# Create global kmer index dictionary "global_kmer_index_dictionary"
|
1270
|
+
print(temporal_working_directory)
|
1271
|
+
global_unique_kmers([query_kmer_dict])
|
1272
|
+
query_kmer_dict, query_smart_args_tempdir = transform_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=True)
|
1273
|
+
print("Beginning FastAAI pairwise calculations now.")
|
1274
|
+
try:
|
1275
|
+
pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (query_kmer_dict,))
|
1276
|
+
Fraction_Results = pool.map(single_kaai_parser, query_smart_args_tempdir)
|
1277
|
+
finally:
|
1278
|
+
pool.close()
|
1279
|
+
pool.join()
|
1280
|
+
else:
|
1281
|
+
print(temporal_working_directory)
|
1282
|
+
global_unique_kmers([query_kmer_dict, reference_kmer_dict])
|
1283
|
+
query_kmer_dict, query_smart_args_tempdir = transform_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=False)
|
1284
|
+
reference_kmer_dict, _ref_smart_args_tempdir = transform_kmer_dicts_to_arrays(reference_kmer_dict, temporal_working_directory, single_dataset=False)
|
1285
|
+
print("Beginning FastAAI pairwise calculations now.")
|
1286
|
+
try:
|
1287
|
+
pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
|
1288
|
+
Fraction_Results = pool.map(double_kaai_parser, query_smart_args_tempdir)
|
1289
|
+
finally:
|
1290
|
+
pool.close()
|
1291
|
+
pool.join()
|
1292
|
+
else:
|
1293
|
+
if same_inputs == True:
|
1294
|
+
print(temporal_working_directory)
|
1295
|
+
global_unique_viral_kmers([query_kmer_dict])
|
1296
|
+
query_kmer_dict, query_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=True)
|
1297
|
+
print("Beginning FastAAI pairwise calculations now.")
|
1298
|
+
try:
|
1299
|
+
pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (query_kmer_dict,))
|
1300
|
+
Fraction_Results = pool.map(single_virus_kaai_parser, query_smart_args_tempdir)
|
1301
|
+
finally:
|
1302
|
+
pool.close()
|
1303
|
+
pool.join()
|
1304
|
+
else:
|
1305
|
+
print(temporal_working_directory)
|
1306
|
+
global_unique_viral_kmers([query_kmer_dict, reference_kmer_dict])
|
1307
|
+
query_kmer_dict, query_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=False)
|
1308
|
+
reference_kmer_dict, _ref_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(reference_kmer_dict, temporal_working_directory, single_dataset=False)
|
1309
|
+
print("Beginning FastAAI pairwise calculations now.")
|
1310
|
+
try:
|
1311
|
+
pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
|
1312
|
+
Fraction_Results = pool.map(double_viral_kaai_parser, query_smart_args_tempdir)
|
1313
|
+
finally:
|
1314
|
+
pool.close()
|
1315
|
+
pool.join()
|
1316
|
+
# ------------------------------------------------------
|
1317
|
+
|
1318
|
+
# Merge results into a single output
|
1319
|
+
# ------------------------------------------------------
|
1320
|
+
print("Merging results...")
|
1321
|
+
print(temporal_working_directory)
|
1322
|
+
with open(output, 'w') as outfile:
|
1323
|
+
for file in Fraction_Results:
|
1324
|
+
with open(file) as Temp:
|
1325
|
+
shutil.copyfileobj(Temp, outfile)
|
1326
|
+
file.unlink()
|
1327
|
+
print("FastAAI finishied correctly on {}".format(datetime.datetime.now()))
|
1328
|
+
# ------------------------------------------------------
|
1329
|
+
# If comparing viral genomes
|
1330
|
+
|
1331
|
+
|
1332
|
+
|
1333
|
+
|
1334
|
+
|
1335
|
+
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
|
1336
|
+
main()
|