miga-base 1.2.17.0 → 1.2.17.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (265) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +2 -2
  3. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  4. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  5. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  6. data/utils/FastAAI/FastAAI +3659 -0
  7. data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
  8. data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
  9. data/utils/FastAAI/README.md +84 -0
  10. data/utils/enveomics/Docs/recplot2.md +244 -0
  11. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  12. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  13. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  14. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  15. data/utils/enveomics/LICENSE.txt +73 -0
  16. data/utils/enveomics/Makefile +52 -0
  17. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  18. data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
  19. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  20. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  21. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  22. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  23. data/utils/enveomics/Manifest/Tasks/mapping.json +165 -0
  24. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  25. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  26. data/utils/enveomics/Manifest/Tasks/remote.json +356 -0
  27. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
  28. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  29. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  30. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  31. data/utils/enveomics/Manifest/categories.json +165 -0
  32. data/utils/enveomics/Manifest/examples.json +162 -0
  33. data/utils/enveomics/Manifest/tasks.json +4 -0
  34. data/utils/enveomics/README.md +42 -0
  35. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  36. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  37. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  38. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  39. data/utils/enveomics/Scripts/BedGraph.tad.rb +138 -0
  40. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  41. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  42. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  43. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  44. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  45. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  46. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  47. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  48. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  49. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  50. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  51. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  52. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  53. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  54. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  55. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  56. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  57. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  58. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
  59. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  60. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  61. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  62. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  63. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  64. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  65. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  66. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  67. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  68. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  69. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  70. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  71. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  72. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  73. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  74. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  75. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  76. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  77. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  78. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  79. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  80. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  81. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  82. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  83. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  84. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  85. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  86. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  87. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  88. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  89. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  90. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  91. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  92. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  93. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  94. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  95. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  96. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  97. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  98. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  99. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  100. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  101. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  102. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  103. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  104. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  105. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  106. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  107. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  108. data/utils/enveomics/Scripts/SRA.download.bash +67 -0
  109. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  110. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  111. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  112. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  113. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  114. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  115. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  116. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  117. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  118. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  119. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  120. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  121. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  122. data/utils/enveomics/Scripts/aai.rb +421 -0
  123. data/utils/enveomics/Scripts/ani.rb +362 -0
  124. data/utils/enveomics/Scripts/anir.rb +137 -0
  125. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  126. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  127. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  128. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  129. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  130. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  131. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  132. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  133. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  134. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  135. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  136. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  137. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
  138. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  139. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  140. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  141. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  142. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  143. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  144. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  145. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
  146. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  147. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  148. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  149. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  150. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  151. data/utils/enveomics/Scripts/ogs.rb +104 -0
  152. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  153. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  154. data/utils/enveomics/Scripts/rbm.rb +108 -0
  155. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  156. data/utils/enveomics/Tests/Makefile +10 -0
  157. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  158. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  159. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  160. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  161. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  162. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  163. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  164. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  165. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  166. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  167. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  168. data/utils/enveomics/Tests/alkB.nwk +1 -0
  169. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  170. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  171. data/utils/enveomics/Tests/hiv1.faa +59 -0
  172. data/utils/enveomics/Tests/hiv1.fna +134 -0
  173. data/utils/enveomics/Tests/hiv2.faa +70 -0
  174. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  175. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  176. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  177. data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
  178. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  179. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  180. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  181. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  182. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  183. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  184. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  185. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  186. data/utils/enveomics/build_enveomics_r.bash +45 -0
  187. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  188. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  189. data/utils/enveomics/enveomics.R/R/autoprune.R +167 -0
  190. data/utils/enveomics/enveomics.R/R/barplot.R +203 -0
  191. data/utils/enveomics/enveomics.R/R/cliopts.R +141 -0
  192. data/utils/enveomics/enveomics.R/R/df2dist.R +192 -0
  193. data/utils/enveomics/enveomics.R/R/growthcurve.R +349 -0
  194. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  195. data/utils/enveomics/enveomics.R/R/recplot.R +419 -0
  196. data/utils/enveomics/enveomics.R/R/recplot2.R +1698 -0
  197. data/utils/enveomics/enveomics.R/R/tribs.R +638 -0
  198. data/utils/enveomics/enveomics.R/R/utils.R +90 -0
  199. data/utils/enveomics/enveomics.R/README.md +81 -0
  200. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  201. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  202. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  203. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  204. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  205. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  206. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +47 -0
  207. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  208. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  209. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +26 -0
  210. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +26 -0
  211. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +44 -0
  212. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +111 -0
  213. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  214. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +34 -0
  215. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +25 -0
  216. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +59 -0
  217. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +63 -0
  218. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +46 -0
  219. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +78 -0
  220. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  221. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  222. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +147 -0
  223. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  224. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +27 -0
  225. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  226. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +28 -0
  227. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +24 -0
  228. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +22 -0
  229. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +22 -0
  230. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +52 -0
  231. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  232. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +21 -0
  233. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  234. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +34 -0
  235. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +23 -0
  236. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +24 -0
  237. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +31 -0
  238. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +56 -0
  239. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +20 -0
  240. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  241. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  242. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  243. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  244. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  245. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  246. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  247. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  248. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  249. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  250. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  251. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  252. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +81 -0
  253. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +49 -0
  254. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +48 -0
  255. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  256. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +22 -0
  257. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +22 -0
  258. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +22 -0
  259. data/utils/enveomics/globals.mk +8 -0
  260. data/utils/enveomics/manifest.json +9 -0
  261. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  262. data/utils/multitrim/README.md +67 -0
  263. data/utils/multitrim/multitrim.py +1555 -0
  264. data/utils/multitrim/multitrim.yml +13 -0
  265. metadata +268 -6
@@ -0,0 +1,1336 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ ########################################################################
5
+ # Author: Carlos Ruiz
6
+ # Institution: Georgia Institute of Technology
7
+ # Version: 1.0
8
+ # Date: Dec 10, 2020
9
+
10
+ # Description: Calculates the average amino acid identity using k-mers
11
+ from single copy genes. It is a faster version of the regular AAI (Blast
12
+ or Diamond) and the hAAI implemented in MiGA.
13
+ ########################################################################
14
+ """
15
+
16
+ ################################################################################
17
+ """---0.0 Import Modules---"""
18
+ import subprocess, argparse, multiprocessing, datetime, shutil
19
+ import textwrap, pickle, gzip
20
+ import numpy as np
21
+ from tempfile import TemporaryDirectory
22
+ from random import randint
23
+ from pathlib import Path
24
+ from sys import argv
25
+ from sys import exit
26
+ from functools import partial
27
+ import time
28
+
29
+
30
+ ################################################################################
31
+ """---1.0 Define Functions---"""
32
+ # --- Run prodigal ---
33
+ # ------------------------------------------------------
34
def run_prodigal(input_file):
    """
    Runs prodigal twice (default translation table and table 4), keeps the
    better prediction and stores it as '<input_file>.faa'

    The table 4 prediction is kept only when its total protein length is at
    least 1.1 times the total length of the default prediction; otherwise
    the default (table 11) prediction is kept. Stop codon markers ('*') are
    removed from the sequences of the file that is kept.

    Arguments:
       input_file -- Path to genome FastA file

    Returns:
       output -- Path (as string) to amino acid fasta result
    """
    def _total_sequence_length(fasta_path):
        # Sum the length of every non-header line of a FastA file.
        total = 0
        with open(fasta_path, 'r') as fasta_in:
            for line in fasta_in:
                if not line.startswith(">"):
                    total += len(line.strip())
        return total

    # Predict proteins with translation tables 11 and 4
    file_path = Path(input_file)
    filename = file_path.name
    folder = file_path.parent
    protein_output = folder / (filename + '.faa')
    output_11 = folder / (filename + '.faa.11')
    output_4 = folder / (filename + '.faa.4')
    temp_output = folder / (filename + '.temp')
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_11),
                     "-p", "meta", "-q", "-o", str(temp_output)])
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_4),
                     "-p", "meta", "-g", "4", "-q", "-o", str(temp_output)])

    # Compare translation tables
    length_4 = _total_sequence_length(output_4)
    length_11 = _total_sequence_length(output_11)
    if length_11 == 0:
        # An empty table 11 prediction used to raise ZeroDivisionError;
        # fall back to table 4 only if it actually predicted something.
        use_table_4 = length_4 > 0
    else:
        use_table_4 = (length_4 / length_11) >= 1.1

    if use_table_4:
        shutil.copy(str(output_4), str(protein_output))
    else:
        shutil.copy(str(output_11), str(protein_output))

    # Remove intermediate files
    output_4.unlink()
    output_11.unlink()
    temp_output.unlink()

    # Remove stop '*' codons from protein sequences
    with open(protein_output, 'r') as final_protein, open(temp_output, 'w') as temporal_file:
        for line in final_protein:
            if line.startswith(">"):
                temporal_file.write(line)
            else:
                temporal_file.write(line.replace('*', ''))
    shutil.copy(str(temp_output), str(protein_output))
    temp_output.unlink()

    return str(protein_output)
97
+ # ------------------------------------------------------
98
+
99
+ # --- Run prodigal for viruses ---
100
+ # ------------------------------------------------------
101
def run_prodigal_virus(input_file):
    """
    Runs prodigal once on a genome and stores the predicted proteins as
    '<input_file>.faa' with the stop codon markers ('*') removed

    Arguments:
       input_file -- Path to genome FastA file

    Returns:
       output -- Path (as string) to amino acid fasta result
    """
    genome = Path(input_file)
    proteins = genome.parent / (genome.name + '.faa')
    scratch = genome.parent / (genome.name + '.temp')

    # Single prediction; the regular prodigal output goes to a scratch file
    command = ["prodigal", "-i", str(genome), "-a", str(proteins),
               "-p", "meta", "-q", "-o", str(scratch)]
    subprocess.call(command)

    # The regular prodigal output is not needed
    scratch.unlink()

    # Rewrite the proteins through the scratch file, dropping every '*'
    # found on sequence lines (header lines are copied untouched)
    with open(proteins, 'r') as final_protein, open(scratch, 'w') as temporal_file:
        for entry in final_protein:
            is_header = entry.startswith(">")
            temporal_file.write(entry if is_header else entry.replace('*', ''))
    shutil.copy(str(scratch), str(proteins))
    scratch.unlink()

    return str(proteins)
135
+ # ------------------------------------------------------
136
+
137
+ # --- Run hmmsearch ---
138
+ # ------------------------------------------------------
139
def run_hmmsearch(input_file):
    """
    Runs hmmsearch with the complete SCG model database against a protein
    file and writes the hits table next to it as '<input_file>.hmm'

    Arguments:
       input_file -- Path to protein FastA file

    Returns:
       output -- Path (as string) to hmmsearch hits table
    """
    proteins = Path(input_file)
    workdir = proteins.parent
    hits_table = workdir / (proteins.name + '.hmm')
    stdout_dump = workdir / (proteins.name + '.temp')

    # The model database is located relative to this script
    model_db = Path(__file__).parent / "../00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"

    command = ["hmmsearch", "--tblout", str(hits_table), "-o", str(stdout_dump),
               "--cut_tc", "--cpu", "1", str(model_db), str(proteins)]
    subprocess.call(command)

    # Only the tabular output is kept
    stdout_dump.unlink()
    return str(hits_table)
162
+ # ------------------------------------------------------
163
+
164
+ # --- Filter HMM results for best matches ---
165
+ # ------------------------------------------------------
166
def hmm_filter(scg_hmm_file, keep):
    """
    Filters HMM results for best hits per protein

    For every protein (first whitespace-separated column) only the line with
    the highest score (column index 8) is kept. Ties are resolved at random.
    Comment lines (starting with '#') and blank lines are skipped. The
    result is written to '<scg_hmm_file>.filt'.

    Arguments:
        scg_hmm_file {file path} -- Path to HMM results file
        keep {bool} -- Keep HMM files (currently not used by this function)

    Returns:
        outfile -- Path (as string) to the filtered file
    """
    hmm_path = Path(scg_hmm_file)
    outfile = hmm_path.parent / (hmm_path.name + '.filt')

    # protein name -> [best score so far, original line]
    hmm_hit_dict = {}
    with open(scg_hmm_file, 'r') as hit_file:
        for line in hit_file:
            if line.startswith("#") or not line.strip():
                continue
            hit = line.split()
            protein_name = hit[0]
            score = float(hit[8])
            best = hmm_hit_dict.get(protein_name)
            if best is None or score > best[0]:
                hmm_hit_dict[protein_name] = [score, line]
            elif score == best[0] and randint(0, 1) > 0:
                # Tie: random.randint needs both bounds; the previous
                # randint(2) call raised TypeError on every tie.
                hmm_hit_dict[protein_name] = [score, line]

    with open(outfile, 'w') as output:
        for _, best_line in hmm_hit_dict.values():
            output.write(best_line)
    return str(outfile)
204
+ # ------------------------------------------------------
205
+
206
+ # --- Find Kmers from HMM results ---
207
+ # ------------------------------------------------------
208
def kmer_extract(input_files):
    """
    Extract kmers from the proteins that are the best hit of each model
    in a filtered HMM results file.

    Arguments:
        input_files {sequence} -- Three items, in this order: the name used
                                  as key of the result, the path to the
                                  protein FastA file and the path to the
                                  filtered HMM results.

    Returns:
        [genome_kmers] -- Dictionary {name: {model: kmers}} where kmers is
                          whatever read_kmers_from_file stored per protein.
    """
    final_filename = input_files[0]
    protein_file = input_files[1]
    scg_hmm_file = input_files[2]

    # model name -> [protein name, score] of the highest scoring hit
    best_by_model = {}
    with open(scg_hmm_file, 'r') as hmm_input:
        for raw in hmm_input:
            fields = raw.strip().split()
            protein_name = fields[0]
            model_name = fields[3]
            score = float(fields[8])
            current = best_by_model.get(model_name)
            if current is None or score > current[1]:
                best_by_model[model_name] = [protein_name, score]

    positive_proteins = [match[0] for match in best_by_model.values()]

    # Kmers are built with a fixed size of 4
    scg_kmers = read_kmers_from_file(protein_file, positive_proteins, 4)

    # Re-key the kmers from protein name to model name
    for accession, match in best_by_model.items():
        scg_kmers[accession] = scg_kmers.pop(match[0])

    return {final_filename: scg_kmers}
244
+ # ------------------------------------------------------
245
+
246
+ # --- Extract kmers from protein sequences ---
247
+ # ------------------------------------------------------
248
def read_kmers_from_file(filename, positive_hits, ksize):
    """
    Read a protein FastA file and build the kmers of selected proteins.

    Arguments:
        filename -- Path to the protein FastA file
        positive_hits -- Container with the names of the proteins to keep;
                         a name is the first word of the header line
        ksize -- Kmer size handed over to build_kmers

    Returns:
        [scg_kmers] -- Dictionary {protein name: result of build_kmers}
    """
    kmers_by_protein = {}
    current_name = ""
    pieces = []
    keeping = False
    with open(filename) as fasta_in:
        for raw in fasta_in:
            if not raw.startswith(">"):
                # Sequence line: only collected for selected proteins
                if keeping:
                    pieces.append(raw.strip())
                continue
            # Header line: flush the previous protein if it was selected
            if keeping:
                kmers_by_protein[current_name] = build_kmers("".join(pieces), ksize)
            pieces = []
            current_name = raw.replace(">", "").strip().split()[0]
            keeping = current_name in positive_hits
    # Flush the last protein of the file
    if keeping:
        kmers_by_protein[current_name] = build_kmers("".join(pieces), ksize)
    return kmers_by_protein
274
+ # ------------------------------------------------------
275
+
276
+ # --- Extract kmers from viral protein sequences ---
277
+ # ------------------------------------------------------
278
def read_viral_kmers_from_file(input_information):
    """
    Read a protein FastA file and pool the kmers of all its proteins.

    Arguments:
        input_information {sequence} -- Three items, in this order: the name
                                        used as key of the result, the path
                                        to the protein FastA file and the
                                        kmer size.

    Returns:
        [genome_kmers] -- Dictionary {name: [number of header lines,
                          comma-separated string of unique kmers]}
    """
    final_filename = input_information[0]
    protein_file = input_information[1]
    kmer_size = input_information[2]

    pooled_kmers = set()
    sequence = ""
    seen_header = False
    protein_count = 0
    with open(protein_file) as fasta_in:
        for raw in fasta_in:
            if raw.startswith(">"):
                protein_count += 1
                # Flush the previous protein; anything read before the
                # first header is discarded
                if seen_header:
                    pooled_kmers.update(build_viral_kmers(sequence, kmer_size))
                seen_header = True
                sequence = ""
            else:
                sequence += raw.strip()
    # Flush the last protein of the file
    if seen_header:
        pooled_kmers.update(build_viral_kmers(sequence, kmer_size))

    return {final_filename: [protein_count, ','.join(list(pooled_kmers))]}
305
+ # ------------------------------------------------------
306
+
307
+ # --- Build Kmers ---
308
+ # ------------------------------------------------------
309
def build_kmers(sequence, ksize):
    """
    Build the unique kmers of a sequence.

    Arguments:
        sequence -- Sequence to split into overlapping kmers
        ksize -- Kmer size

    Returns:
        [kmers_set] -- Comma-separated string with the unique kmers (in set
                       order); empty string when the sequence is shorter
                       than ksize
    """
    last_start = len(sequence) - ksize
    unique_kmers = {sequence[start:start + ksize] for start in range(last_start + 1)}
    return ','.join(unique_kmers)
318
+ # ------------------------------------------------------
319
+
320
+ # --- Build Viral Kmers ---
321
+ # ------------------------------------------------------
322
def build_viral_kmers(sequence, ksize):
    """
    Build the unique kmers of a sequence.

    Arguments:
        sequence -- Sequence to split into overlapping kmers
        ksize -- Kmer size

    Returns:
        [kmers_set] -- Set with the unique kmers; empty set when the
                       sequence is shorter than ksize
    """
    last_start = len(sequence) - ksize
    return {sequence[start:start + ksize] for start in range(last_start + 1)}
331
+ # ------------------------------------------------------
332
+
333
+ # --- Create global dictionary with unique kmers and indices for each one ---
334
+ # ------------------------------------------------------
335
def global_unique_kmers(kmer_dictionaries):
    """
    Extract every kmer in the whole dataset
    Create global dictionary with unique kmers and indices for each one

    Indices are assigned in order of first appearance, starting at 0. The
    result is stored in the module-level name global_kmer_index_dictionary
    (replacing any previous content); nothing is returned.

    Arguments:
        kmer_dictionaries {iterable} -- Dictionaries of the form
                                        {genome: {marker: kmers}} where
                                        kmers is a comma-separated string
    """
    # Make this dictionary global regardless of query == reference or not
    print("Indexing unique kmers")
    global global_kmer_index_dictionary
    global_kmer_index_dictionary = {}
    index = global_kmer_index_dictionary
    for kmer_dict in kmer_dictionaries:
        for marker_proteins in kmer_dict.values():
            for kmer_list in marker_proteins.values():
                for kmer in kmer_list.split(','):
                    # Explicit membership test instead of a bare except,
                    # which would also hide unrelated errors. The next free
                    # index always equals the current size of the dictionary.
                    if kmer not in index:
                        index[kmer] = len(index)
361
+ # ------------------------------------------------------
362
+
363
+ # --- Create global viral dictionary with unique kmers and indices for each one ---
364
+ # ------------------------------------------------------
365
def global_unique_viral_kmers(kmer_dictionaries):
    """
    Extract every kmer in the whole dataset
    Create global dictionary with unique kmers and indices for each one

    Indices are assigned in order of first appearance, starting at 0. The
    result is stored in the module-level name global_kmer_index_dictionary
    (replacing any previous content); nothing is returned.

    Arguments:
        kmer_dictionaries {iterable} -- Dictionaries of the form
                                        {genome: [value, kmers]} where kmers
                                        (position 1) is a comma-separated
                                        string
    """
    # Make this dictionary global regardless of query == reference or not
    print("Indexing unique kmers")
    global global_kmer_index_dictionary
    global_kmer_index_dictionary = {}
    index = global_kmer_index_dictionary
    for kmer_dict in kmer_dictionaries:
        for genome_entry in kmer_dict.values():
            for kmer in genome_entry[1].split(','):
                # Explicit membership test instead of a bare except, which
                # would also hide unrelated errors. The next free index
                # always equals the current size of the dictionary.
                if kmer not in index:
                    index[kmer] = len(index)
389
+ # ------------------------------------------------------
390
+
391
+ # --- Convert kmers to indices ---
392
+ # ------------------------------------------------------
393
def convert_kmers_to_indices(kmer_dict):
    """
    Replace, in place, the comma-separated kmers of every marker protein
    with a sorted array of unique kmer indices.

    The indices are looked up in the module-level
    global_kmer_index_dictionary, which must already contain every kmer.

    Arguments:
        kmer_dict {dict} -- Dictionary {genome: {marker: kmers}} where kmers
                            is a comma-separated string

    Returns:
        [kmer_dict] -- The same dictionary, now holding int32 numpy arrays
    """
    print("Converting kmers to indices")
    lookup = global_kmer_index_dictionary
    for markers in kmer_dict.values():
        for marker_name in markers:
            indices = [lookup[kmer] for kmer in markers[marker_name].split(',')]
            # np.unique already returns its result sorted
            markers[marker_name] = np.unique(np.array(indices, dtype=np.int32))

    return kmer_dict
404
+ # ------------------------------------------------------
405
+
406
+ # --- Convert viral kmers to indices ---
407
+ # ------------------------------------------------------
408
def convert_viral_kmers_to_indices(kmer_dict):
    """
    Replace, in place, the comma-separated kmers of every genome with a
    sorted array of unique kmer indices.

    The indices are looked up in the module-level
    global_kmer_index_dictionary, which must already contain every kmer.

    Arguments:
        kmer_dict {dict} -- Dictionary {genome: [value, kmers]} where kmers
                            (position 1) is a comma-separated string

    Returns:
        [kmer_dict] -- The same dictionary, with position 1 of every entry
                       replaced by an int32 numpy array
    """
    print("Converting kmers to indices")
    lookup = global_kmer_index_dictionary
    for entry in kmer_dict.values():
        indices = [lookup[kmer] for kmer in entry[1].split(',')]
        # np.unique already returns its result sorted
        entry[1] = np.unique(np.array(indices, dtype=np.int32))

    return kmer_dict
418
+ # ------------------------------------------------------
419
+
420
+ # --- Transform kmer dictionaries to index dictionaries ---
421
+ # ------------------------------------------------------
422
def transform_kmer_dicts_to_arrays(kmer_dict, temporal_working_directory, single_dataset):
    """
    Convert kmers to index arrays and build one argument tuple per genome.

    Arguments:
        kmer_dict {dict} -- Kmer dictionary, passed to convert_kmers_to_indices
        temporal_working_directory -- Object placed first in every tuple
        single_dataset {bool} -- If True, every tuple also carries the
            position of the genome among the dictionary keys

    Returns:
        [tuple] -- (converted kmer dictionary, list of argument tuples)
    """
    kmer_dict = convert_kmers_to_indices(kmer_dict)
    # Get skip indices
    smartargs = []
    for position, genome_id in enumerate(list(kmer_dict.keys())):
        if single_dataset == True:
            task = (temporal_working_directory, genome_id, position)
        else:
            task = (temporal_working_directory, genome_id)
        smartargs.append(task)

    return kmer_dict, smartargs
434
+ # ------------------------------------------------------
435
+
436
+ # --- Transform viral kmer dictionaries to index dictionaries ---
437
+ # ------------------------------------------------------
438
def transform_viral_kmer_dicts_to_arrays(kmer_dict, temporal_working_directory, single_dataset):
    """
    Convert viral kmers to index arrays and build one argument tuple per genome.

    Arguments:
        kmer_dict {dict} -- Viral kmer dictionary, passed to
            convert_viral_kmers_to_indices
        temporal_working_directory -- Object placed first in every tuple
        single_dataset {bool} -- If True, every tuple also carries the
            position of the genome among the dictionary keys

    Returns:
        [tuple] -- (converted kmer dictionary, list of argument tuples)
    """
    kmer_dict = convert_viral_kmers_to_indices(kmer_dict)
    # Get skip indices
    genome_ids = list(kmer_dict.keys())
    if single_dataset == True:
        smartargs = [(temporal_working_directory, genome_id, position)
                     for position, genome_id in enumerate(genome_ids)]
    else:
        smartargs = [(temporal_working_directory, genome_id) for genome_id in genome_ids]

    return kmer_dict, smartargs
450
+ # ------------------------------------------------------
451
+
452
+ # --- Parse kAAI when query == reference ---
453
+ # ------------------------------------------------------
454
def single_kaai_parser(arguments):
    """
    Compare one genome against the genomes of its own dataset (query == reference).

    For every target genome the Jaccard index of the kmer arrays is computed
    for each single-copy gene (SCG) present in both genomes. One tab-separated
    line is written per target: query, target, mean Jaccard, standard
    deviation of the Jaccard values, number of shared SCGs, number of SCGs in
    the genome with fewer SCGs, and the estimated AAI. A target without shared
    SCGs gets "NA" in the last five columns.

    Reads the module-level ``query_kmer_dictionary``.

    Arguments:
        arguments {tuple} -- (temporal folder, query id, skip_first_n). The
            folder object must expose its path as ``.name``. Targets are the
            dictionary keys from position ``skip_first_n`` onwards.

    Returns:
        [Path to output] -- Path to the ``<query name>.faai.temp`` file written
    """
    temporal_folder = arguments[0]
    query_id = arguments[1]
    skip_first_n = arguments[2]

    temporal_output = Path(str(temporal_folder.name)) / (Path(query_id).name + '.faai.temp')
    line_format = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n"

    query_markers = query_kmer_dictionary[query_id]
    query_scg_list = np.array(list(query_markers.keys()))
    with open(temporal_output, 'w') as out_file:
        for target_genome in list(query_kmer_dictionary.keys())[skip_first_n:]:
            # Get number and list of SCG detected in the target
            target_markers = query_kmer_dictionary[target_genome]
            target_scg_list = np.array(list(target_markers.keys()))
            shorter_genome = min(len(query_scg_list), len(target_scg_list))
            # If self, 1.0 similarity.
            if query_id == target_genome:
                out_file.write(line_format.format(query_id, target_genome,
                    1.0, 0.0, len(query_scg_list), len(target_scg_list), 100))
                continue

            # Jaccard index of every SCG shared by both genomes
            jaccard_similarities = []
            for scg in np.intersect1d(query_scg_list, target_scg_list):
                query_kmers = query_markers.get(scg)
                target_kmers = target_markers.get(scg)
                union = len(np.union1d(query_kmers, target_kmers))
                # Inclusion-exclusion: |A & B| = |A| + |B| - |A | B|
                intersection = len(query_kmers) + len(target_kmers) - union
                jaccard_similarities.append(intersection / union)

            if not jaccard_similarities:
                out_file.write(line_format.format(query_id, target_genome,
                    "NA", "NA", "NA", "NA", "NA"))
                continue

            # Allow for numpy in-builts; they're a little faster.
            # np.float64 replaces np.float_, an alias removed in NumPy 2.0.
            jaccard_similarities = np.array(jaccard_similarities, dtype=np.float64)
            try:
                mean = np.mean(jaccard_similarities)
                var = np.std(jaccard_similarities)
                if mean >= 0.9:
                    aai_est = ">90%"
                elif mean == 0:
                    aai_est = "<30%"
                else:
                    aai_est = kaai_to_aai(mean)
                out_file.write(line_format.format(query_id, target_genome,
                    round(mean, 4), round(var, 4),
                    len(jaccard_similarities), shorter_genome, aai_est))
            except Exception:
                # Best effort: report the pair as NA instead of aborting the
                # run, but let KeyboardInterrupt/SystemExit propagate.
                out_file.write(line_format.format(query_id, target_genome,
                    "NA", "NA", "NA", "NA", "NA"))
    return temporal_output
519
+ # ------------------------------------------------------
520
+
521
+ # --- Parse viral kAAI when query == reference ---
522
+ # ------------------------------------------------------
523
def single_virus_kaai_parser(arguments):
    """
    Compute Jaccard indices between the kmers of one viral genome and the
    genomes of the same dataset (query == reference).

    One tab-separated line is written per target: query, target, Jaccard
    index, element 0 of the query entry and element 0 of the target entry
    (presumably the protein counts of each genome).

    Reads the module-level ``query_kmer_dictionary``.

    Arguments:
        arguments {tuple} -- (temporal folder, query id, skip_first_n). The
            folder object must expose its path as ``.name``. Targets are the
            dictionary keys from position ``skip_first_n`` onwards.

    Returns:
        [Path to output] -- Path to the ``<query name>.faai.temp`` file written
    """
    folder_handle = arguments[0]
    query_id = arguments[1]
    skip_first_n = arguments[2]

    temporal_output = Path(str(folder_handle.name)) / (Path(query_id).name + '.faai.temp')
    # Get query kmers
    query_entry = query_kmer_dictionary[query_id]
    proteins_query = query_entry[0]
    kmers_query = query_entry[1]
    row = "{}\t{}\t{}\t{}\t{}\n"

    # Start comparison with all genomes in the query dictionary
    with open(temporal_output, 'w') as out_file:
        for target_genome in list(query_kmer_dictionary.keys())[skip_first_n:]:
            if query_id != target_genome:
                proteins_reference = query_kmer_dictionary[target_genome][0]
                kmers_reference = query_kmer_dictionary[target_genome][1]
                # Calculate the Jaccard Index
                union = len(np.union1d(kmers_query, kmers_reference))
                shared = len(kmers_query) + len(kmers_reference) - union
                out_file.write(row.format(query_id, target_genome,
                    shared / union, proteins_query, proteins_reference))
            else:
                # If self, 1.0 similarity
                out_file.write(row.format(query_id, target_genome,
                    1.0, proteins_query, proteins_query))
    return temporal_output
564
+ # ------------------------------------------------------
565
+
566
+ # --- Parse kAAI when query != reference ---
567
+ # ------------------------------------------------------
568
def double_kaai_parser(arguments):
    """
    Compare one query genome against every reference genome (query != reference).

    For every reference genome the Jaccard index of the kmer arrays is
    computed for each single-copy gene (SCG) present in both genomes. One
    tab-separated line is written per reference: query, target, mean Jaccard,
    standard deviation of the Jaccard values, number of shared SCGs, number
    of SCGs in the genome with fewer SCGs, and the estimated AAI. A reference
    without shared SCGs gets "NA" in the last five columns.

    Reads the module-level ``query_kmer_dictionary`` and
    ``reference_kmer_dictionary``.

    Arguments:
        arguments {tuple} -- (temporal folder, query id). The folder object
            must expose its path as ``.name``.

    Returns:
        [Path to output] -- Path to the ``<query name>.faai.temp`` file written
    """
    temporal_folder = arguments[0]
    query_id = arguments[1]

    temporal_output = Path(str(temporal_folder.name)) / (Path(query_id).name + '.faai.temp')
    line_format = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n"

    query_markers = query_kmer_dictionary[query_id]
    query_scg_list = np.array(list(query_markers.keys()))

    with open(temporal_output, 'w') as out_file:
        for target_genome in list(reference_kmer_dictionary.keys()):
            # Get number and list of SCG detected in reference
            target_markers = reference_kmer_dictionary[target_genome]
            target_scg_list = np.array(list(target_markers.keys()))
            shorter_genome = min(len(query_scg_list), len(target_scg_list))
            # If self, 1.0 similarity.
            if query_id == target_genome:
                out_file.write(line_format.format(query_id, target_genome,
                    1.0, 0.0, len(query_scg_list), len(target_scg_list), 100))
                continue

            # Jaccard index of every SCG shared by both genomes
            jaccard_similarities = []
            for scg in np.intersect1d(query_scg_list, target_scg_list):
                query_kmers = query_markers.get(scg)
                reference_kmers = target_markers.get(scg)
                union = len(np.union1d(query_kmers, reference_kmers))
                # Inclusion-exclusion: |A & B| = |A| + |B| - |A | B|
                intersection = len(query_kmers) + len(reference_kmers) - union
                jaccard_similarities.append(intersection / union)

            if not jaccard_similarities:
                out_file.write(line_format.format(query_id, target_genome,
                    "NA", "NA", "NA", "NA", "NA"))
                continue

            # Allow for numpy in-builts; they're a little faster.
            # np.float64 replaces np.float_, an alias removed in NumPy 2.0.
            jaccard_similarities = np.array(jaccard_similarities, dtype=np.float64)
            try:
                mean = np.mean(jaccard_similarities)
                var = np.std(jaccard_similarities)
                if mean >= 0.9:
                    aai_est = ">90%"
                elif mean == 0:
                    aai_est = "<30%"
                else:
                    aai_est = kaai_to_aai(mean)
                out_file.write(line_format.format(query_id, target_genome,
                    round(mean, 4), round(var, 4),
                    len(jaccard_similarities), shorter_genome, aai_est))
            except Exception:
                # Best effort: report the pair as NA instead of aborting the
                # run, but let KeyboardInterrupt/SystemExit propagate.
                out_file.write(line_format.format(query_id, target_genome,
                    "NA", "NA", "NA", "NA", "NA"))
    return temporal_output
632
+ # ------------------------------------------------------
633
+
634
+ # --- Parse viral kAAI when query != reference ---
635
+ # ------------------------------------------------------
636
def double_viral_kaai_parser(arguments):
    """
    Compute Jaccard indices between the kmers of one viral query genome and
    every genome in the reference dictionary (query != reference).

    One tab-separated line is written per reference: query, target, Jaccard
    index, element 0 of the query entry and element 0 of the reference entry
    (presumably the protein counts of each genome).

    Reads the module-level ``query_kmer_dictionary`` and
    ``reference_kmer_dictionary``.

    Arguments:
        arguments {tuple} -- (temporal folder, query id). The folder object
            must expose its path as ``.name``.

    Returns:
        [Path to output] -- Path to the ``<query name>.faai.temp`` file written
    """
    folder_handle = arguments[0]
    query_id = arguments[1]

    temporal_output = Path(str(folder_handle.name)) / (Path(query_id).name + '.faai.temp')
    # Get query kmers
    query_entry = query_kmer_dictionary[query_id]
    proteins_query = query_entry[0]
    kmers_query = query_entry[1]
    row = "{}\t{}\t{}\t{}\t{}\n"

    # Start comparison with all genomes in the reference dictionary
    with open(temporal_output, 'w') as out_file:
        for target_genome in reference_kmer_dictionary.keys():
            if query_id != target_genome:
                proteins_reference = reference_kmer_dictionary[target_genome][0]
                kmers_reference = reference_kmer_dictionary[target_genome][1]
                # Calculate the Jaccard Index
                union = len(np.union1d(kmers_query, kmers_reference))
                shared = len(kmers_query) + len(kmers_reference) - union
                out_file.write(row.format(query_id, target_genome,
                    shared / union, proteins_query, proteins_reference))
            else:
                # If self, 1.0 similarity
                out_file.write(row.format(query_id, target_genome,
                    1.0, proteins_query, proteins_query))
    return temporal_output
675
+ # ------------------------------------------------------
676
+
677
+ # --- Query == Reference initializer function ---
678
+ # ------------------------------------------------------
679
def single_dictionary_initializer(_dictionary):
    """
    Make the kmer dictionary available for multiprocessing.

    Binds ``_dictionary`` to the module-level name ``query_kmer_dictionary``.
    """
    globals().update(query_kmer_dictionary=_dictionary)
685
+ # ------------------------------------------------------
686
+
687
+ # --- Query != Reference initializer function ---
688
+ # ------------------------------------------------------
689
def two_dictionary_initializer(_query_dictionary, _reference_dictionary):
    """
    Make the query and reference kmer dictionaries available for multiprocessing.

    Binds the arguments to the module-level names ``query_kmer_dictionary``
    and ``reference_kmer_dictionary`` respectively.
    """
    globals().update(
        query_kmer_dictionary=_query_dictionary,
        reference_kmer_dictionary=_reference_dictionary,
    )
697
+ # ------------------------------------------------------
698
+
699
+ # --- Merge kmer dictionaries ---
700
+ # ------------------------------------------------------
701
def merge_dicts(dictionaries):
    """
    Build a new dict holding the key/value pairs of all given dicts.

    The copy is shallow. When a key occurs more than once, the value from
    the dict that comes last in ``dictionaries`` is kept.
    """
    merged = {}
    for current in dictionaries:
        merged.update(current)
    return merged
710
+ # ------------------------------------------------------
711
+
712
+ # --- Transform kAAI into estimated AAI ---
713
+ # ------------------------------------------------------
714
def kaai_to_aai(kaai):
    """
    Transform a kAAI value into an estimated AAI value.

    Evaluates
    (-0.3087057 + 1.810741 * exp(-(-0.2607023 * ln(kaai)) ** (1 / 3.435))) * 100.
    """
    scaled_log = -0.2607023 * np.log(kaai)
    root = scaled_log ** (1/3.435)
    return (-0.3087057 + 1.810741 * (np.exp(-root))) * 100
718
+ # ------------------------------------------------------
719
+
720
+
721
+ ################################################################################
722
+ """---2.0 Main Function---"""
723
+
724
+ def main():
725
+ # Setup parser for arguments.
726
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
727
+ description='''This script calculates the average amino acid identity using k-mers\n'''
728
+ '''from single copy genes. It is a faster version of the regular AAI '''
729
+ '''(Blast or Diamond) and the hAAI implemented in MiGA.'''
730
+ '''Usage: ''' + argv[0] + ''' -p [Protein Files] -t [Threads] -o [Output]\n'''
731
+ '''Global mandatory parameters: -g [Genome Files] OR -p [Protein Files] OR -s [SCG HMM Results] -o [AAI Table Output]\n'''
732
+ '''Optional Database Parameters: See ''' + argv[0] + ' -h')
733
+ mandatory_options = parser.add_argument_group('Mandatory i/o options. You must select an option for the queries and one for the references.')
734
+ mandatory_options.add_argument('--qg', dest='query_genomes', action='store', required=False,
735
+ help='File with list of query genomes.')
736
+ mandatory_options.add_argument('--qp', dest='query_proteins', action='store', required=False,
737
+ help='File with list of query proteins.')
738
+ mandatory_options.add_argument('--qh', dest='query_hmms', action='store', required=False,
739
+ help=textwrap.dedent('''
740
+ File with list of pre-computed query hmmsearch results.
741
+ If you select this option you must also provide a file with
742
+ a list of protein files for the queries (with --qp).
743
+ '''))
744
+ mandatory_options.add_argument('--qd', dest='query_database', action='store', required=False,
745
+ help='File with list of pre-indexed query databases.')
746
+ mandatory_options.add_argument('--rg', dest='reference_genomes', action='store', required=False,
747
+ help='File with list of reference genomes.')
748
+ mandatory_options.add_argument('--rp', dest='reference_proteins', action='store', required=False,
749
+ help='File with list of reference proteins.')
750
+ mandatory_options.add_argument('--rh', dest='reference_hmms', action='store', required=False,
751
+ help=textwrap.dedent('''
752
+ File with list of pre-computed reference hmmsearch results.
753
+ If you select this option you must also provide a file with
754
+ a list of protein files for the references (with --qp).
755
+ '''))
756
+ mandatory_options.add_argument('--rd', dest='reference_database', action='store', required=False,
757
+ help='File with list of pre-indexed reference databases.')
758
+ mandatory_options.add_argument('-o', '--output', dest='output', action='store', required=False, help='Output file. By default kaai_comparisons.txt')
759
+ additional_input_options = parser.add_argument_group('Behavior modification options.')
760
+ additional_input_options.add_argument('-e', '--ext', dest='extension', action='store', required=False,
761
+ help='Extension to remove from original filename, e.g. ".fasta"')
762
+ additional_input_options.add_argument('-i', '--index', dest='index_db', action='store_true', required=False,
763
+ help='Only index and store databases, i.e., do not perform comparisons.')
764
+ additional_input_options.add_argument('-a', '--all-vs-all', dest='all_vs_all',
765
+ action='store_true', required=False,
766
+ help='Perform all-vs-all comparison, using only query input.')
767
+ additional_input_options.add_argument('--input-paths', dest='input_paths',
768
+ action='store_true', required=False,
769
+ help='The input files are direct paths to the data, not lists of files.')
770
+ misc_options = parser.add_argument_group('Miscellaneous options')
771
+ misc_options.add_argument('--virus', dest='virus', action='store_true', required=False,
772
+ help='Toggle virus-virus comparisons. Use only with viral genomes or proteins.')
773
+ misc_options.add_argument('-t', '--threads', dest='threads', action='store', default=1, type=int, required=False,
774
+ help='Number of threads to use, by default 1')
775
+ misc_options.add_argument('-k', '--keep', dest='keep', action='store_false', required=False,
776
+ help='Keep intermediate files, by default true')
777
+
778
+ args = parser.parse_args()
779
+
780
+ query_genomes = args.query_genomes
781
+ query_proteins = args.query_proteins
782
+ query_hmms = args.query_hmms
783
+ query_database = args.query_database
784
+ if args.all_vs_all:
785
+ reference_genomes = query_genomes
786
+ reference_proteins = query_proteins
787
+ reference_hmms = query_hmms
788
+ reference_database = query_database
789
+ else:
790
+ reference_genomes = args.reference_genomes
791
+ reference_proteins = args.reference_proteins
792
+ reference_hmms = args.reference_hmms
793
+ reference_database = args.reference_database
794
+ output = args.output
795
+ if output == None:
796
+ output == "kaai_comparisons.txt"
797
+ extension = args.extension
798
+ index_db = args.index_db
799
+ threads = args.threads
800
+ keep = args.keep
801
+ virus = args.virus
802
+ input_paths = args.input_paths
803
+
804
+ print("FastAAI started on {}".format(datetime.datetime.now()))
805
+ # Check user input
806
+ # ------------------------------------------------------
807
+ # Check if no query was provided
808
+ if query_genomes == None and query_proteins == None and query_hmms == None and query_database == None:
809
+ exit('Please prove a file with a list of queries, e.g., --qg, --qp, --qh, or --qd)')
810
+ # Check query inputs
811
+ query_input = None
812
+ if query_hmms != None:
813
+ if virus == True:
814
+ exit("If you are comparing viruses, please start from the genome or protein files.")
815
+ query_input = query_hmms
816
+ if query_proteins != None:
817
+ print("Starting from query hmmsearch results.")
818
+ print("You also provided the list of protein files used for hmmsearch.")
819
+ elif query_proteins == None:
820
+ print("You chose to start from pre-computed hmmsearch results for your queries (--qh).")
821
+ print("However, I also need the location of the query proteins used for hmmsearch.")
822
+ exit("Please provide them with --qp.")
823
+ elif query_proteins != None:
824
+ query_input = query_proteins
825
+ print("Starting from query proteins.")
826
+ elif query_genomes != None:
827
+ query_input = query_genomes
828
+ print("Starting from query genomes.")
829
+ elif query_database != None:
830
+ query_input = query_database
831
+ print("Starting from the pre-indexed query database.")
832
+ # Check if no reference was provided
833
+ if reference_genomes == None and reference_proteins == None and reference_hmms == None and reference_database == None:
834
+ exit('Please prove a file with a list of references, e.g., --rg, --rp, --rh, or --rd)')
835
+ # Check reference inputs
836
+ reference_input = None
837
+ if reference_hmms != None:
838
+ if virus == True:
839
+ exit("If you are comparing viruses, please start from the genome or protein files.")
840
+ reference_input = reference_hmms
841
+ if reference_proteins != None:
842
+ print("Starting from reference hmmsearch results.")
843
+ print("You also provided the list of protein files used for hmmsearch.")
844
+ elif reference_proteins == None:
845
+ print("You chose to start from pre-computed hmmsearch results for your references (--rh).")
846
+ print("However, I also need the location of the query proteins used for hmmsearch.")
847
+ exit("Please provide them with --rp.")
848
+ elif reference_proteins != None:
849
+ reference_input = reference_proteins
850
+ print("Starting from reference proteins.")
851
+ elif reference_genomes != None:
852
+ reference_input = reference_genomes
853
+ print("Starting from reference genomes.")
854
+ elif reference_database != None:
855
+ reference_input = reference_database
856
+ print("Starting from the pre-indexed reference database.")
857
+ # ------------------------------------------------------
858
+
859
+ # Create temporal working directory
860
+ temporal_working_directory = TemporaryDirectory()
861
+ # ------------------------------------------------------
862
+
863
+ # Check if queries are the same as references (an all-vs-all comparison)
864
+ # ------------------------------------------------------
865
+ same_inputs = False
866
+ if query_input == reference_input:
867
+ same_inputs = True
868
+ if same_inputs == True:
869
+ print('You specified the same query and reference files.')
870
+ print('I will perform an all vs all comparison :)')
871
+ # ------------------------------------------------------
872
+
873
+ #* Database Parsing is the same regardless of bacterial or viral genomes
874
+ # If using pre-indexed databases, check if they are valid files.
875
+ # ------------------------------------------------------
876
+ # If any of the starting points is from database, then store the
877
+ # kmer structures in the corresponding dictionaries.
878
+ # Otherwise read the file list and get the filenames
879
+ query_kmer_dict = None
880
+ query_kmer_dict_list = []
881
+ reference_kmer_dict = None
882
+ reference_kmer_dict_list = []
883
+ query_database_files = []
884
+ reference_database_files = []
885
+ if query_database != None:
886
+ if input_paths == True:
887
+ query_database_files.append(query_database)
888
+ else:
889
+ with open(query_database) as database_files:
890
+ for db_location in database_files:
891
+ query_database_files.append(db_location)
892
+ if reference_database != None:
893
+ if input_paths == True:
894
+ reference_database_files.append(reference_database)
895
+ else:
896
+ with open(reference_database) as database_files:
897
+ for db_location in database_files:
898
+ reference_database_files.append(db_location)
899
+
900
+ # If starting from database and query == reference
901
+ if same_inputs == True:
902
+ if query_database != None:
903
+ for db_location in query_database_files:
904
+ if Path(db_location.strip()).is_file():
905
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
906
+ temp_dict = pickle.load(database_handle)
907
+ if isinstance(temp_dict,dict):
908
+ query_kmer_dict_list.append(temp_dict)
909
+ #Carlos, this line serves no purpose but does take a bunch of time and mem.
910
+ #print(query_kmer_dict_list)
911
+ else:
912
+ exit("One of the database files appear to have the wrong format. Please provide a correctly formated database.")
913
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
914
+ else:
915
+ # If the inputs are not the same:
916
+ # If query and ref are provided
917
+ if query_database != None and reference_database != None:
918
+ for db_location in query_database_files:
919
+ if Path(db_location.strip()).is_file():
920
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
921
+ temp_dict = pickle.load(database_handle)
922
+ if isinstance(temp_dict,dict):
923
+ query_kmer_dict_list.append(temp_dict)
924
+ else:
925
+ exit("One of the query database files appear to have the wrong format. Please provide a correctly formated database.")
926
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
927
+ for db_location in reference_database_files:
928
+ if Path(db_location.strip()).is_file():
929
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
930
+ temp_dict = pickle.load(database_handle)
931
+ if isinstance(temp_dict,dict):
932
+ reference_kmer_dict_list.append(temp_dict)
933
+ else:
934
+ exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated database.")
935
+ reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
936
+ # If only the query has a db
937
+ elif query_database != None and reference_database == None:
938
+ for db_location in query_database_files:
939
+ if Path(db_location.strip()).is_file():
940
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
941
+ temp_dict = pickle.load(database_handle)
942
+ if isinstance(temp_dict,dict):
943
+ query_kmer_dict_list.append(temp_dict)
944
+ else:
945
+ exit("One of the query database files appear to have the wrong format. Please provide a correctly formated database.")
946
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
947
+ # If only the reference has a db
948
+ elif query_database == None and reference_database != None:
949
+ for db_location in reference_database_files:
950
+ if Path(db_location.strip()).is_file():
951
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
952
+ temp_dict = pickle.load(database_handle)
953
+ if isinstance(temp_dict,dict):
954
+ reference_kmer_dict_list.append(temp_dict)
955
+ else:
956
+ exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated database.")
957
+ reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
958
+ # ------------------------------------------------------
959
+
960
+ # Get files from the query and reference lists and then
961
+ # create a dictionary with resulting filenames and a list with dictionary keys
962
+ # The structure of the dictionary is:
963
+ # original_query, proteins, hmms, filtered_hmms
964
+ # ------------------------------------------------------
965
+ # First parse the query:
966
+ query_list = []
967
+ query_file_names = {}
968
+ # For bacterial genomes
969
+ if virus == False:
970
+ if query_database != None:
971
+ pass
972
+ else:
973
+ if input_paths == True:
974
+ query_list.append(query_input)
975
+ else:
976
+ with open(query_input, 'r') as query_input_fh:
977
+ for line in query_input_fh:
978
+ query_list.append(line.strip())
979
+ for index, query in enumerate(query_list):
980
+ query_name = str(Path(query).name)
981
+ if extension != None:
982
+ query_name = query_name.replace(extension, "")
983
+ if query_hmms != None:
984
+ query_protein_list = []
985
+ with open(query_proteins, 'r') as query_protein_fh:
986
+ for line in query_protein_fh:
987
+ query_protein_list.append(line.strip())
988
+ query_file_names[query_name] = [None, query_protein_list[index], query, query + '.filt']
989
+ elif query_proteins != None:
990
+ query_file_names[query_name] = [None, query, query + '.hmm', query + '.hmm.filt']
991
+ elif query_genomes != None:
992
+ query_file_names[query_name] = [query, query + '.faa', query + '.faa.hmm', query + '.faa.hmm.filt']
993
+ # For viral genomes
994
+ else:
995
+ if query_database != None:
996
+ pass
997
+ else:
998
+ if input_paths == True:
999
+ query_list.append(query_input)
1000
+ else:
1001
+ with open(query_input, 'r') as query_input_fh:
1002
+ for line in query_input_fh:
1003
+ query_list.append(line.strip())
1004
+ for index, query in enumerate(query_list):
1005
+ query_name = str(Path(query).name)
1006
+ if extension != None:
1007
+ query_name = query_name.replace(extension, "")
1008
+ if query_proteins != None:
1009
+ query_file_names[query_name] = [None, query]
1010
+ elif query_genomes != None:
1011
+ query_file_names[query_name] = [query, query + '.faa']
1012
+
1013
+ # Then parse the references:
1014
+ reference_list = []
1015
+ reference_file_names = {}
1016
+ if same_inputs == True:
1017
+ pass
1018
+ else:
1019
+ # For bacterial genomes
1020
+ if virus == False:
1021
+ if reference_database != None:
1022
+ pass
1023
+ else:
1024
+ if input_paths == True:
1025
+ reference_list.append(reference_input)
1026
+ else:
1027
+ with open(reference_input, 'r') as reference_input_fh:
1028
+ for line in reference_input_fh:
1029
+ reference_list.append(line.strip())
1030
+ for index, reference in enumerate(reference_list):
1031
+ reference_name = str(Path(reference).name)
1032
+ if extension != None:
1033
+ reference_name = reference_name.replace(extension, "")
1034
+ if reference_hmms != None:
1035
+ reference_protein_list = []
1036
+ with open(reference_proteins, 'r') as reference_protein_fh:
1037
+ for line in reference_protein_fh:
1038
+ reference_protein_list.append(line.strip())
1039
+ reference_file_names[reference_name] = [None, reference_protein_list[index], reference, reference + '.filt']
1040
+ elif reference_proteins != None:
1041
+ reference_file_names[reference_name] = [None, reference, reference + '.hmm', reference + '.hmm.filt']
1042
+ elif query_genomes != None:
1043
+ reference_file_names[reference_name] = [reference, reference + '.faa', reference + '.faa.hmm', reference + '.faa.hmm.filt']
1044
+ # For viral genomes
1045
+ else:
1046
+ if reference_database != None:
1047
+ pass
1048
+ else:
1049
+ if input_paths == True:
1050
+ reference_list.append(reference_input)
1051
+ else:
1052
+ with open(reference_input, 'r') as reference_input_fh:
1053
+ for line in reference_input_fh:
1054
+ reference_list.append(line.strip())
1055
+ for index, reference in enumerate(reference_list):
1056
+ reference_name = str(Path(reference).name)
1057
+ if extension != None:
1058
+ reference_name = reference_name.replace(extension, "")
1059
+ if reference_proteins != None:
1060
+ reference_file_names[reference_name] = [None, reference]
1061
+ elif query_genomes != None:
1062
+ reference_file_names[reference_name] = [reference, reference + '.faa']
1063
+ # ------------------------------------------------------
1064
+
1065
+ # Pre-index and store databases
1066
+ # ------------------------------------------------------
1067
+ # Pre-index queries
1068
+ if query_kmer_dict == None:
1069
+ print("Processing queries...")
1070
+ # If using bacterial genomes
1071
+ if virus == False:
1072
+ if query_hmms != None:
1073
+ query_hmm_results = query_list
1074
+ elif query_proteins != None:
1075
+ query_protein_files = query_list
1076
+ print("Searching against HMM models...")
1077
+ try:
1078
+ pool = multiprocessing.Pool(threads)
1079
+ query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
1080
+ finally:
1081
+ pool.close()
1082
+ pool.join()
1083
+ elif query_genomes != None:
1084
+ print("Predicting proteins...")
1085
+ # Predict query proteins
1086
+ try:
1087
+ pool = multiprocessing.Pool(threads)
1088
+ query_protein_files = pool.map(run_prodigal, query_list)
1089
+ finally:
1090
+ pool.close()
1091
+ pool.join()
1092
+ print("Done!")
1093
+ print("Searching against HMM models...")
1094
+ # Run hmmsearch against proteins predicted
1095
+ try:
1096
+ pool = multiprocessing.Pool(threads)
1097
+ query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
1098
+ finally:
1099
+ pool.close()
1100
+ pool.join()
1101
+ print("Done!")
1102
+ print("Filtering query hmmsearch results...")
1103
+ # Filter query HMM search results
1104
+ try:
1105
+ pool = multiprocessing.Pool(threads)
1106
+ pool.map(partial(hmm_filter, keep=keep), query_hmm_results)
1107
+ finally:
1108
+ pool.close()
1109
+ pool.join()
1110
+ print("Extracting kmers from query proteins...")
1111
+ # Finding kmers for all queries
1112
+ query_information = []
1113
+ for name, values in query_file_names.items():
1114
+ query_information.append((name, values[1], values[3]))
1115
+ try:
1116
+ pool = multiprocessing.Pool(threads)
1117
+ kmer_results = pool.map(kmer_extract, query_information)
1118
+ finally:
1119
+ pool.close()
1120
+ pool.join()
1121
+ query_kmer_dict = merge_dicts(kmer_results)
1122
+ del kmer_results
1123
+ # If using viral genomes
1124
+ else:
1125
+ if query_genomes != None:
1126
+ print("Predicting proteins...")
1127
+ # Predict query proteins
1128
+ try:
1129
+ pool = multiprocessing.Pool(threads)
1130
+ query_protein_files = pool.map(run_prodigal_virus, query_list)
1131
+ finally:
1132
+ pool.close()
1133
+ pool.join()
1134
+ print("Done!")
1135
+ elif query_proteins != None:
1136
+ query_protein_files = query_list
1137
+ print("Extracting kmers from query proteins...")
1138
+ query_information = []
1139
+ for name, values in query_file_names.items():
1140
+ query_information.append((name, values[1], 4))
1141
+ try:
1142
+ pool = multiprocessing.Pool(threads)
1143
+ kmer_results = pool.map(read_viral_kmers_from_file, query_information)
1144
+ finally:
1145
+ pool.close()
1146
+ pool.join()
1147
+ query_kmer_dict = merge_dicts(kmer_results)
1148
+ del kmer_results
1149
+
1150
+ # Pre-index references (if different from queries)
1151
+ if same_inputs == False and reference_kmer_dict == None:
1152
+ print("Processing references...")
1153
+ # If using bacterial genomes
1154
+ if virus == False:
1155
+ if reference_hmms != None:
1156
+ reference_hmm_results = reference_list
1157
+ elif reference_proteins != None:
1158
+ reference_protein_files = reference_list
1159
+ print("Searching against HMM models... ")
1160
+ try:
1161
+ pool = multiprocessing.Pool(threads)
1162
+ reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
1163
+ finally:
1164
+ pool.close()
1165
+ pool.join()
1166
+ if reference_genomes != None:
1167
+ print("Predicting proteins...")
1168
+ # Predict reference proteins
1169
+ try:
1170
+ pool = multiprocessing.Pool(threads)
1171
+ reference_protein_files = pool.map(run_prodigal, reference_list)
1172
+ finally:
1173
+ pool.close()
1174
+ pool.join()
1175
+ print("Done!")
1176
+ print("Searching against HMM models...")
1177
+ # Run hmmsearch against proteins predicted
1178
+ try:
1179
+ pool = multiprocessing.Pool(threads)
1180
+ reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
1181
+ finally:
1182
+ pool.close()
1183
+ pool.join()
1184
+ print("Done!")
1185
+ print("Filtering reference hmmsearch results...")
1186
+ # Filter reference HMM search results
1187
+ try:
1188
+ pool = multiprocessing.Pool(threads)
1189
+ pool.map(partial(hmm_filter, keep=keep), reference_hmm_results)
1190
+ finally:
1191
+ pool.close()
1192
+ pool.join()
1193
+ print("Extracting kmers from reference proteins...")
1194
+ # Finding kmers for all references
1195
+ reference_information = []
1196
+ for name, values in reference_file_names.items():
1197
+ reference_information.append((name, values[1], values[3]))
1198
+ try:
1199
+ pool = multiprocessing.Pool(threads)
1200
+ kmer_results = pool.map(kmer_extract, reference_information)
1201
+ finally:
1202
+ pool.close()
1203
+ pool.join()
1204
+ reference_kmer_dict = merge_dicts(kmer_results)
1205
+ del kmer_results
1206
+ # If using viral genomes
1207
+ else:
1208
+ if query_genomes != None:
1209
+ print("Predicting proteins...")
1210
+ # Predict query proteins
1211
+ try:
1212
+ pool = multiprocessing.Pool(threads)
1213
+ query_protein_files = pool.map(run_prodigal, query_list)
1214
+ finally:
1215
+ pool.close()
1216
+ pool.join()
1217
+ print("Done!")
1218
+ elif query_proteins != None:
1219
+ query_protein_files = query_list
1220
+ print("Extracting kmers from query proteins...")
1221
+ reference_information = []
1222
+ for name, values in reference_file_names.items():
1223
+ reference_information.append((name, values[1], 4))
1224
+ try:
1225
+ pool = multiprocessing.Pool(threads)
1226
+ kmer_results = pool.map(read_viral_kmers_from_file, reference_information)
1227
+ finally:
1228
+ pool.close()
1229
+ pool.join()
1230
+ reference_kmer_dict = merge_dicts(kmer_results)
1231
+ del kmer_results
1232
+ # ------------------------------------------------------
1233
+
1234
+ # Create the database(s) and compress it (them)
1235
+ # ------------------------------------------------------
1236
+ if same_inputs == True and query_database == None:
1237
+ print("Saving pre-indexed database...")
1238
+ query_database_name = query_input + '.db.gz'
1239
+ with gzip.open(query_database_name, 'wb') as database_handle:
1240
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1241
+ if same_inputs == False and query_database == None and reference_database == None:
1242
+ print("Saving pre-indexed databases...")
1243
+ query_database_name = query_input + '.db.gz'
1244
+ reference_database_name = reference_input + '.db.gz'
1245
+ with gzip.open(query_database_name, 'wb') as database_handle:
1246
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1247
+ with gzip.open(reference_database_name, 'wb') as database_handle:
1248
+ pickle.dump(reference_kmer_dict, database_handle, protocol=4)
1249
+ elif same_inputs == False and query_database == None:
1250
+ print("Saving pre-indexed query database...")
1251
+ query_database_name = query_input + '.db.gz'
1252
+ with gzip.open(query_database_name, 'wb') as database_handle:
1253
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1254
+ elif same_inputs == False and reference_database == None:
1255
+ print("Saving pre-indexed reference database...")
1256
+ reference_database_name = reference_input + '.db.gz'
1257
+ with gzip.open(reference_database_name, 'wb') as database_handle:
1258
+ pickle.dump(reference_kmer_dict, database_handle, protocol=4)
1259
+ # ------------------------------------------------------
1260
+ # Calculate Jaccard distances
1261
+ # ------------------------------------------------------
1262
+ if index_db == True:
1263
+ print("Finished pre-indexing databases.")
1264
+ print("Next time you can run the program using only these files with --qd and(or) --rd.")
1265
+ else:
1266
+ print("Calculating shared kmer fraction...")
1267
+ if virus == False:
1268
+ if same_inputs == True:
1269
+ # Create global kmer index dictionary "global_kmer_index_dictionary"
1270
+ print(temporal_working_directory)
1271
+ global_unique_kmers([query_kmer_dict])
1272
+ query_kmer_dict, query_smart_args_tempdir = transform_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=True)
1273
+ print("Beginning FastAAI pairwise calculations now.")
1274
+ try:
1275
+ pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (query_kmer_dict,))
1276
+ Fraction_Results = pool.map(single_kaai_parser, query_smart_args_tempdir)
1277
+ finally:
1278
+ pool.close()
1279
+ pool.join()
1280
+ else:
1281
+ print(temporal_working_directory)
1282
+ global_unique_kmers([query_kmer_dict, reference_kmer_dict])
1283
+ query_kmer_dict, query_smart_args_tempdir = transform_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=False)
1284
+ reference_kmer_dict, _ref_smart_args_tempdir = transform_kmer_dicts_to_arrays(reference_kmer_dict, temporal_working_directory, single_dataset=False)
1285
+ print("Beginning FastAAI pairwise calculations now.")
1286
+ try:
1287
+ pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
1288
+ Fraction_Results = pool.map(double_kaai_parser, query_smart_args_tempdir)
1289
+ finally:
1290
+ pool.close()
1291
+ pool.join()
1292
+ else:
1293
+ if same_inputs == True:
1294
+ print(temporal_working_directory)
1295
+ global_unique_viral_kmers([query_kmer_dict])
1296
+ query_kmer_dict, query_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=True)
1297
+ print("Beginning FastAAI pairwise calculations now.")
1298
+ try:
1299
+ pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (query_kmer_dict,))
1300
+ Fraction_Results = pool.map(single_virus_kaai_parser, query_smart_args_tempdir)
1301
+ finally:
1302
+ pool.close()
1303
+ pool.join()
1304
+ else:
1305
+ print(temporal_working_directory)
1306
+ global_unique_viral_kmers([query_kmer_dict, reference_kmer_dict])
1307
+ query_kmer_dict, query_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=False)
1308
+ reference_kmer_dict, _ref_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(reference_kmer_dict, temporal_working_directory, single_dataset=False)
1309
+ print("Beginning FastAAI pairwise calculations now.")
1310
+ try:
1311
+ pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
1312
+ Fraction_Results = pool.map(double_viral_kaai_parser, query_smart_args_tempdir)
1313
+ finally:
1314
+ pool.close()
1315
+ pool.join()
1316
+ # ------------------------------------------------------
1317
+
1318
+ # Merge results into a single output
1319
+ # ------------------------------------------------------
1320
+ print("Merging results...")
1321
+ print(temporal_working_directory)
1322
+ with open(output, 'w') as outfile:
1323
+ for file in Fraction_Results:
1324
+ with open(file) as Temp:
1325
+ shutil.copyfileobj(Temp, outfile)
1326
+ file.unlink()
1327
+ print("FastAAI finishied correctly on {}".format(datetime.datetime.now()))
1328
+ # ------------------------------------------------------
1329
+ # If comparing viral genomes
1330
+
1331
+
1332
+
1333
+
1334
+
1335
+ if __name__ == "__main__":  # run main() only when this file is executed as a script, not when it is imported
1336
+ main()