miga-base 1.2.17.0 → 1.2.17.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (299) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +1 -1
  3. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  4. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  5. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  6. data/utils/FastAAI/FastAAI +3659 -0
  7. data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
  8. data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
  9. data/utils/FastAAI/README.md +84 -0
  10. data/utils/enveomics/Docs/recplot2.md +244 -0
  11. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  12. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  13. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  14. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  15. data/utils/enveomics/LICENSE.txt +73 -0
  16. data/utils/enveomics/Makefile +52 -0
  17. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  18. data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
  19. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  20. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  21. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  22. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  23. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  24. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  25. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  26. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  27. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
  28. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  29. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  30. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  31. data/utils/enveomics/Manifest/categories.json +165 -0
  32. data/utils/enveomics/Manifest/examples.json +162 -0
  33. data/utils/enveomics/Manifest/tasks.json +4 -0
  34. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  35. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  36. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  37. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  38. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  39. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  40. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  41. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  42. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  43. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  44. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  45. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  46. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  47. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  48. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  49. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  50. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  51. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  52. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  53. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  54. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  55. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  56. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  57. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  58. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  59. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  60. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  61. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  62. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  63. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  64. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  65. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  66. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  67. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  68. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  69. data/utils/enveomics/README.md +42 -0
  70. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  71. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  72. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  73. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  74. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  75. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  76. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  77. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  78. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  79. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  80. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  81. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  82. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  83. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  84. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  85. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  86. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  87. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  88. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  89. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  90. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  91. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  92. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  93. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
  94. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  95. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  96. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  97. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  98. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  99. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  100. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  101. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  102. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  103. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  104. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  105. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  106. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  107. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  108. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  109. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  110. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  111. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  112. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  113. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  114. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  115. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  116. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  117. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  118. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  119. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  120. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  121. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  122. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  123. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  124. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  125. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  126. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  127. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  128. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  129. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  130. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  131. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  132. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  133. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  134. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  135. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  136. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  137. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  138. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  139. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  140. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  141. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  142. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  143. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  144. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  145. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  146. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  147. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  148. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  149. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  150. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  151. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  152. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  153. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  154. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  155. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  156. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  157. data/utils/enveomics/Scripts/aai.rb +421 -0
  158. data/utils/enveomics/Scripts/ani.rb +362 -0
  159. data/utils/enveomics/Scripts/anir.rb +137 -0
  160. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  161. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  162. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  163. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  164. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  165. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  166. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  167. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  168. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  169. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  170. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  171. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  172. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
  173. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  174. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  175. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  176. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  177. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  178. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  179. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  180. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
  181. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  182. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  183. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  184. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  185. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  186. data/utils/enveomics/Scripts/ogs.rb +104 -0
  187. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  188. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  189. data/utils/enveomics/Scripts/rbm.rb +108 -0
  190. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  191. data/utils/enveomics/Tests/Makefile +10 -0
  192. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  193. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  194. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  195. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  196. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  197. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  198. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  199. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  200. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  201. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  202. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  203. data/utils/enveomics/Tests/alkB.nwk +1 -0
  204. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  205. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  206. data/utils/enveomics/Tests/hiv1.faa +59 -0
  207. data/utils/enveomics/Tests/hiv1.fna +134 -0
  208. data/utils/enveomics/Tests/hiv2.faa +70 -0
  209. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  210. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  211. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  212. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  213. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  214. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  215. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  216. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  217. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  218. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  219. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  220. data/utils/enveomics/build_enveomics_r.bash +45 -0
  221. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  222. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  223. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  224. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  225. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  226. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  227. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  228. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  229. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  230. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  231. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  232. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  233. data/utils/enveomics/enveomics.R/README.md +81 -0
  234. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  235. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  236. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  237. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  238. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  239. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  240. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  241. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  242. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  243. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  244. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  245. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  246. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  247. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  248. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  249. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  250. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  251. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  252. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  253. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  254. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  255. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  256. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  257. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  258. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  259. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  260. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  261. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  262. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  263. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  264. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  265. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  266. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  267. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  268. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  269. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  270. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  271. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  272. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  273. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  274. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  275. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  276. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  277. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  278. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  279. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  280. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  282. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  283. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  284. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  285. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  286. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  287. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  288. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  289. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  290. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  291. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  292. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  293. data/utils/enveomics/globals.mk +8 -0
  294. data/utils/enveomics/manifest.json +9 -0
  295. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  296. data/utils/multitrim/README.md +67 -0
  297. data/utils/multitrim/multitrim.py +1555 -0
  298. data/utils/multitrim/multitrim.yml +13 -0
  299. metadata +301 -5
@@ -0,0 +1,3659 @@
1
+ #!/usr/bin/env python3
2
+
3
+ ################################################################################
4
+ """---0.0 Import Modules---"""
5
+ import subprocess
6
+ import argparse
7
+ import datetime
8
+ import shutil
9
+ import textwrap
10
+ import multiprocessing
11
+ import pickle
12
+ import gzip
13
+ import tempfile
14
+ #Shouldn't play any role.
15
+ #from random import randint
16
+
17
+ #We could probably remove Path, too.
18
+ from pathlib import Path
19
+ #This as well
20
+ from functools import partial
21
+ import time
22
+ from collections import defaultdict
23
+ import sys
24
+ import os
25
+ from math import floor
26
+ import sqlite3
27
+ #numpy dependency
28
+ import numpy as np
29
+ import io
30
+ import random
31
+
32
+
33
+ #Takes a bytestring from the SQL database and converts it to a numpy array.
34
def convert_array(bytestring):
    """Decode a bytestring read from the SQL database into an int32 numpy array."""
    int32_values = np.frombuffer(bytestring, dtype=np.int32)
    return int32_values
36
+
37
def convert_float_array_16(bytestring):
    """Decode a bytestring into a float16 numpy array."""
    half_precision_values = np.frombuffer(bytestring, dtype=np.float16)
    return half_precision_values
39
+
40
def convert_float_array_32(bytestring):
    """Decode a bytestring into a float32 numpy array."""
    single_precision_values = np.frombuffer(bytestring, dtype=np.float32)
    return single_precision_values
42
+
43
def convert_float_array_64(bytestring):
    """Decode a bytestring into a float64 numpy array."""
    double_precision_values = np.frombuffer(bytestring, dtype=np.float64)
    return double_precision_values
45
+
46
+
47
+ #Iterator for agnostic reader
48
class agnostic_reader_iterator:
    """Line-by-line iterator over the handle of an agnostic reader.

    ``reader`` must expose ``handle`` (an open file object) and ``is_gz``
    (True when ``handle`` yields bytes that need decoding). Every line is
    returned as ``str``; iteration stops at the first empty read (EOF).
    """

    def __init__(self, reader):
        self.handle_ = reader.handle
        self.is_gz_ = reader.is_gz

    def __iter__(self):
        #An iterator must return itself from __iter__; without this,
        #iter(it), list(it) and "for line in it" raise TypeError.
        return self

    def __next__(self):
        line = self.handle_.readline()
        if self.is_gz_:
            #The gzip handle is read in binary mode, so decode to text.
            line = line.decode()

        #readline() returns an empty string only at EOF.
        if not line:
            raise StopIteration
        return line
64
+
65
+ #File reader that doesn't care if you give it a gzipped file or not.
66
class agnostic_reader:
    """File reader that doesn't care if you give it a gzipped file or not.

    The first two bytes of ``file`` are compared against the gzip magic
    number. A gzipped file is opened with ``gzip.open`` (binary mode),
    anything else with the builtin ``open`` (text mode). The open handle
    is stored in ``self.handle`` and the detection result in ``self.is_gz``.

    The reader can be used as a context manager so that the handle is
    closed even when an exception is raised while reading.
    """

    def __init__(self, file):
        self.path = file

        with open(file, 'rb') as test_gz:
            #Gzip magic number
            self.is_gz = (test_gz.read(2) == b'\x1f\x8b')

        if self.is_gz:
            self.handle = gzip.open(self.path)
        else:
            self.handle = open(self.path)

    def __iter__(self):
        return agnostic_reader_iterator(self)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        #Never suppress an exception raised inside the with-block.
        return False

    def close(self):
        """Close the underlying file handle."""
        self.handle.close()
86
+
87
+ #FastAAI database class. This is the final database
88
class fastaai_database:
    """FastAAI database class. This is the final (parent) database.

    Wraps the SQLite file at ``path``: the main connection/cursor, an
    optional second connection to a child database, and in-memory copies of
    the ``genome_index`` and ``genome_acc_kmer_counts`` metadata tables.
    """

    #Tables holding metadata rather than per-accession kmer data.
    _METADATA_TABLES = ('genome_acc_kmer_counts', 'genome_index')

    def __init__(self, path):
        self.path = path
        self.exists = os.path.exists(path)

        self.child = None
        self.connection = None
        self.cursor = None

        self.child_connection = None
        self.child_cursor = None

        self.accessions = None

        #gak stands for 'genome_accession_kmer_counts'
        self.gak = None
        self.genome_index = None
        #Go from index to name
        self.reverse_genome_index = None
        self.protein_counts_by_genome = None

        self.verbosity = False

    #Open an SQL connection
    def activate_connection(self, with_converter = True):
        """Open the database at ``self.path`` and create a cursor.

        With ``with_converter`` set, columns declared as ``array`` are
        converted through ``convert_array`` on read; otherwise they are
        returned as stored.
        """
        if with_converter:
            #Converts byte string to numpy ndarray(int32) upon read from DB.
            sqlite3.register_converter("array", convert_array)
            self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        else:
            self.connection = sqlite3.connect(self.path)

        self.cursor = self.connection.cursor()
        self.exists = True

    #Close an SQL connection
    def close_connection(self):
        """Close cursor and connection; a no-op when nothing is open."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        #True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def _create_table_if_missing(self, table, create_sql):
        #Check sqlite_master for the table and create it when absent.
        self.cursor.execute("SELECT count(name) FROM sqlite_master WHERE type='table' AND name=?", (table,))
        if self.cursor.fetchone()[0] != 1:
            self.cursor.execute(create_sql)
            self.connection.commit()

    def initialize_parent_database(self):
        """Add the two metadata tables to an activated database if needed.

        Prints a reminder and does nothing when the database has not been
        activated.
        """
        #self.exists is also True for a file that merely exists on disk, so
        #the cursor has to be checked as well before it is used.
        if not self.exists or self.cursor is None:
            print("I need to be activated first!")
            return

        self._create_table_if_missing(
            'genome_index',
            '''CREATE TABLE genome_index
            (genome text, gen_id INTEGER PRIMARY KEY, protein_count INTEGER)''')
        self._create_table_if_missing(
            'genome_acc_kmer_counts',
            '''CREATE TABLE genome_acc_kmer_counts
            (genome INTEGER, accession INTEGER, count INTEGER)''')

    #Access an existing child database
    def activate_child_connection(self, child):
        """Open a second connection to the database file ``child``.

        Prints a message and leaves the child attributes untouched when the
        file does not exist.
        """
        #Don't try to connect unless it exists. This should never fail.
        if os.path.exists(child):
            self.child = child
            self.child_connection = sqlite3.connect(self.child, detect_types=sqlite3.PARSE_DECLTYPES)
            self.child_cursor = self.child_connection.cursor()
        else:
            print("Child database:", child, "not found!")

    #Close access to the child DB
    def close_child_connection(self):
        """Close and forget the child connection, if one is open."""
        if self.child_cursor is not None:
            self.child_cursor.close()
            if self.child_connection is not None:
                self.child_connection.close()
            self.child_cursor = None
            self.child_connection = None
            self.child = None

    def add_child_to_parent(self, acc, child_db, remove = True, selected_kmers = None, genomes_too = False, just_genomes = False, update_gak = False):
        """Merge the tables for accession ``acc`` from ``child_db`` into this database.

        acc            -- SQL-friendly accession; used verbatim as a table
                          name, so it must be a trusted identifier.
        child_db       -- path of the child database file to attach.
        remove         -- delete ``child_db`` from disk after merging.
        selected_kmers -- when given, only rows whose kmer is listed are
                          copied into the ``acc`` table.
        genomes_too    -- also copy the ``<acc>_genomes`` table.
        just_genomes   -- copy only the ``<acc>_genomes`` table.
        update_gak     -- when genomes are copied, also refresh
                          ``genome_acc_kmer_counts`` for this accession.
        """
        with_genomes = genomes_too or just_genomes
        genomes_table = acc + "_genomes"

        if not just_genomes:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS " + acc + " (kmer INTEGER PRIMARY KEY, genomes array)")
            self.connection.commit()

        if with_genomes:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS " + genomes_table + " (genome INTEGER PRIMARY KEY, kmers array)")
            self.connection.commit()

        add = "INSERT OR REPLACE INTO " + acc + " SELECT * FROM toMerge." + acc
        if selected_kmers is not None:
            add += " WHERE kmer in ({kmers})".format(kmers = ','.join(['?']*len(selected_kmers)))

        #NOTE(review): the gak update is treated as nested under the genome
        #copy, as the statement grouping of the original suggests - confirm.
        gak_sql = None
        if with_genomes and update_gak:
            #The accession index is only needed here, so it is built lazily.
            #NOTE(review): assumes generate_accessions_index() is a plain lookup-table builder - confirm.
            accession_index = generate_accessions_index()
            sql_acc_num = accession_index[acc.replace("_", ".")]
            #Return num bytes, which is always 4*as many as there are entries, as the dtype is int32. See unique_kmers.
            gak_sql = 'INSERT OR REPLACE INTO genome_acc_kmer_counts SELECT genome, ' + str(sql_acc_num) + ', length(kmers)/4 FROM toMerge.' + genomes_table

        #The file name is bound as a parameter so that paths containing
        #quote characters cannot break (or alter) the ATTACH statement.
        self.cursor.execute("ATTACH DATABASE ? AS toMerge", (str(child_db),))
        self.connection.commit()

        if not just_genomes:
            if selected_kmers is not None:
                self.cursor.execute(add, selected_kmers)
            else:
                self.cursor.execute(add)
            self.connection.commit()

        if with_genomes:
            self.cursor.execute("INSERT OR REPLACE INTO " + genomes_table + " SELECT * FROM toMerge." + genomes_table)
            self.connection.commit()
            if gak_sql is not None:
                self.cursor.execute(gak_sql)
                self.connection.commit()

        self.cursor.execute("DETACH DATABASE toMerge")
        self.connection.commit()

        if remove:
            os.remove(child_db)

    def add_genomes_first(self, accession, kmer_dict):
        """Store one row per genome in the ``<accession>_genomes`` table.

        ``kmer_dict`` maps a genome key to an object with a ``tobytes()``
        method (a numpy array in practice); the bytes are what gets stored.
        Dots in ``accession`` are replaced by underscores to form the table
        name, and that SQL-friendly name is returned.
        """
        kmer_lists = [(genome, kmers.tobytes()) for genome, kmers in kmer_dict.items()]

        sql_friendly_accession = accession.replace(".", "_")
        genomes_table = sql_friendly_accession + "_genomes"

        self.cursor.execute("CREATE TABLE IF NOT EXISTS " + genomes_table + " (genome INTEGER PRIMARY KEY, kmers array)")
        self.connection.commit()

        self.cursor.executemany("INSERT OR REPLACE INTO " + genomes_table + " VALUES (?, ?) ", kmer_lists)
        self.connection.commit()

        return sql_friendly_accession

    def load_genome_index(self):
        """Load the ``genome_index`` table into three dictionaries.

        Fills ``genome_index`` (name -> id), ``reverse_genome_index``
        (id -> name) and ``protein_counts_by_genome`` (id -> protein count).
        """
        self.genome_index = {}
        self.reverse_genome_index = {}
        self.protein_counts_by_genome = {}

        sql_command = "SELECT genome, gen_id, protein_count FROM genome_index"

        for genome, gen_id, protein_count in self.cursor.execute(sql_command).fetchall():
            self.genome_index[genome] = gen_id
            self.reverse_genome_index[gen_id] = genome
            self.protein_counts_by_genome[gen_id] = protein_count

    def _collect_gak_rows(self, gak, sql_command, parameters = ()):
        #Fold (genome, accession, count) rows into the nested gak dict.
        for genome, accession, kmer_ct in self.cursor.execute(sql_command, parameters).fetchall():
            gak[genome][accession] = kmer_ct

    def load_accessions(self, permitted_genomes = None, permitted_accessions = None):
        """Load ``genome_acc_kmer_counts`` into ``self.gak`` and ``self.accessions``.

        ``self.gak`` becomes ``{genome: {accession: kmer_count}}`` and
        ``self.accessions`` a tuple of the accessions seen. Either filter
        restricts the rows that are read; with both given the two result
        sets are merged. With neither, the whole table is read.
        """
        gak = defaultdict(dict)

        #It's possible to do both of these. Don't.
        if permitted_genomes is not None:
            #data type is very important to SQL
            sql_friendly = [int(genome) for genome in permitted_genomes]
            sql_command = "SELECT * FROM genome_acc_kmer_counts WHERE genome IN ({genomes})".format(genomes=','.join(['?']*len(sql_friendly)))
            self._collect_gak_rows(gak, sql_command, sql_friendly)

        if permitted_accessions is not None:
            wanted = list(permitted_accessions)
            sql_command = "SELECT * FROM genome_acc_kmer_counts WHERE accession IN ({accessions})".format(accessions=','.join(['?']*len(wanted)))
            self._collect_gak_rows(gak, sql_command, wanted)

        #Normal case
        if permitted_accessions is None and permitted_genomes is None:
            self._collect_gak_rows(gak, "SELECT * FROM genome_acc_kmer_counts")

        #un-defaultdict
        self.gak = dict(gak)

        seen_accessions = set()
        for per_genome in self.gak.values():
            seen_accessions.update(per_genome)

        self.accessions = tuple(seen_accessions)

    def just_accessions(self):
        """Set ``self.accessions`` from the accession tables present in the database.

        Every table that is neither a ``*_genomes`` table nor a metadata
        table is taken to be an accession table; its name is mapped back to
        an accession index through ``generate_accessions_index()``.
        """
        converter = generate_accessions_index()
        acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
        tables = [item[0] for item in self.cursor.execute(acc_sql).fetchall()]

        #Single filtering pass; missing metadata tables are tolerated.
        accession_tables = [
            table for table in tables
            if not table.endswith('_genomes') and table not in self._METADATA_TABLES
        ]

        #Back to indicies.
        self.accessions = tuple(converter[table.replace('_', '.')] for table in accession_tables)

    def unload_genomes_and_accessions(self):
        """Drop the in-memory copies of the metadata tables."""
        self.gak = None
        self.genome_index = None
        #Go from index to name
        self.reverse_genome_index = None
        self.protein_counts_by_genome = None
343
+
344
+ #Child database class. This is only used during database builds and merges. Designed to take one single accession at a time and produce a correctly formatted table of kmers and accessions.
345
class child_database:
    """Scratch SQLite database used during database builds and merges.

    Takes one accession at a time and writes a table of k-mers (or genomes)
    for it, optionally merging with the rows already stored for that
    accession in an existing parent database.
    """

    #Number of bound parameters sent per query. SQLite rejects statements with
    #more variables than its compile-time limit (999 on older builds), so the
    #parent lookup in add_accession is issued in chunks of this size.
    _MAX_SQL_VARIABLES = 900

    def __init__(self, path, parent):
        """Record the child and parent paths; no connection is opened here."""
        self.path = path
        self.exists = False

        self.parent = parent
        self.parent_exists = os.path.exists(parent)

        self.connection = None
        self.cursor = None

        self.parent_connection = None
        self.parent_cursor = None

        self.verbosity = False

    #Open an SQL connection
    def activate_child_connection(self):
        """Open the child database and register the "array" column converter."""
        # Converts TEXT to np.array when selecting
        sqlite3.register_converter("array", convert_array)

        self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        self.cursor = self.connection.cursor()
        self.exists = True

    #Close an SQL connection
    def close_child_connection(self):
        """Close the child connection; safe to call when it is not open."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        #True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def initialize_child_database(self):
        """Create the genome_index and genome_acc_kmer_counts tables if absent."""
        if not self.exists:
            print("I need to be activated first!")
            return

        #DB exists. Add metadata tables.
        self.cursor.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        self.connection.commit()

        self.cursor.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
        self.connection.commit()

    #Access an existing master database
    def activate_parent_connection(self):
        """Open the parent database if its file exists; otherwise do nothing."""
        if os.path.exists(self.parent):
            self.parent_exists = True
            # Converts TEXT to np.array when selecting
            sqlite3.register_converter("array", convert_array)
            self.parent_connection = sqlite3.connect(self.parent, detect_types=sqlite3.PARSE_DECLTYPES)
            self.parent_cursor = self.parent_connection.cursor()

    #Close access to master DB
    def close_parent_connection(self):
        """Close the parent connection if one is open."""
        if self.parent_cursor is not None:
            self.parent_cursor.close()
            self.parent_connection.close()
            self.parent_cursor = None
            self.parent_connection = None

    def add_genomes_first(self, accession, kmer_lists):
        """(Re)create "<accession>_genomes" and fill it from kmer_lists.

        kmer_lists is passed straight to executemany, so it must be an
        iterable of two-item rows (genome, kmers).
        Returns the accession with "." replaced by "_", as used in table names.
        """
        sql_friendly_accession = accession.replace(".", "_")
        table = sql_friendly_accession + "_genomes"

        self.cursor.execute(" DROP TABLE IF EXISTS " + table)
        self.cursor.execute("CREATE TABLE " + table + " (genome INTEGER PRIMARY KEY, kmers array)")
        self.connection.commit()

        self.cursor.executemany(" INSERT INTO " + table + " VALUES (?, ?) ", kmer_lists)
        self.connection.commit()

        return sql_friendly_accession

    def add_accession(self, accession, insert_kmers):
        """(Re)create the table for one accession and fill it with k-mer rows.

        insert_kmers maps a k-mer to a numpy array; each array is stored via
        tobytes(). When the parent database already has a table for this
        accession, the parent's array for every k-mer being inserted is merged
        into insert_kmers (in place) with np.union1d first, so the new row can
        replace the parent's row without losing data.

        Returns the accession with "." replaced by "_", as used in table names.
        """
        sql_friendly_accession = accession.replace(".", "_")

        if self.parent_exists:
            #Check to see if this acc. is already in parent DB
            table_exists = (self.parent_cursor.execute(" SELECT count(name) FROM sqlite_master WHERE type='table' AND name=(?)", (sql_friendly_accession,)).fetchone()[0] == 1)
            if table_exists:
                #Convert the kmers in the current insert list to the correct type for sql to match them
                selection = [int(key) for key in insert_kmers]

                #Only rows whose kmer is about to be inserted need merging; fetch them in
                #chunks so the statement never exceeds SQLite's bound-variable limit.
                for start in range(0, len(selection), self._MAX_SQL_VARIABLES):
                    chunk = selection[start:start + self._MAX_SQL_VARIABLES]
                    search_command = "SELECT * FROM " + sql_friendly_accession + " WHERE kmer IN (" + ",".join("?" * len(chunk)) + ")"
                    for item in self.parent_cursor.execute(search_command, chunk).fetchall():
                        k = item[0]
                        if k in insert_kmers:
                            insert_kmers[k] = np.union1d(insert_kmers[k], item[1])

        #Translate the ndarray into its constituent byte data
        formatted_kmers = [(int(kmer), insert_kmers[kmer].tobytes()) for kmer in insert_kmers]

        #Remove the child if it exists - it shouldn't ever exist because these child DBs should be deleted upon being added to the parent, but might if a run was stopped halfway.
        self.cursor.execute(" DROP TABLE IF EXISTS " + sql_friendly_accession)
        self.cursor.execute("CREATE TABLE " + sql_friendly_accession + " (kmer INTEGER PRIMARY KEY, genomes array)")
        self.connection.commit()

        self.cursor.executemany(" INSERT INTO " + sql_friendly_accession + " VALUES (?, ?) ", formatted_kmers)
        self.connection.commit()

        return sql_friendly_accession
489
+
490
+
491
+ #Holds partial results for calculating AAI.
492
class calculation_database:
    """SQLite database holding partial results for calculating AAI."""

    def __init__(self, path, precision):
        """Record the path and precision ("low", "med" or "high"); nothing is opened here."""
        self.path = path
        self.exists = False

        self.connection = None
        self.cursor = None

        self.genomes = None

        self.verbosity = False

        self.precision = precision

    #Open an SQL connection
    def activate_connection(self):
        """Open the database and register the "array" converter for self.precision.

        Any precision other than "low", "med" or "high" registers no converter.
        """
        # Converts TEXT to np.array when selecting
        #The three precisions are mutually exclusive, so only one branch can apply.
        if self.precision == "low":
            sqlite3.register_converter("array", convert_float_array_16)
        elif self.precision == "med":
            sqlite3.register_converter("array", convert_float_array_32)
        elif self.precision == "high":
            sqlite3.register_converter("array", convert_float_array_64)

        self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        self.cursor = self.connection.cursor()
        self.exists = True

    #Close an SQL connection
    def close_connection(self):
        """Close the connection; safe to call when it is not open."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        #True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def initialize_database(self):
        """Drop and recreate the jaccards table; requires an activated connection."""
        if not self.exists:
            print("I need to be activated first!")
            return

        #DB exists. Add metadata tables.
        self.cursor.execute("DROP TABLE IF EXISTS jaccards")
        self.connection.commit()
        self.cursor.execute("CREATE TABLE jaccards (genome INTEGER PRIMARY KEY, jaccards array)")
        self.connection.commit()
542
+
543
+ '''
544
+ Class for handling all of the raw genome/protein/protein+HMM file inputs when building a database.
545
+
546
+ Takes a file or files and processes them from genome -> protein, protein -> hmm, prot+HMM -> kmerized protein best hits as numpy int arrays according to the kmer_index
547
+
548
+ '''
549
class input_file:
    """One raw input (genome, protein, or protein + HMM) for a database build.

    An instance is advanced by preprocess() through the stages
    genome -> protein (prodigal), protein -> HMM (hmmsearch), and
    protein + HMM -> per-accession best-hit k-mer arrays.
    """

    def __init__(self, input_path, output, verbosity):
        """Record paths and names for the input; no files are processed here."""
        #starting path for the file; irrelevant for protein and hmm, but otherwise useful for keeping track.
        self.path = input_path
        #Output directory starts with this. The full path is kept: using only its
        #basename would drop parent directories (and turn "out/" into "/").
        self.output = os.path.normpath(output)
        #For printing file updates, this is the input name
        self.name = os.path.basename(input_path)
        #original name is the key used for the genomes index later on.
        self.original_name = os.path.basename(input_path)
        #This is the name that can be used for building files with new extensions.
        if input_path.endswith(".gz"):
            #Remove .gz first to make names consistent.
            self.basename = os.path.splitext(os.path.basename(input_path[:-3]))[0]
        else:
            self.basename = os.path.splitext(os.path.basename(input_path))[0]
        #'genome' or 'protein' or 'protein and HMM'
        self.status = None
        #These will keep track of paths for each stage of file for us.
        self.genome = None
        self.protein = None
        self.hmm = None

        self.best_hits = None
        self.best_hits_kmers = None

        self.protein_count = 0
        self.protein_kmer_count = {}

        self.trans_table = None
        self.start_time = None
        self.end_time = None
        self.err_log = ""
        #doesn't get updated otw.
        self.initial_state = "protein+HMM"

        self.verbose = verbosity

        self.hmm_path = None
        try:
            #Try to locate the data bundled as it would be with a pip/conda install.
            script_path = os.path.dirname(sys.modules['fastAAI_HMM_models'].__file__)
            self.hmm_path = str(script_path + '/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm')
            #Check that the file exists or fail to the except.
            with open(self.hmm_path):
                pass
        except Exception:
            #Look in the same dir as the script; old method/MiGA friendly
            script_dir = Path(__file__).parent
            self.hmm_path = str(script_dir / "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm")

    #Functions for externally setting status and file paths of particular types
    def set_genome(self, path):
        """Declare this input to be a genome located at path."""
        self.status = 'genome'
        self.genome = path

    def set_protein(self, path):
        """Declare this input to be a protein file located at path."""
        self.status = 'protein'
        self.protein = path

    def set_hmm(self, path):
        """Attach an HMM search result; warns if no protein has been set yet."""
        if self.protein is None:
            print("Warning! I don't have a protein yet, so this HMM will be useless to me until I do!")
        self.status = 'protein and hmm'
        self.hmm = path

    @staticmethod
    def _count_residues(fasta_path):
        """Return the total stripped length of all non-header lines in a FASTA file."""
        total = 0
        with open(fasta_path, 'r') as handle:
            for line in handle:
                if not line.startswith(">"):
                    total += len(line.strip())
        return total

    @staticmethod
    def _write_cleaned_protein(source_path, destination_path):
        """Copy a FASTA file, removing every '*' from its sequence lines."""
        with open(source_path, 'r') as source, open(destination_path, "w") as destination:
            for line in source:
                if not line.startswith(">"):
                    line = line.replace('*', '')
                destination.write(line)

    #Runs prodigal, compares translation tables and stores faa files
    def genome_to_protein(self):
        """Predict proteins with prodigal under tables 11 and 4 and keep the better one.

        Table 4 wins when it yields at least 1.1x the residues of table 11.
        The cleaned result is written to <output>/predicted_proteins/<basename>.faa
        and registered through set_protein().
        """
        if self.genome is None:
            print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
            return

        folder = Path(self.output + "/predicted_proteins")
        protein_output = folder / (self.basename + '.faa')
        output_11 = folder / (self.basename + '.faa.11')
        output_4 = folder / (self.basename + '.faa.4')
        temp_output = folder / (self.basename + '.temp')

        intermediate = folder / (self.basename + '_genome_intermediate.fasta')

        genome_parser = agnostic_reader(self.genome)
        was_gz = genome_parser.is_gz
        try:
            if was_gz:
                #File was a gzip; decompress it to an intermediate file and then run prodigal; delete after
                with open(intermediate, "w") as midpoint:
                    for line in genome_parser:
                        midpoint.write(line)
            else:
                #File is already unzipped, just point to it
                intermediate = self.genome
        finally:
            genome_parser.close()

        subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_11), "-q", "-o", str(temp_output)])
        subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_4), "-g", "4", "-q", "-o", str(temp_output)])

        #We can get rid of the temp file immediately, we won't be using it
        temp_output.unlink()
        if was_gz:
            #If the file was copied, delete. Otw. this would delete the input and we don't want that.
            intermediate.unlink()

        # Compare translation tables
        length_4 = self._count_residues(output_4)
        length_11 = self._count_residues(output_11)

        #Select the winning translation table and remove the other.
        if length_11 == 0:
            #Avoid dividing by zero: table 4 can only win if it produced anything at all.
            use_table_4 = length_4 > 0
        else:
            use_table_4 = (length_4 / length_11) >= 1.1

        if use_table_4:
            winner, loser = output_4, output_11
            self.trans_table = "4"
        else:
            winner, loser = output_11, output_4
            self.trans_table = "11"

        loser.unlink()

        #Clean the winning output.
        self._write_cleaned_protein(winner, protein_output)

        # Remove the winning intermediate file, since we have the cleaned output
        winner.unlink()

        self.set_protein(str(protein_output))

    def _write_protein_record(self, handle, header, sequence, line_ct):
        """Write one protein record, or log it to err_log when it is too long.

        sequence still contains its newline characters; line_ct is how many
        sequence lines it spans, used only to report the residue count.
        """
        if len(sequence) == 0:
            return
        if len(sequence) < 100000:
            handle.write(header)
            handle.write(sequence)
        else:
            self.err_log += "Protein " + header.strip().split()[0][1:] + " was observed to have >100K amino acids ( " + str(len(sequence) - line_ct) + " AA found ). It was skipped. "

    #run hmmsearch on a protein
    def protein_to_hmm(self):
        """Run hmmsearch on this input's proteins and register the table output.

        Proteins of 100000 or more characters (newlines included) are left out
        of the search and noted in err_log. The result is written to
        <output>/hmms/<basename>.hmm and registered through set_hmm().
        """
        if self.protein is None:
            print(self.name, "wasn't a declared as a protein! I can't make this into an HMM!")
            return

        folder = Path(self.output + "/hmms")

        hmm_output = folder / (self.basename + '.hmm')
        temp_output = folder / (self.basename + '.temp')

        intermediate = folder / (self.basename + '_protein_intermediate.faa')

        current_protein = ""
        current_seq = ""
        #Keeps track of \n chars in the protein sequences.
        line_ct = 0

        protein_parser = agnostic_reader(self.protein)
        try:
            with open(intermediate, "w") as midpoint:
                for line in protein_parser:
                    if line.startswith(">"):
                        self._write_protein_record(midpoint, current_protein, current_seq, line_ct)
                        current_protein = line
                        current_seq = ""
                        line_ct = 0
                    else:
                        line_ct += 1
                        current_seq += line

                #Finally, last prot
                self._write_protein_record(midpoint, current_protein, current_seq, line_ct)
        finally:
            protein_parser.close()

        #Use the model located in __init__ (bundled install first, script directory otherwise).
        subprocess.call(["hmmsearch", "--tblout", str(hmm_output), "-o", str(temp_output), "--cut_tc", "--cpu", "1",
                         str(self.hmm_path), str(intermediate)])

        temp_output.unlink()
        intermediate.unlink()

        self.set_hmm(str(hmm_output))

    def _record_best_hit(self, accession, sequence):
        """Store the unique 4-mers of one best-hit protein under its accession."""
        kmer_set = unique_kmers(sequence, 4)
        self.protein_kmer_count[accession] = kmer_set.shape[0]
        self.protein_count += 1
        self.best_hits_kmers[accession] = kmer_set

    def prot_and_hmm_to_besthits(self):
        """Select one best-hit protein per accession and k-merize those proteins.

        Reads columns 0, 3 and 8 of the hmmsearch table (protein, accession,
        score), keeps the top-scoring accession per protein and then one
        protein per accession, and fills best_hits, best_hits_kmers,
        protein_kmer_count and protein_count.
        """
        prots = []
        accs = []
        scores = []
        f = agnostic_reader(self.hmm)
        try:
            for line in f:
                if line.startswith("#"):
                    continue
                segs = line.strip().split()
                if len(segs) < 9:
                    #Blank or truncated line; nothing usable on it.
                    continue
                prots.append(segs[0])
                accs.append(segs[3])
                scores.append(segs[8])
        finally:
            f.close()

        hmm_file = np.transpose(np.array([prots, accs, scores]))

        #Sort the hmm file based on the score column in descending order.
        hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

        #Identify the first row where each gene name appears, after sorting by score;
        #in effect, return the highest scoring assignment per gene name
        #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
        hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

        #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession.
        #Don't sort the indices, we don't care about the scores anymore.
        hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

        self.best_hits = dict(zip(hmm_file[:,0], hmm_file[:,1]))

        self.best_hits_kmers = {}
        current_seq = ""
        current_prot = ""
        is_besthit = False

        prot = agnostic_reader(self.protein)
        try:
            for line in prot:
                if line.startswith(">"):
                    if len(current_seq) > 0:
                        self._record_best_hit(current_prot, current_seq)
                    #Select the best hit accession for this protein and just record that. We do not care about the names of the proteins.
                    current_prot = line[1:].strip().split(" ")[0]
                    if current_prot in self.best_hits:
                        current_prot = self.best_hits[current_prot]
                        is_besthit = True
                    else:
                        is_besthit = False
                    current_seq = ""
                elif is_besthit:
                    current_seq += line.strip()
        finally:
            prot.close()

        #Final iter. doesn't happen otw. By now current_prot holds the accession, not the
        #protein name, so membership in best_hits cannot be used to detect a pending record.
        if is_besthit and len(current_seq) > 0:
            self._record_best_hit(current_prot, current_seq)

        self.status = "finished preprocessing"

    def preprocess(self):
        """Advance this input through every remaining stage and record timing.

        initial_state keeps the stage the input started from; start_time and
        end_time are set to "N/A" when nothing had to be run.
        """
        #There's no advancement stage for protein and HMM
        if self.status == 'genome':
            if self.start_time is None:
                self.start_time = curtime()

            if self.initial_state == "protein+HMM":
                self.initial_state = "genome"

            self.genome_to_protein()

        if self.status == 'protein':
            if self.start_time is None:
                self.start_time = curtime()

            if self.initial_state == "protein+HMM":
                self.initial_state = "protein"

            self.protein_to_hmm()

        if self.status == 'protein and hmm':
            if self.start_time is None:
                self.start_time = curtime()

            self.prot_and_hmm_to_besthits()

        #Add an end time if any stage ran.
        if self.start_time is not None:
            self.end_time = curtime()
        else:
            #Nothing ran, so there was no runtime.
            self.start_time = "N/A"
            self.end_time = "N/A"

        #Protein not generated on this run.
        if self.trans_table is None:
            self.trans_table = "unknown"

    '''
    Viral functions
    '''
    #No translation table comparison for viruses. Slightly reduced logic.
    def viral_genome_to_protein(self):
        """Predict proteins with prodigal in meta mode and store the cleaned result."""
        if self.genome is None:
            print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
            return

        folder = Path(self.output + "/predicted_proteins")
        intermediate_protein_output = folder / (self.basename + '.intermediate.faa')
        final_protein_output = folder / (self.basename + '.faa')
        temp_output = folder / (self.basename + '.temp')

        subprocess.call(["prodigal", "-i", str(self.genome), "-a", str(intermediate_protein_output), "-p", "meta", "-q", "-o", str(temp_output)])

        # Remove intermediate files
        temp_output.unlink()

        self._write_cleaned_protein(intermediate_protein_output, final_protein_output)

        intermediate_protein_output.unlink()

        #The cleaned file is the one later stages must read.
        self.protein = str(final_protein_output)
        self.status = 'protein'
953
+
954
+
955
+ '''
956
+ Preprocessing functions
957
+
958
+ Read directories, advance files to hmms as needed.
959
+ '''
960
+ #Toy function for passing to a pool
961
def do_advance(input_file_object):
    """Pool worker: run preprocess() on one input and hand the same object back."""
    advanced = input_file_object
    advanced.preprocess()
    return advanced
964
+
965
def initialize_preproc(index):
    """Pool initializer: publish index as the module-level kmer_index."""
    globals()["kmer_index"] = index
968
+
969
+ #Function which takes an input list
970
def advance_inputs(genomes = None, proteins = None, hmms = None, genomes_file = None, proteins_file = None, hmms_file = None, output = "FastAAI", threads = 1, verbose = False, db_name = ""):
    """Collect the inputs, preprocess them in a worker pool, and log the run.

    genomes/proteins/hmms are directories (entries are taken in sorted order);
    genomes_file/proteins_file/hmms_file are text files with one path per line.
    HMMs are paired positionally with the inputs gathered before them.

    Returns the list of preprocessed input_file objects, or None when the HMM
    and protein counts do not match. One row per input is appended to
    <output>/logs/<db_name stem>_preprocessing_log.txt.
    """
    inputs = []
    hmm_broke = False

    def read_list_file(list_file):
        #One path per line; blank lines carry no path and are ignored.
        fh = agnostic_reader(list_file)
        try:
            cleaned = [line.strip() for line in fh]
        finally:
            fh.close()
        return [path for path in cleaned if path]

    def list_directory(directory):
        #Sort is used to ensure lexicographic ordering.
        return [os.path.normpath(directory + "/" + entry) for entry in sorted(os.listdir(directory))]

    def keep_existing(paths, from_list_file):
        kept = []
        for path in paths:
            if os.path.exists(path):
                kept.append(path)
            elif from_list_file:
                print("I can't find file", path, "Are you sure this file exists and can be found from your current directory using the path you supplied in the input file?")
            else:
                print("I can't find", path, "Are you sure this file exists in the directory you supplied?")
        return kept

    def register(paths, setter_name):
        for path in paths:
            current_file = input_file(path, output, verbose)
            getattr(current_file, setter_name)(path)
            inputs.append(current_file)

    def pair_hmms(hmm_pairs, mismatch_message):
        if len(hmm_pairs) != len(inputs):
            print(mismatch_message)
            return False
        for h, i in zip(hmm_pairs, inputs):
            i.set_hmm(h)
        return True

    if genomes_file is not None:
        register(keep_existing(read_list_file(genomes_file), True), "set_genome")

    if proteins_file is not None:
        register(keep_existing(read_list_file(proteins_file), True), "set_protein")

    if hmms_file is not None:
        if not pair_hmms(keep_existing(read_list_file(hmms_file), True),
                         "Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These pairs must be in the same order in your input file!"):
            hmm_broke = True

    if genomes is not None:
        register(keep_existing(list_directory(genomes), False), "set_genome")

    if proteins is not None:
        register(keep_existing(list_directory(proteins), False), "set_protein")

    if hmms is not None:
        if not pair_hmms(keep_existing(list_directory(hmms), False),
                         "Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These must be in the same alphabetical order in their respective directories!"):
            hmm_broke = True

    if hmm_broke:
        print("FastAAI can't proceed without matching HMM and protein pairs.")
        return None

    total_counts = len(inputs)
    count = 0
    last_pct = 0

    def show_progress(percentage, rewind):
        #progress bar - possible dangerous use of the return to line start sequence.
        try:
            if rewind:
                sys.stdout.write('\033[A')
                sys.stdout.flush()
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Genome ' + str(count) + " of " + str(total_counts) + ') at ' + curtime()+"\n")
            sys.stdout.flush()
        except Exception:
            #It's not really a big deal if the progress bar cannot be printed.
            pass

    if verbose:
        print("")
        show_progress(0, False)

    results = []

    kmer_index_ = create_kmer_index()
    pool = multiprocessing.Pool(threads, initializer=initialize_preproc, initargs = (kmer_index_,))
    try:
        for res in pool.imap(do_advance, inputs):
            results.append(res)
            if verbose:
                count += 1
                percentage = (count/total_counts)*100
                #Redraw when the bar gains a segment, and always for the last input.
                if int(percentage/2) > last_pct or count == total_counts:
                    show_progress(percentage, True)
                last_pct = int(percentage/2)
    except BaseException:
        #Don't wait on outstanding work if the run is being abandoned.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    inputs = results

    log_time = curtime()

    log_path = os.path.normpath(output + "/logs/" + os.path.splitext(os.path.basename(db_name))[0] + "_preprocessing_log.txt")
    #Append to an existing log; a new log gets the header row first.
    needs_header = not os.path.exists(log_path)
    with open(log_path, "w" if needs_header else "a") as preproc_log:
        if needs_header:
            print("log_date", "genome_name", "started_as_a", "start_time", "end_time", "protein_translation_table", "errors", sep = "\t", file = preproc_log)
        for i in inputs:
            print(log_time, i.basename, i.initial_state, i.start_time, i.end_time, i.trans_table, i.err_log, sep = "\t", file = preproc_log)

    return inputs
1138
+
1139
+ '''
1140
+ Utility functions
1141
+ '''
1142
def prepare_directories(output, status, build_or_query):
    """Create the output directory and the subdirectories a run needs.

    status 'genome' adds predicted_proteins and hmms; 'protein' adds hmms.
    logs is always added; build_or_query adds database ("build") or
    results ("query").

    Returns True only when every required directory exists afterwards.
    """
    if not os.path.exists(output):
        try:
            os.mkdir(output)
        except OSError:
            print("")
            print("FastAAI tried to make output directory: '"+ output + "' but failed.")
            print("")
            print("Troubleshooting:")
            print("")
            print("    (1) Do you have permission to create directories in the location you specified?")
            print("    (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
            print("")
            return False

    needed = []
    if status == 'genome':
        needed.extend(["predicted_proteins", "hmms"])
    if status == 'protein':
        needed.append("hmms")
    needed.append("logs")
    if build_or_query == "build":
        needed.append("database")
    if build_or_query == "query":
        needed.append("results")

    try:
        for subdirectory in needed:
            target = os.path.normpath(output + "/" + subdirectory)
            if not os.path.exists(target):
                os.mkdir(target)
    except OSError:
        print("FastAAI was able to create or find", output, "but couldn't make directories there.")
        print("")
        print("This shouldn't happen. Do you have permission to write to that directory?")
        #The caller must not proceed as if the layout existed.
        return False

    return True
1190
+
1191
def check_out_input_files(genomes, proteins, hmms, gf, pf, hf):
    """Validate the combination of supplied inputs and name the starting stage.

    Returns 'genome', 'protein' or 'protein and HMM', or None (after printing
    the reason) when the combination is not allowed.
    """
    genome_dir, genome_list = genomes is not None, gf is not None
    protein_dir, protein_list = proteins is not None, pf is not None
    hmm_dir, hmm_list = hmms is not None, hf is not None

    #Check only one method of supply was used per file type
    duplicated = (
        (genome_dir and genome_list, "Supply genomes either by directory or by file, not both."),
        (protein_dir and protein_list, "Supply proteins either by directory or by file, not both."),
        (hmm_dir and hmm_list, "Supply HMMs either by directory or by file, not both."),
    )
    for supplied_twice, complaint in duplicated:
        if supplied_twice:
            print(complaint)
            return None

    have_genomes = genome_dir or genome_list
    have_proteins = protein_dir or protein_list
    have_hmms = hmm_dir or hmm_list

    #check that not both proteins and genomes supplied in any combo.
    if have_genomes and have_proteins:
        print("Supply either genomes or proteins, not both. You can supply proteins and HMMs, but not genomes and proteins.")
        return None

    #Check that if hmms are given, so are proteins
    if have_hmms and not have_proteins:
        print("If you supply HMMs, you also have to supply the proteins from which they were generated.")
        return None

    #Determine status
    if have_genomes:
        print("Starting from genomes")
        return 'genome'

    if have_hmms:
        print("Starting from proteins and HMMs")
        return 'protein and HMM'

    print("Starting from proteins")
    return 'protein'
1232
+
1233
+
1234
+ #Build DB from genomes
1235
+
1236
def unique_kmers(seq, ksize):
    """Return the sorted, de-duplicated k-mer indices found in seq.

    Every window of length ksize in seq is looked up in the module-level
    kmer_index mapping; a window that is not a key there raises KeyError.
    A seq shorter than ksize yields an empty array.
    """
    window_count = len(seq) - ksize + 1
    indices = [kmer_index[seq[start:start + ksize]] for start in range(window_count)]
    #We care about the type because we're working with bytes later.
    return np.unique(indices).astype(np.int32)
1243
+
1244
#Builds a dict mapping every possible tetramer to a fixed integer index.
#The order is identical on every run, so indices are interchangeable between separate runs of this program.
def create_kmer_index():
    """Return {tetramer string: np.int32 index} for all 22**4 tetramers.

    The ordering is whatever np.meshgrid (default indexing) followed by a
    reshape produces. It is deterministic but not strictly alphabetical
    (the first two positions are swapped relative to a plain product), so
    the meshgrid call must not be replaced by a differently ordered
    enumeration.
    """
    alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '*']

    #One row per tetramer, one column per position.
    grids = np.meshgrid(alphabet, alphabet, alphabet, alphabet)
    tetramers = np.stack(grids, -1).reshape(-1, 4)

    #There are only 22**4 tetramers, so int32 is more than wide enough for the indices.
    total = tetramers.shape[0]
    names = [''.join(tetramers[row,]) for row in range(0, total)]
    positions = np.arange(total, dtype = np.int32)

    return dict(zip(names, positions))
1255
+
1256
+ def split_seq(seq, num_grps):
1257
+ newseq = []
1258
+ splitsize = 1.0/num_grps*len(seq)
1259
+ for i in range(num_grps):
1260
+ newseq.append(seq[int(round(i*splitsize)):int(round((i+1)*splitsize))])
1261
+ return newseq
1262
+
1263
+ #gives the max and min index needed to split a list of (max_val) genomes into
1264
+ def split_indicies(max_val, num_grps):
1265
+ newseq = []
1266
+ splitsize = 1.0/num_grps*max_val
1267
+ for i in range(num_grps):
1268
+ newseq.append(((round(i*splitsize)), round((i+1)*splitsize)))
1269
+ return newseq
1270
+
1271
def list_to_index_dict(list):
    """Map each item of the given sequence to its position.

    If an item occurs more than once, its last position wins.
    """
    #The parameter name shadows the builtin, but it is part of the public signature.
    return {item: position for position, item in enumerate(list)}
1278
+
1279
def generate_accessions_index():
    """Return the fixed mapping from Pfam accession string to integer index.

    The index of an accession is its position in the literal below, so the
    order of this tuple must never change.
    """
    accessions = (
        'PF01780.19', 'PF03948.14', 'PF17144.4', 'PF00830.19', 'PF00347.23', 'PF16906.5', 'PF13393.6',
        'PF02565.15', 'PF01991.18', 'PF01984.20', 'PF00861.22', 'PF13656.6', 'PF00368.18', 'PF01142.18', 'PF00312.22', 'PF02367.17',
        'PF01951.16', 'PF00749.21', 'PF01655.18', 'PF00318.20', 'PF01813.17', 'PF01649.18', 'PF01025.19', 'PF00380.19', 'PF01282.19',
        'PF01864.17', 'PF01783.23', 'PF01808.18', 'PF01982.16', 'PF01715.17', 'PF00213.18', 'PF00119.20', 'PF00573.22', 'PF01981.16',
        'PF00281.19', 'PF00584.20', 'PF00825.18', 'PF00406.22', 'PF00177.21', 'PF01192.22', 'PF05833.11', 'PF02699.15', 'PF01016.19',
        'PF01765.19', 'PF00453.18', 'PF01193.24', 'PF05221.17', 'PF00231.19', 'PF00416.22', 'PF02033.18', 'PF01668.18', 'PF00886.19',
        'PF00252.18', 'PF00572.18', 'PF00366.20', 'PF04104.14', 'PF04919.12', 'PF01912.18', 'PF00276.20', 'PF00203.21', 'PF00889.19',
        'PF02996.17', 'PF00121.18', 'PF01990.17', 'PF00344.20', 'PF00297.22', 'PF01196.19', 'PF01194.17', 'PF01725.16', 'PF00750.19',
        'PF00338.22', 'PF00238.19', 'PF01200.18', 'PF00162.19', 'PF00181.23', 'PF01866.17', 'PF00709.21', 'PF02006.16', 'PF00164.25',
        'PF00237.19', 'PF01139.17', 'PF01351.18', 'PF04010.13', 'PF06093.13', 'PF00828.19', 'PF02410.15', 'PF01176.19', 'PF02130.17',
        'PF01948.18', 'PF01195.19', 'PF01746.21', 'PF01667.17', 'PF03874.16', 'PF01090.19', 'PF01198.19', 'PF01250.17', 'PF17136.4',
        'PF06026.14', 'PF03652.15', 'PF04019.12', 'PF01201.22', 'PF00832.20', 'PF01264.21', 'PF03840.14', 'PF00831.23', 'PF00189.20',
        'PF02601.15', 'PF01496.19', 'PF00411.19', 'PF00334.19', 'PF00687.21', 'PF01157.18', 'PF01245.20', 'PF01994.16', 'PF01632.19',
        'PF00827.17', 'PF01015.18', 'PF00829.21', 'PF00410.19', 'PF00833.18', 'PF00935.19', 'PF01992.16',
    )

    #Same result as passing the sequence through list_to_index_dict.
    return {accession: position for position, accession in enumerate(accessions)}
1296
+
1297
#Master function for building or adding to a DB with genomes.
def add_inputs(output_path, parent_path, existing_index, threads, verbose, prep_args):
    """Process the supplied inputs and store their k-mer data in the parent database.

    Parameters:
        output_path: working directory; a "temp" subdirectory is created in it
            for the per-accession child databases and removed at the end.
        parent_path: path handed to fastaai_database for the final database.
        existing_index: None, or a dict of genome name -> integer ID taken from
            an existing database. It is extended in place with new genomes.
        threads: number of worker processes used to build child databases.
        verbose: when true, progress bars are drawn on stdout.
        prep_args: sequence holding, in order, genomes, proteins, hmms, gf, pf,
            hf and db_name, which are forwarded to advance_inputs.

    Returns True on success, False if the inputs could not be prepared or the
    temp directory could not be created.

    Fix relative to the previous version: the opening line of the second
    progress bar was built but never written, so the first in-place redraw
    (cursor-up) overwrote an unrelated line of output.
    """
    genomes, proteins, hmms, gf, pf, hf, db_name = prep_args[:7]

    def write_progress(done, total, label, closing, rewind):
        #It's not really a big deal if the progress bar cannot be printed,
        #so any failure here (e.g. total == 0) is deliberately ignored.
        if not verbose:
            return
        try:
            percentage = (done/total)*100
            if rewind:
                #Move the cursor up one line so the bar is redrawn in place.
                sys.stdout.write('\033[A')
                sys.stdout.flush()
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(done) + " of " + str(total) + label + curtime() + closing)
            sys.stdout.flush()
        except Exception:
            pass

    print("")
    print("FastAAI is formatting your files to be saved to your database.")

    #Let's push this to the inputs section.
    inputs = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output_path, threads = threads, verbose = verbose, db_name = db_name)

    if inputs is None:
        return False

    genome_index = {}
    next_index = 0

    #Build upon the genome indexing of an existing DB
    if existing_index is not None:
        genome_index = existing_index
        #zero indexing makes this the next number to add.
        next_index = len(existing_index)

    final_db = fastaai_database(parent_path)
    final_db.activate_connection()
    final_db.initialize_parent_database()

    #This goes to the genome_index table
    protein_counts_to_add = []
    genome_acc_kmer_counts_to_add = []

    acc_index = generate_accessions_index()

    #accession -> {genome ID -> k-mers of that genome's best hit}
    readied_kmers_by_acc = defaultdict(lambda: defaultdict(lambda: None))

    for file in inputs:
        genome = file.basename

        #Avoid adding duplicate genomes
        if genome not in genome_index:
            protein_counts_to_add.append((genome, next_index, file.protein_count))
            for prot in file.protein_kmer_count:
                genome_acc_kmer_counts_to_add.append((next_index, acc_index[prot], file.protein_kmer_count[prot]))
            genome_index[genome] = next_index
            next_index += 1

        this_index = genome_index[genome]
        for acc in file.best_hits_kmers:
            readied_kmers_by_acc[acc][this_index] = file.best_hits_kmers[acc]
        #Clean up space
        file.best_hits_kmers = None

    inputs = None

    #Default dicts can't be pickled.
    readied_kmers_by_acc = dict(readied_kmers_by_acc)

    genomes_per_acc = {}
    for acc in readied_kmers_by_acc:
        readied_kmers_by_acc[acc] = dict(readied_kmers_by_acc[acc])
        genomes_per_acc[acc] = list(readied_kmers_by_acc[acc].keys())
        final_db.add_genomes_first(acc, readied_kmers_by_acc[acc])
        #Release each accession's data as soon as it is stored.
        readied_kmers_by_acc[acc] = None

    readied_kmers_by_acc = None

    add_genomes = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
    add_proteins = "INSERT OR REPLACE INTO genome_acc_kmer_counts VALUES (?, ?, ?)"

    final_db.cursor.executemany(add_genomes, protein_counts_to_add)
    final_db.cursor.executemany(add_proteins, genome_acc_kmer_counts_to_add)
    final_db.connection.commit()

    final_db.cursor.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
    final_db.connection.commit()

    protein_counts_to_add = None
    genome_acc_kmer_counts_to_add = None

    temp_dir = os.path.normpath(output_path+"/temp")

    #One child job per accession seen in this run.
    child_args = []
    for i, accession in enumerate(genomes_per_acc):
        name = "accession_" + accession + "_partition_" + str(i)
        child_args.append([accession, name, temp_dir, parent_path, genomes_per_acc[accession], genome_index])

    print("")
    print("Formatting data to add to database at", curtime())

    #Add partition, output, parent DB data.
    if not os.path.exists(temp_dir):
        try:
            os.mkdir(temp_dir)
        except OSError:
            print("Output directory failed to create! Cannot continue.")
            #Do not leave the parent database connection open on failure.
            final_db.close_connection()
            return False

    total_counts = len(child_args)

    if verbose:
        print("")
    write_progress(0, total_counts, ' ) at ', "\n", False)

    quiverfull = []

    pool = multiprocessing.Pool(threads)
    try:
        count = 0
        for result in pool.imap_unordered(produce_children, child_args):
            acc = result[0]
            child = result[1]

            quiverfull.append([acc, child])

            count += 1
            write_progress(count, total_counts, ' done at ', " )\n", True)
    finally:
        pool.close()
        pool.join()

    print("")
    print("Adding data to final database.")

    if verbose:
        print("")
    #This opening line was previously built but never written to stdout.
    write_progress(0, total_counts, ' done at ', " )\n", False)

    count = 0
    for result in quiverfull:
        acc = result[0]
        child = result[1]
        final_db.add_child_to_parent(acc, child)

        count += 1
        write_progress(count, total_counts, ' done at ', " )\n", True)

    print("")

    final_db.close_connection()

    #NOTE(review): rmdir only succeeds on an empty directory, so this assumes
    #add_child_to_parent removes each child database file - confirm.
    os.rmdir(temp_dir)

    return True
1490
+
1491
#Worker run in a pool process: builds the child database for one accession.
def produce_children(args):
    """Build one accession's kmer -> genomes child database.

    args is a sequence of:
        [0] acc: accession name (dots are replaced by underscores to form the
            "<acc>_genomes" table name read from the parent database),
        [1] partition: base name of the child database file,
        [2] output_base: directory in which the child database is created,
        [3] parent_db: path of the parent database,
        [4] genomes_in_this_acc: list of genome IDs to read for this accession,
        [5] genome_index: passed by the caller but not used here.

    Returns [value returned by child_database.add_accession, child db path].

    The genome rows are fetched in batches so the number of bound SQL
    variables stays below SQLite's limit regardless of how many genomes are
    added, and connections are closed even if a step fails.
    """
    acc = args[0]
    partition = args[1]
    output_base = args[2]
    parent_db = args[3]
    genomes_in_this_acc = args[4]

    #Stay under the historical SQLite default of 999 host parameters per statement.
    max_sql_vars = 900

    genomes_table = acc.replace('.', '_') + "_genomes"

    parental_database = fastaai_database(parent_db)
    parental_database.activate_connection()

    #genome ID -> k-mers, as returned by the two-column "<acc>_genomes" table.
    genomes_for_this_acc = {}
    try:
        for start in range(0, len(genomes_in_this_acc), max_sql_vars):
            batch = genomes_in_this_acc[start:start + max_sql_vars]
            read_parent_sql = "SELECT * FROM " + genomes_table + " WHERE genome IN ({genomes})".format(genomes=','.join(['?']*len(batch)))
            genomes_for_this_acc.update(parental_database.cursor.execute(read_parent_sql, batch).fetchall())
    finally:
        parental_database.close_connection()

    child_db = os.path.normpath(output_base + "/" + partition + ".db")

    this_child = child_database(child_db, parent_db)

    this_child.activate_child_connection()
    this_child.activate_parent_connection()

    try:
        #Invert the data: for every k-mer, collect the genomes containing it.
        readied_kmers = defaultdict(list)
        for genome in list(genomes_for_this_acc):
            #pop() releases each genome's k-mers as soon as they are consumed.
            for kmer in genomes_for_this_acc.pop(genome):
                readied_kmers[kmer].append(genome)

        del genomes_for_this_acc

        readied_kmers = dict(readied_kmers)
        for kmer in readied_kmers:
            readied_kmers[kmer] = np.array(readied_kmers[kmer], dtype = np.int32)

        sql_friendly_accession = this_child.add_accession(acc, readied_kmers)

        del readied_kmers
    finally:
        this_child.close_parent_connection()
        this_child.close_child_connection()

    return [sql_friendly_accession, child_db]
1545
+
1546
#Build or add to a FastAAI DB
def build_db_opts():
    """Create the argument parser for the database-building module.

    Parses the process command line with parse_known_args, so unrecognised
    arguments are tolerated, and returns (parser, parsed_args).
    """
    description_text = '''
	This FastAAI module allows you to create a FastAAI database from one or many genomes, proteins, or proteins and HMMs, or add these files to an existing one.

	Supply genomes OR proteins OR proteins AND HMMs as inputs.

	If you supply genomes, FastAAI will predict proteins from them, and HMMs will be created from those proteins
	If you supply only proteins, FastAAI will create HMM files from them, searching against FastAAI's internal database
	If you supply proteins AND HMMs, FastAAI will directly use them to build the database.\n
	You cannot supply both genomes and proteins
	'''

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=description_text,
    )

    #Inputs supplied as directories.
    parser.add_argument('-g', '--genomes', dest = 'genomes', default = None,
        help = 'A directory containing genomes in FASTA format.')
    parser.add_argument('-p', '--proteins', dest = 'proteins', default = None,
        help = 'A directory containing protein amino acids in FASTA format.')
    parser.add_argument('-m', '--hmms', dest = 'hmms', default = None,
        help = 'A directory containing the results of an HMM search on a set of proteins.')

    #Where results go.
    parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db",
        help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')
    parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI",
        help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    #Inputs supplied as files of paths.
    parser.add_argument('--genome_file', dest = 'gf', default = None,
        help = 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')
    parser.add_argument('--protein_file', dest = 'pf', default = None,
        help = 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')
    parser.add_argument('--hmm_file', dest = 'hf', default = None,
        help = 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')

    #Run-time behaviour.
    parser.add_argument('--threads', dest = 'threads', type=int, default = 1,
        help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true',
        help = 'Print minor updates to console. Major updates are printed regardless.')

    known_args, _unrecognised = parser.parse_known_args()

    return parser, known_args
1577
+
1578
def build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose):
    """Create a FastAAI database, or add the given inputs to an existing one.

    The input combination is validated with check_out_input_files, the output
    directories are prepared with prepare_directories, the genome index of an
    already existing database file (if any) is loaded, and the work is then
    handed to add_inputs.

    db_name is placed under "<output>/database/" unless it already contains a
    path separator, in which case it is used as given.

    Returns False if validation, directory preparation or reading an existing
    database fails; otherwise returns the result of add_inputs.
    """
    start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

    #If something failed, we stop.
    if start is None:
        return False

    if not prepare_directories(output, start, "build"):
        return False

    #Check if the db contains path info. Incl. windows version.
    if "/" not in db_name and "\\" not in db_name:
        final_database = os.path.normpath(output + "/database/" + db_name)
    else:
        #If the person insists that the db has a path, let them.
        final_database = db_name

    #The genome index is only read when the database file already exists.
    existing_genome_IDs = None
    if os.path.exists(final_database):
        try:
            parent = fastaai_database(final_database)
            parent.activate_connection()
            try:
                rows = parent.cursor.execute("SELECT genome, gen_id FROM genome_index").fetchall()
            finally:
                #Close the connection even when the file is not a FastAAI database.
                parent.close_connection()

            existing_genome_IDs = {}
            for row in rows:
                existing_genome_IDs[row[0]] = int(row[1])
        except Exception:
            print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
            print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
            print("Exiting.")
            return False

    prep_args = [genomes, proteins, hmms, gf, pf, hf, db_name]

    success = add_inputs(output, final_database, existing_genome_IDs, threads, verbose, prep_args)

    if success:
        print("Database build complete!")

    return success
1629
+
1630
+
1631
#DB query functionality - unlimited version
def do_query_vs_target_aai_only(query_name, target_name, threads, output, precision, verbose):
    """Estimate AAI between every query genome and every target genome.

    Parameters:
        query_name, target_name: paths handed to fastaai_database.
        threads: number of worker processes.
        output: output directory; "<output>/temp" is created if missing and
            results are reported under "<output>/results/".
        precision: "low", "med" or "high", selecting float16, float32 or
            float64 for the accumulated Jaccard matrix.
        verbose: when true, the per-worker workload is printed.

    Returns None. Raises ValueError for an unknown precision (this previously
    surfaced later as a NameError).

    Steps: estimate the workload per shared accession (load_getter), spread
    accessions over workers greedily, accumulate per-genome Jaccard sums from
    accession_worker processes, store them in a temporary calculation
    database, then let finish_jaccards write the result files.
    """
    precision_types = {"low": np.float16, "med": np.float32, "high": np.float64}
    if precision not in precision_types:
        raise ValueError("precision must be one of 'low', 'med' or 'high', not " + repr(precision))
    jacc_precision = precision_types[precision]

    if not os.path.exists(os.path.normpath(output+"/temp")):
        os.mkdir(os.path.normpath(output+"/temp"))

    #Save the file paths.
    query = fastaai_database(query_name)
    target = fastaai_database(target_name)

    query.activate_connection()
    query.just_accessions()
    query_len = query.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]

    target.activate_connection()
    target.just_accessions()
    target_len = target.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]

    print("FastAAI will search", query_len, "query genomes against", target_len, "target genomes.")

    print("")
    print("FastAAI is preparing your AAI search... ", end = '', flush = True)

    accessions_in_common = list(set(query.accessions).intersection(target.accessions))

    query.accessions = None
    target.accessions = None

    #The database objects are handed to child processes, which open their own connections.
    query.close_connection()
    target.close_connection()

    load_args = [(query, target, acc) for acc in accessions_in_common]

    loads = []
    ordered_accs = []

    pool = multiprocessing.Pool(threads)
    try:
        for result in pool.imap(load_getter, load_args):
            load = result[0]
            acc = result[1]
            #Load will be None if the accession is in both query and target, but they still don't share even a single Kmer. Unlikely, but it happened once, so it WILL happen again.
            if load is not None:
                loads.append(load)
                ordered_accs.append(acc)
    finally:
        pool.close()
        pool.join()

    loads = np.array(loads)
    ordered_accs = np.array(ordered_accs)

    #Heaviest accessions first, so the greedy assignment below balances well.
    order = loads.argsort()[::-1]

    loads = loads[order]
    ordered_accs = ordered_accs[order]

    load_balancer = {}
    accs_per_load = {}
    for i in range(0, threads):
        load_balancer[i] = 0
        accs_per_load[i] = []

    for i in range(0, loads.shape[0]):
        #Give the next accession to the currently least-loaded worker (lowest index on ties).
        index = min(load_balancer, key = load_balancer.get)
        load_balancer[index] += loads[i]
        accs_per_load[index].append(int(ordered_accs[i]))

    del loads
    del ordered_accs

    print("done!")
    if verbose:
        print("FastAAI has balanced the workload of calculating AAI from your data.")
        for index in accs_per_load:
            print("Thread", index, "will handle", len(accs_per_load[index]), "accessions.")
    print("FastAAI is beginning the calculation of AAI between your query and target genomes.")

    del load_balancer

    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()

    for thread in accs_per_load:
        input_queue.put(accs_per_load[thread])

    #One stop marker per worker.
    for i in range(0, threads):
        input_queue.put('STOP')

    workers = []
    for i in range(0, threads):
        worker = multiprocessing.Process(target=accession_worker, args=(input_queue, output_queue, query, target, query_len, target_len, jacc_precision))
        worker.start()
        workers.append(worker)

    print("")

    results = np.zeros(shape = (query_len, target_len), dtype = jacc_precision)

    #Counter to keep the threads running until the whole process is done.
    donezo = threads
    while donezo > 0:
        row = output_queue.get()
        if isinstance(row, str):
            #Each worker posts a plain string as its final message.
            donezo -= 1
        else:
            #Otherwise the message is [query genome index, per-target Jaccard sums].
            results[row[0]] += row[1]

    #Every worker has posted its final message, so joining cannot block on unread data.
    for worker in workers:
        worker.join()

    print("AAI calculations complete. Formatting results for writing.")

    rdb_name = os.path.normpath(output+"/temp/aai_calc_db.db")
    rdb = calculation_database(rdb_name, precision)
    rdb.activate_connection()
    rdb.initialize_database()

    #Get the data ready for passing to children...

    results = np.split(results, query_len, axis = 0)

    insertable = []
    #iterate over results and turn them into tuples.
    for i in range(0, query_len):
        insertable.append((i, results[i].tobytes()))
        results[i] = None

    rdb.cursor.executemany("INSERT INTO jaccards VALUES (?, ?)", (insertable))
    rdb.connection.commit()

    rdb.close_connection()

    del insertable
    del results

    #Now we split the query genomes into chunk and have threads process each chunk in parallel with its respective shared prot counts.
    query_chunks = split_indicies(query_len, threads)
    query_args = [([rdb_name], query_chunks[i], output, query, target, precision) for i in range(0, threads)]

    print("Results formatted. Writing results starting at", curtime())

    pool = multiprocessing.Pool(threads)
    try:
        pool.map(finish_jaccards, query_args)
    finally:
        pool.close()
        pool.join()

    os.remove(rdb_name)

    print("FastAAI complete! Results at:", os.path.normpath(output+"/results/"))

    return None
1790
+
1791
#Assess the number of comparisons that will have to be made to complete an accession so that balanced loads can be passed to threads
def load_getter(args):
    """Estimate the work needed to compare query and target for one accession.

    args is (query, target, accession): two database objects (each opens and
    closes its own connection here) and the integer index of the accession as
    assigned by generate_accessions_index.

    Returns [load, accession]. load is an integer derived from the total byte
    length of the "genomes" column over the k-mers both databases share, or
    None when they share no k-mer for this accession.

    The shared k-mers are queried in batches so the number of bound SQL
    variables stays below SQLite's limit, and connections are closed even if
    a query fails.
    """
    query, target, accession = args[0], args[1], args[2]

    #Stay under the historical SQLite default of 999 host parameters per statement.
    max_sql_vars = 900

    #Invert the accession index: integer index -> table name (dots replaced by underscores).
    accession_inverter = {}
    original_index = generate_accessions_index()
    for acc in original_index:
        accession_inverter[original_index[acc]] = acc.replace(".", "_")

    sql_friendly_accession = accession_inverter[accession]

    def summed_genome_bytes(cursor, kmers):
        #Sum length(genomes) over the given k-mers, batch by batch. None if no batch matched a row.
        total = None
        for start in range(0, len(kmers), max_sql_vars):
            batch = kmers[start:start + max_sql_vars]
            bytes_sql = "SELECT sum(length(genomes)) FROM " + sql_friendly_accession + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(batch)))
            part = cursor.execute(bytes_sql, batch).fetchone()[0]
            if part is not None:
                total = part if total is None else total + part
        return total

    query.activate_connection()
    target.activate_connection()
    try:
        sql = "SELECT kmer FROM "+ sql_friendly_accession

        #Return bare values instead of 1-tuples while collecting the k-mers.
        query.cursor.row_factory = lambda cursor, row: row[0]
        target.cursor.row_factory = lambda cursor, row: row[0]
        try:
            shared_kmers = list(set(query.cursor.execute(sql).fetchall()).intersection(target.cursor.execute(sql).fetchall()))
        finally:
            query.cursor.row_factory = None
            target.cursor.row_factory = None

        load = None
        if len(shared_kmers) > 0:
            tgt_res = summed_genome_bytes(target.cursor, shared_kmers)
            query_res = summed_genome_bytes(query.cursor, shared_kmers)
            #This if *should* always happen, if it gets checked.
            if tgt_res is not None and query_res is not None:
                load = int(tgt_res/(4096) * query_res/(4096))
    finally:
        query.close_connection()
        target.close_connection()

    return [load, accession]
1831
+
1832
def accession_worker(in_queue, out_queue, query, target, qlen, tlen, prec):
    """Worker process: accumulate per-accession Jaccard indices for query genomes.

    Parameters:
        in_queue: queue of accession-index lists, terminated by 'STOP'.
        out_queue: receives [query genome index, array of tlen Jaccard sums]
            for every query genome, then the string "Based" once this worker
            has finished.
        query, target: database objects; connections are opened here.
        qlen, prec: accepted for the caller's convenience but not used here.
        tlen: number of target genomes (length of every result array).

    Target rows are fetched in batches so the number of bound SQL variables
    stays below SQLite's limit however many k-mers the query holds.
    """
    #Stay under the historical SQLite default of 999 host parameters per statement.
    max_sql_vars = 900

    #Invert the accession index: integer index -> table name (dots replaced by underscores).
    accession_inverter = {}
    original_index = generate_accessions_index()
    for acc in original_index:
        accession_inverter[original_index[acc]] = acc.replace(".", "_")

    query.activate_connection()
    target.activate_connection()
    query.load_genome_index()
    target.load_genome_index()

    for my_accessions in iter(in_queue.get, 'STOP'):

        target.load_accessions(permitted_accessions = my_accessions)
        query.load_accessions(permitted_accessions = my_accessions)

        #query_data[acc]: query genome -> its k-mers; target_data[acc]: k-mer -> target genomes.
        query_data = {}
        target_data = {}

        for acc in my_accessions:

            sql_friendly_accession = accession_inverter[acc]

            query_data[acc] = dict(query.cursor.execute("SELECT * FROM "+sql_friendly_accession+"_genomes").fetchall())

            #Return bare values instead of 1-tuples while collecting the k-mers.
            query.cursor.row_factory = lambda cursor, row: row[0]
            try:
                selected_kmers = list(query.cursor.execute("SELECT kmer FROM "+sql_friendly_accession).fetchall())
            finally:
                query.cursor.row_factory = None

            #Only target k-mers that also occur in the query are needed.
            target_data[acc] = {}
            for start in range(0, len(selected_kmers), max_sql_vars):
                batch = selected_kmers[start:start + max_sql_vars]
                target_sql = "SELECT * FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(batch)))
                target_data[acc].update(target.cursor.execute(target_sql, batch).fetchall())

        target_kmer_cts_by_acc = {}
        for acc in my_accessions:
            target_kmer_cts_by_acc[acc] = np.zeros(tlen, dtype = np.int16)

        for genome in target.gak:
            for acc in target.gak[genome]:
                target_kmer_cts_by_acc[acc][genome] = target.gak[genome][acc]

        #No longer needed.
        target.gak = None

        #We want each thread to report every single genome
        for genome in query.gak:
            these_jaccards = np.zeros(tlen, dtype = np.float64)
            for acc in query.gak[genome]:
                these_intersections = np.zeros(tlen, dtype = np.int16)
                query_kmers = query_data[acc][genome]
                query_kmer_ct = query_kmers.shape
                for kmer in query_kmers:
                    if kmer in target_data[acc]:
                        #Every target genome holding this k-mer gains one shared k-mer.
                        these_intersections[target_data[acc][kmer]] += 1

                #Jaccard = |intersection| / (|query| + |target| - |intersection|), per target genome.
                these_jaccards += np.divide(these_intersections, np.subtract(np.add(query_kmer_ct, target_kmer_cts_by_acc[acc]), these_intersections))

            out_queue.put([genome, these_jaccards])

    target.close_connection()
    query.close_connection()

    #Final message: tells the parent this worker is finished.
    out_queue.put("Based")

    return None
1899
+
1900
def finish_jaccards(args):
    """Turn summed Jaccard indices into result files for a range of query genomes.

    args is a sequence of:
        [0] partial_dbs: paths of calculation databases holding a "jaccards" table,
        [1] my_query_genomes: (start, end) query genome indices, end exclusive,
        [2] output: output directory; files go to "<output>/results/",
        [3] query, [4] target: database objects (connections are opened here),
        [5] prec: precision label passed on to calculation_database.

    For each query genome one "<name>_results.txt" file is written with one
    tab-separated line per target genome. Returns None.

    Result files are written through a context manager and the calculation
    database connections are closed even if writing fails.
    """
    partial_dbs, my_query_genomes, output, query, target, prec = args[0], args[1], args[2], args[3], args[4], args[5]

    query.activate_connection()
    target.activate_connection()
    query.load_genome_index()
    target.load_genome_index()

    selected_query_genomes = range(my_query_genomes[0], my_query_genomes[1])

    #Row 0 of the local matrices corresponds to the first genome of this chunk.
    offset = my_query_genomes[0]

    target_len = len(target.genome_index)
    query_len = my_query_genomes[1] - my_query_genomes[0]

    #get shared protein counts
    query.load_accessions(permitted_genomes = selected_query_genomes)

    #Number of accessions in the fixed accession index.
    max_acc = 122

    #Presence/absence matrices: query genome x accession and accession x target genome.
    query_set = np.zeros(shape = (query_len, max_acc), dtype = np.int16)
    for g in query.gak:
        query_set[(g-offset), list(query.gak[g])] += 1

    target_set = np.zeros(shape = (max_acc, target_len), dtype = np.int16)

    target.load_accessions()

    target_protein_counts = np.zeros(target_len, dtype = np.int16)
    for t in target.gak:
        target_set[list(target.gak[t]), t] += 1
        target_protein_counts[t] = len(target.gak[t])

    #Their product counts the accessions each query/target pair has in common.
    shared_prot_counts_by_genome = np.dot(query_set, target_set)

    del query_set
    del target_set

    target.gak = None

    query.close_connection()
    target.close_connection()

    activated_DBs = []
    try:
        for db in partial_dbs:
            calc_db = calculation_database(db, prec)
            calc_db.activate_connection()
            activated_DBs.append(calc_db)

        sql = "SELECT jaccards FROM jaccards WHERE genome=?"

        for genome in selected_query_genomes:
            total_jaccs = np.zeros(target_len, dtype = np.float64)
            shared_acc_counts = shared_prot_counts_by_genome[genome - offset]
            for db in activated_DBs:
                #NOTE(review): assumes calculation_database returns this column as a numeric array - confirm.
                result = db.cursor.execute(sql, (genome,)).fetchone()[0]
                total_jaccs += result

            #Pairs sharing no accession divide by zero; they are overwritten with "N/A" below.
            with np.errstate(divide = 'ignore', invalid = 'ignore'):
                total_jaccs = np.divide(total_jaccs, shared_acc_counts)

            aai_est = numpy_kaai_to_aai(total_jaccs)

            no_hit = np.where(shared_acc_counts == 0)
            #Actual hits is already stored in shared_acc_counts
            possible_hits = np.minimum(len(query.gak[genome]), target_protein_counts).astype(str)

            total_jaccs = np.round(total_jaccs, 4).astype(str)

            shared_acc_counts = shared_acc_counts.astype(str)

            total_jaccs[no_hit] = "N/A"
            aai_est[no_hit] = "N/A"
            shared_acc_counts[no_hit] = "N/A"
            possible_hits[no_hit] = "N/A"

            name = query.reverse_genome_index[genome]

            output_file = output +"/results/"+name+"_results.txt"
            with open(output_file, "w") as fh:
                for tgt in range(0, target_len):
                    target_name = target.reverse_genome_index[tgt]
                    if target_name == name:
                        #A genome compared with itself is reported with fixed values.
                        fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+"100.0"+"\n")
                    else:
                        fh.write(name+"\t"+target_name+"\t"+total_jaccs[tgt]+"\t"+"N/A"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+aai_est[tgt]+"\n")
    finally:
        for db in activated_DBs:
            db.close_connection()

    return None
2000
+
2001
+
2002
+ #Here's the DB SQL querying functionality/limited version.
2003
def do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev):
    """Search every genome in a query database against a target database via SQL.

    Args:
        query: Path to the query FastAAI database.
        target: Path to the target FastAAI database.
        threads: Number of worker processes in the multiprocessing pool.
        output: Output directory; workers are handed ``<output>/results`` as
            the place to write their per-genome result files.
        verbose: When true, draw a progress bar on stdout.
        do_stdev: When true, dispatch to ``do_sql_query`` (with standard
            deviations); otherwise to ``do_sql_query_no_SD``.

    Returns:
        None.
    """
    #Save the file paths.
    query_name, target_name = query, target

    query = fastaai_database(query_name)
    query.activate_connection()
    query.load_genome_index()
    query.just_accessions()

    acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
    tables = [item[0] for item in query.cursor.execute(acc_sql).fetchall()]
    #Tables ending in "_genomes" are paired with an accession name obtained by
    #stripping that suffix and turning underscores back into dots.
    cleaned_tables = [
        (table, table.split("_genomes")[0].replace("_", "."))
        for table in tables
        if table.endswith("_genomes")
    ]
    del tables

    #Go through tables and load data: first column keys the genome, second
    #column is stored under that genome for the table's accession.
    query_acc_kmers = defaultdict(dict)

    sys.stdout.write("\n")
    sys.stdout.write("Loading query data at " + curtime() + " ...\n")
    sys.stdout.flush()

    for table, accession in cleaned_tables:
        for result in query.cursor.execute("SELECT * FROM " + table).fetchall():
            query_acc_kmers[result[0]][accession] = result[1]

    query.close_connection()

    sys.stdout.write("\n")
    sys.stdout.write("Loading target data at " + curtime() + " ...\n")
    sys.stdout.flush()

    target = fastaai_database(target_name)
    target.activate_connection()
    target.load_genome_index()
    target.load_accessions()
    target.close_connection()

    results_dir = os.path.normpath(output+"/results")
    query_args = [
        (target, query.reverse_genome_index[genome], query_acc_kmers[genome], results_dir)
        for genome in query_acc_kmers
    ]

    detected_query_accs = query.accessions
    query_length = len(query.genome_index)

    #Cleanup
    del query
    del query_acc_kmers

    target_length = len(target.gak)

    #Per shared accession: one count per target genome (0 where absent).
    target_kmer_cts = {}
    for accession in np.intersect1d(detected_query_accs, target.accessions):
        target_kmer_cts[accession] = np.zeros(target_length, dtype = np.int16)
        for g in target.gak:
            if accession in target.gak[g]:
                target_kmer_cts[accession][g] = target.gak[g][accession]

    #Number of accessions recorded for each target genome.
    target_protein_counts = np.zeros(target_length, dtype = np.int16)
    for g in target.gak:
        target_protein_counts[g] = len(target.gak[g])

    #Dropped before the target object is shipped to the workers in query_args.
    target.gak = None

    sys.stdout.write("\n")
    sys.stdout.write("FastAAI will search "+ str(query_length) + " query genomes against " + str(target_length) + " target genomes.\n")
    sys.stdout.write("\n")

    count = 0
    total = len(query_args)

    #Always terminate this line; previously the newline was only emitted in
    #verbose mode, so the final message was glued onto it otherwise.
    sys.stdout.write("Beginning AAI calculation at " + curtime() + "\n")

    def draw_progress(done, rewind):
        #Best effort only: a failed console write must never abort the search.
        percentage = (done/total)*100 if done else 0
        try:
            if rewind:
                #Move the cursor up one line so the bar is redrawn in place.
                sys.stdout.write('\033[A')
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(done) + " of " + str(total) + ' done at '+curtime()+')\n')
            sys.stdout.flush()
        except (OSError, ValueError):
            pass

    last_pct = 0
    if verbose:
        draw_progress(0, False)

    worker = do_sql_query if do_stdev else do_sql_query_no_SD

    #The context manager terminates the workers if anything below raises.
    with multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,)) as pool:
        #Process as we go; the workers write their own result files.
        for _ in pool.imap(worker, query_args):
            if not verbose:
                continue
            count += 1
            half_pct = int(((count/total)*100)/2)
            if half_pct > last_pct or count == total:
                draw_progress(count, True)
                last_pct = half_pct

        pool.close()
        pool.join()

    print("AAI calculation complete! Results at:", results_dir)

    return None
2149
+
2150
+ #This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time efficiently.
2151
def do_sql_query(args):
    """Query one genome against the target database and write its result file.

    Reads the module globals ``target_kmer_cts`` and ``target_protein_counts``.

    Args:
        args: Sequence ``(database, name, acc_kmers, temp_out)``: the target
            database object, the query genome name, a mapping of accession
            name to that genome's kmers, and the directory to write into.

    Returns:
        Path of the written ``<temp_out>/<name>_results.txt`` file. Each line
        holds, tab separated: query name, target name, mean Jaccard, Jaccard
        SD, shared accession count, possible hits, estimated AAI.
    """
    accession_index = generate_accessions_index()
    database, name, acc_kmers, temp_out = args[:4]

    target_len = len(database.genome_index)

    #One row of Jaccard values per query accession found in the target.
    results = np.zeros(shape = (len(acc_kmers), target_len), dtype = np.float64)
    #counted[row, g] is True where that accession exists in target genome g,
    #i.e. exactly the pairs tallied in shared_acc_counts.
    counted = np.zeros(results.shape, dtype = bool)
    shared_acc_counts = np.zeros(target_len, dtype = np.int16)
    row = 0

    database.activate_connection()
    try:
        for accession in acc_kmers:
            acc_index = accession_index[accession]
            if acc_index not in database.accessions:
                continue

            sql_friendly_accession = accession.replace(".", "_")
            target_counts = target_kmer_cts[acc_index]

            present = target_counts != 0
            shared_acc_counts[present] += 1
            counted[row] = present

            these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
            these_intersections = np.zeros(target_len, dtype = np.int16)
            sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
            for result in database.cursor.execute(sql_query, these_kmers):
                these_intersections[result] += 1

            #Jaccard = intersection / (query kmers + target kmers - intersection)
            union_sizes = np.subtract(np.add(acc_kmers[accession].shape[0], target_counts), these_intersections)
            results[row] = np.divide(these_intersections, union_sizes)
            row += 1
    finally:
        #Release the connection even if a query fails.
        database.close_connection()

    #Genomes sharing nothing give 0/0 here; they are relabelled "N/A" below,
    #so the floating point warnings carry no information.
    with np.errstate(divide = "ignore", invalid = "ignore"):
        #These are the jacc averages
        jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

        #Squared differences from the mean, restricted to the pairs that were
        #counted; uncounted cells would each add mean**2 and inflate the SD.
        squared_diffs = np.square(results - jaccard_averages)
        squared_diffs[~counted] = 0
        jaccard_SDs = np.sqrt(np.divide(np.sum(squared_diffs, axis = 0), shared_acc_counts))

    aai_est = numpy_kaai_to_aai(jaccard_averages)

    no_hit = np.where(shared_acc_counts == 0)
    #Actual hits is already stored in shared_acc_counts
    possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)

    jaccard_averages = np.round(jaccard_averages, 4).astype(str)
    jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)
    shared_acc_counts = shared_acc_counts.astype(str)

    jaccard_averages[no_hit] = "N/A"
    aai_est[no_hit] = "N/A"
    jaccard_SDs[no_hit] = "N/A"
    shared_acc_counts[no_hit] = "N/A"
    possible_hits[no_hit] = "N/A"

    output_file = temp_out +"/"+name+"_results.txt"
    with open(output_file, "w") as fh:
        for target in range(0, target_len):
            target_name = database.reverse_genome_index[target]
            if target_name == name:
                #A genome compared with itself is reported as a perfect match.
                fields = [name, target_name, "100.0", "0.0", shared_acc_counts[target], possible_hits[target], "100.0"]
            else:
                fields = [name, target_name, jaccard_averages[target], jaccard_SDs[target], shared_acc_counts[target], possible_hits[target], aai_est[target]]
            fh.write("\t".join(fields) + "\n")

    return output_file
2226
+
2227
+ #This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time efficiently.
2228
def do_sql_query_no_SD(args):
    """Query one genome against the target database without standard deviations.

    Reads the module globals ``target_kmer_cts`` and ``target_protein_counts``.

    Args:
        args: Sequence ``(database, name, acc_kmers, temp_out)``: the target
            database object, the query genome name, a mapping of accession
            name to that genome's kmers, and the directory to write into.

    Returns:
        Path of the written ``<temp_out>/<name>_results.txt`` file. Each line
        holds, tab separated: query name, target name, mean Jaccard, "N/A" in
        the SD column, shared accession count, possible hits, estimated AAI.
    """
    accession_index = generate_accessions_index()
    database, name, acc_kmers, temp_out = args[:4]

    target_len = len(database.genome_index)

    #Running sum of Jaccard values per target genome.
    results = np.zeros(shape = target_len, dtype = np.float64)
    shared_acc_counts = np.zeros(target_len, dtype = np.int16)

    database.activate_connection()
    try:
        for accession in acc_kmers:
            acc_index = accession_index[accession]
            if acc_index not in database.accessions:
                continue

            sql_friendly_accession = accession.replace(".", "_")
            target_counts = target_kmer_cts[acc_index]

            #Count this accession for every target genome that has it.
            shared_acc_counts[np.nonzero(target_counts)] += 1

            these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
            these_intersections = np.zeros(target_len, dtype = np.int16)
            sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
            for result in database.cursor.execute(sql_query, these_kmers):
                these_intersections[result] += 1

            #Jaccard = intersection / (query kmers + target kmers - intersection)
            union_sizes = np.subtract(np.add(acc_kmers[accession].shape[0], target_counts), these_intersections)
            results += np.divide(these_intersections, union_sizes)
    finally:
        #Release the connection even if a query fails.
        database.close_connection()

    #Genomes sharing nothing give 0/0 here; they are relabelled "N/A" below,
    #so the floating point warnings carry no information.
    with np.errstate(divide = "ignore", invalid = "ignore"):
        #These are the jacc averages
        jaccard_averages = np.divide(results, shared_acc_counts)
    del results

    aai_est = numpy_kaai_to_aai(jaccard_averages)

    no_hit = np.where(shared_acc_counts == 0)

    possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)
    jaccard_averages = np.round(jaccard_averages, 4).astype(str)
    shared_acc_counts = shared_acc_counts.astype(str)

    jaccard_averages[no_hit] = "N/A"
    aai_est[no_hit] = "N/A"
    shared_acc_counts[no_hit] = "N/A"
    possible_hits[no_hit] = "N/A"

    output_file = temp_out +"/"+name+"_results.txt"
    with open(output_file, "w") as fh:
        for target in range(0, target_len):
            target_name = database.reverse_genome_index[target]
            if target_name == name:
                #A genome compared with itself is reported as a perfect match.
                fields = [name, target_name, "100.0", "0.0", shared_acc_counts[target], possible_hits[target], "100.0"]
            else:
                fields = [name, target_name, jaccard_averages[target], "N/A", shared_acc_counts[target], possible_hits[target], aai_est[target]]
            fh.write("\t".join(fields) + "\n")

    return output_file
2292
+
2293
def numpy_kaai_to_aai(kaai_array):
    """Convert mean Jaccard (kAAI) values into printable AAI estimates.

    Applies, element-wise to every positive value,
    ``(-0.3087057 + 1.810741 * exp(-(-0.2607023 * ln(kaai)) ** (1/3.435))) * 100``.

    Args:
        kaai_array: Array-like of Jaccard averages. It is copied and coerced
            to float64, so the caller's data is never modified and integer
            input is not truncated.

    Returns:
        A numpy string array of the same shape. Estimates are rounded to two
        decimals; values below 30 become "<30%", values above 90 become
        ">90%", inputs <= 0 become "<30%", and NaN inputs stay "nan".
    """
    #Protect the original jaccard averages memory item
    kaai = np.array(kaai_array, dtype = np.float64)

    positive = kaai > 0
    non_positive = kaai <= 0

    #NaN inputs fall in neither mask and remain NaN; non-positive inputs are
    #pinned to 0 so they land in the "<30%" bucket without ever taking log(0).
    aai_hat = np.full(kaai.shape, np.nan)
    aai_hat[non_positive] = 0.0

    logs = np.log(kaai[positive])
    shaped = np.exp(np.negative(np.power(np.multiply(logs, -0.2607023), (1/3.435))))
    aai_hat[positive] = np.multiply(np.subtract(np.multiply(shaped, 1.810741), 0.3087057), 100)

    #<30 and >90 values, decided before rounding
    smol = aai_hat < 30
    big = aai_hat > 90

    #Convert to final printables
    printable = np.round(aai_hat, 2).astype(str)
    printable[smol] = "<30%"
    printable[big] = ">90%"

    return printable
2332
+
2333
def curtime():
    """Return the current local time formatted as ``DD/MM/YYYY HH:MM:SS``."""
    return datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
2338
+
2339
+ #Manages the query process.
2340
def db_query_opts():
    """Build the argument parser for the database-vs-database query module.

    Arguments are parsed from the command line with ``parse_known_args``, so
    unrecognized arguments are ignored rather than rejected.

    Returns:
        Tuple ``(parser, args)`` of the ArgumentParser and the parsed namespace.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description='''
    This FastAAI module takes two FastAAI databases and searches all of the genomes in the QUERY against all of the genomes in the TARGET

    If you have many genomes (more than 1000), it will be faster to create the query database using FastAAI build_db,
    then search it against an existing target using this module than it is to do the same thing with an SQL query.

    If you give the same database as query and target, a special all vs. all search of the genomes in the database will be done.
    ''')
    parser.add_argument('-q', '--query', dest = 'query', default = None, help = 'Path to the query database. The genomes FROM the query will be searched against the genomes in the target database')
    parser.add_argument('-t', '--target', dest = 'target', default = None, help = 'Path to the target database.')

    parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
    parser.add_argument('--unlimited_resources', dest = "large_mem", action = 'store_true', help = 'Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')
    parser.add_argument('--mem', dest = "precision", default = "med", help = 'One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')

    #Unknown arguments are deliberately discarded.
    args, _unknown = parser.parse_known_args()

    return parser, args
2365
+
2366
+ #Control the query process for any DB-first query.
2367
def db_query(query, target, verbose, output, threads, do_stdev, precision, memory_efficient):
    """Validate inputs and run a database-vs-database query.

    Args:
        query: Path to the query database.
        target: Path to the target database.
        verbose: Passed through to the search routine.
        output: Output directory, prepared via ``prepare_directories``.
        threads: Number of processes passed to the search routine.
        do_stdev: Request standard deviations; this forces the SQL search.
        precision: One of "high", "med" or "low"; anything else falls back
            to "med" with a notice.
        memory_efficient: Despite the name, a true value selects
            ``do_query_vs_target_aai_only`` (unless ``do_stdev`` is set) and
            a false value selects ``do_query_vs_target_sql``.

    Exits the interpreter via ``sys.exit()`` when either database is missing
    or badly formatted, or when the output directory cannot be prepared.
    """
    print("")

    #Both paths are mandatory. This has to come first: os.path.exists(None)
    #raises TypeError, which used to hide this message.
    if target is None or query is None:
        print("I require both a query and a target database. FastAAI exiting.")
        sys.exit()

    #Sanity checks.
    if not os.path.exists(target):
        print("Target database not found. Exiting FastAAI")
        sys.exit()

    if not os.path.exists(query):
        print("Query database not found. Exiting FastAAI")
        sys.exit()

    #status = "exists"
    query_ok = assess_db(query)
    target_ok = assess_db(target)

    if query_ok != "exists":
        print("Query database improperly formatted. Exiting FastAAI")
        sys.exit()

    if target_ok != "exists":
        print("Target database improperly formatted. Exiting FastAAI")
        sys.exit()

    #Check if the database is querying against itself.
    if query == target:
        print("Performing an all vs. all query on", query)
    else:
        print("Querying", query, "against", target)

    #Ready the output directories as needed.
    #The databases are already created, the only state they can be in is P+H
    good_to_go = prepare_directories(output, "protein and HMM", "query")
    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    if precision not in ["high", "med", "low"]:
        print("Selected memory usage setting not found. Defaulting to med. Select one with --mem high/med/low.")
        precision = 'med'

    #Default
    if (not memory_efficient) or do_stdev:
        do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev)
    #Not default.
    else:
        do_query_vs_target_aai_only(query, target, threads, output, precision, verbose)

    print("")
2422
+
2423
+
2424
+ #Perform a minimal-memory query of a target database from input files. Lighter weight function for low memory
2425
def sql_query_opts():
    """Build the argument parser for the file-vs-database SQL query module.

    Arguments are parsed from the command line with ``parse_known_args``, so
    unrecognized arguments are ignored rather than rejected.

    Returns:
        Tuple ``(parser, args)`` of the ArgumentParser and the parsed namespace.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description='''
    This FastAAI module takes one or many genomes, proteins, or proteins and HMMs as a QUERY and searches them against an existing FastAAI database TARGET using SQL
    If you only have a few genomes - or not enough RAM to hold the entire target database in memory - this is probably the best option for you.

    If you provide FastAAI with genomes or only proteins (not proteins and HMMs), this FastAAI module will produce the required protein and HMM files as needed
    and place them in the output directory, just like it does while building a database.

    Once these inputs are ready to be queried against the database (each has both a protein and HMM file), they will be processed independently, 1 per thread at a time.

    Note: Protein and HMM files generated during this query can be supplied to build a FastAAI database from proteins and HMMs using the build_db module, without redoing preprocessing.
    ''')

    parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
    parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
    parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')

    parser.add_argument('--target', dest = 'target', default = None, help = 'A path to the FastAAI database you wish to use as the target')

    parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query and any protein or HMM files it has to generate. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    parser.add_argument('--genome_file', dest = 'gf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')
    parser.add_argument('--protein_file', dest = 'pf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')
    parser.add_argument('--hmm_file', dest = 'hf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')

    #Unknown arguments are deliberately discarded.
    args, _unknown = parser.parse_known_args()

    return parser, args
2459
+
2460
def sql_query_thread_starter(kmer_cts, protein_cts):
    """Pool initializer: publish the target lookups as module globals.

    Stores ``kmer_cts`` as ``target_kmer_cts`` and ``protein_cts`` as
    ``target_protein_counts`` so the query workers can read them.
    """
    global target_kmer_cts, target_protein_counts
    target_kmer_cts, target_protein_counts = kmer_cts, protein_cts
2465
+
2466
+
2467
def sql_query(genomes, proteins, hmms, gf, pf, hf, db_name, output, threads, verbose, do_stdev):
    """Query genome/protein/HMM inputs against an existing FastAAI database.

    Args:
        genomes, proteins, hmms: Input directories (any may be None).
        gf, pf, hf: Alternative inputs: files listing paths, one per line.
        db_name: Path to the target database; it must already exist.
        output: Output directory; results are reported under
            ``<output>/results``.
        threads: Number of worker processes in the multiprocessing pool.
        verbose: When true, print extra status lines and a progress bar.
        do_stdev: When true, dispatch to ``do_sql_query`` (with standard
            deviations); otherwise to ``do_sql_query_no_SD``.

    Returns:
        None. Exits via ``sys.exit()`` if the database is missing, the inputs
        are rejected, or the output directories cannot be prepared.
    """
    if not os.path.exists(db_name):
        print("")
        print("FastAAI can't find your database:", db_name)
        print("Are you sure that the path you've given to the database is correct and that the database exists?")
        print("FastAAI exiting.")
        print("")
        sys.exit()

    start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

    #If something failed, we stop.
    if start is None:
        sys.exit()

    good_to_go = prepare_directories(output, start, "query")

    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    print("")
    print("Preparing inputs for querying...")

    prepared_files = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output, threads = threads, verbose = verbose, db_name = db_name)

    if prepared_files is None:
        return None

    #We don't want to get more than we have to.
    query_accessions_detected = set()
    for file in prepared_files:
        query_accessions_detected.update(file.best_hits.values())
    query_accessions_detected = list(query_accessions_detected)

    if verbose:
        print("")
        print("Gathering database information...")

    database = fastaai_database(db_name)
    database.activate_connection()
    database.load_genome_index()
    database.load_accessions()
    database.close_connection()

    accession_index = generate_accessions_index()

    #Translate to indices.
    query_accessions_detected = [accession_index[a] for a in query_accessions_detected]

    #Per shared accession: one count per target genome (0 where absent).
    target_kmer_cts = {}
    for accession in np.intersect1d(database.accessions, query_accessions_detected):
        target_kmer_cts[accession] = np.zeros(len(database.genome_index), dtype = np.int16)
        for g in database.gak:
            if accession in database.gak[g]:
                target_kmer_cts[accession][g] = database.gak[g][accession]

    #Number of accessions recorded for each target genome.
    target_protein_counts = np.zeros(len(database.gak), dtype = np.int16)
    for g in database.gak:
        target_protein_counts[g] = len(database.gak[g])

    #Dropped before the database object is shipped to the workers.
    database.gak = None

    results_dir = os.path.normpath(output+"/results")
    formatted_dataset = [(database, file.basename, file.best_hits_kmers, results_dir) for file in prepared_files]

    if verbose:
        print("")
        print("-"*100)
        print("")

    count = 0
    total = len(formatted_dataset)

    print("Beginning AAI calculation")

    def draw_progress(done, rewind):
        #Best effort only: a failed console write must never abort the query.
        percentage = (done/total)*100 if done else 0
        try:
            if rewind:
                #Move the cursor up one line so the bar is redrawn in place.
                sys.stdout.write('\033[A')
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(done) + " of " + str(total) + ' done at '+curtime()+' )\n')
            sys.stdout.flush()
        except (OSError, ValueError):
            pass

    last_pct = 0
    if verbose:
        print("")
        draw_progress(0, False)

    worker = do_sql_query if do_stdev else do_sql_query_no_SD

    #The context manager terminates the workers if anything below raises.
    with multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,)) as pool:
        #Process as we go; the workers write their own result files.
        for _ in pool.imap(worker, formatted_dataset):
            if not verbose:
                continue
            count += 1
            half_pct = int(((count/total)*100)/2)
            if half_pct > last_pct or count == total:
                draw_progress(count, True)
                last_pct = half_pct

        pool.close()
        pool.join()

    if verbose:
        print("")
        print("-"*100)
        print("")

    if os.path.exists(output+"/temp"):
        os.rmdir(output+"/temp")

    print("FastAAI query complete! Results at:", results_dir)
    return None
2648
+
2649
+
2650
+ #Check to see if the file exists and is a valid fastAAI db
2651
def assess_db(path):
    """Report whether ``path`` is a usable FastAAI database, creating it if absent.

    Args:
        path: Filesystem path of the database.

    Returns:
        One of:
        "exists" - the file exists, has more than two tables and includes
            both "genome_index" and "genome_acc_kmer_counts";
        "wrong format" - the file exists but fails that check or cannot be read;
        "created" - the file did not exist and a new database was initialized;
        "unable to create" - the file did not exist and initialization failed.
    """
    if not os.path.exists(path):
        #NOTE: checking a missing path has the side effect of creating it.
        try:
            db = fastaai_database(path)
            db.activate_connection()
            db.initialize_parent_database()
            db.close_connection()
        except Exception:
            #Exception rather than a bare except, so Ctrl-C and sys.exit()
            #are not converted into a status string.
            return "unable to create"
        return "created"

    db = fastaai_database(path)
    try:
        db.activate_connection()
        sql = "SELECT name FROM sqlite_master WHERE type='table'"

        #Temporarily unwrap single-column rows so fetchall() yields plain names.
        db.cursor.row_factory = lambda cursor, row: row[0]
        tables = db.cursor.execute(sql).fetchall()
        db.cursor.row_factory = None

        db.close_connection()
    except Exception:
        return "wrong format"

    if len(tables) > 2 and "genome_index" in tables and "genome_acc_kmer_counts" in tables:
        return "exists"
    return "wrong format"
2684
+
2685
+ #Add one FastAAI DB to another FastAAI DB
2686
def merge_db_opts():
    """Build the argument parser for the database merge module.

    Arguments are parsed from the command line with ``parse_known_args``, so
    unrecognized arguments are ignored rather than rejected.

    Returns:
        Tuple ``(parser, args)`` of the ArgumentParser and the parsed namespace.
    """
    #The example below must spell "--recipient" with two dashes: argparse reads
    #"-recipient" as the short option "-r" with the attached value "ecipient".
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description='''
    This FastAAI module allows you to add the contents of one or more FastAAI databases to another.
    You must have at least two already-created FastAAI databases using the build_db module before this module can be used.

    Supply a comma-separated list of at least one donor database and a single recipient database.
    If the recipient already exists, then genomes in all the donors will be added to the recipient.
    If the recipient does not already exist, a new database will be created, and the contents of all the donors will be added to it.

    Example:
    FastAAI.py merge_db --donors databases/db1.db,databases/db2.db --recipient databases/db3.db --threads 3
    This command will create a new database called "db3.db", merge the data in db1.db and db2.db, and then add the merged data into db3.db

    Only the recipient database will be modified; the donors will be left exactly as they were before running this module.
    ''')

    parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')

    parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'Alternative way to supply donors. A file containing paths to the donor databases, 1 per line')

    parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')

    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')

    #Unknown arguments are deliberately discarded.
    args, _unknown = parser.parse_known_args()

    return parser, args
2716
+
2717
def merge_db_thread_starter(rev_index, per_db_accs):
    """Publish the merge lookups as module globals.

    Stores ``rev_index`` as ``reverse_genome_indicies`` and ``per_db_accs``
    as ``accs_per_db``.
    """
    global reverse_genome_indicies, accs_per_db
    reverse_genome_indicies, accs_per_db = rev_index, per_db_accs
2722
+
2723
+
2724
+
2725
def _merge_db_progress(count, total_counts, rewind):
    """Print one line of the merge progress bar; printing problems are ignored.

    When ``rewind`` is true the cursor is first moved up one line so the new
    bar overwrites the previous one.
    """
    try:
        percentage = (count/total_counts)*100
        if rewind:
            sys.stdout.write('\033[A')
            sys.stdout.flush()
        sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at ' + curtime() + " )\n")
        sys.stdout.flush()
    except Exception:
        #It's not really a big deal if the progress bar cannot be printed.
        pass


def merge_db(recipient, donors, donor_file, verbose, threads):
    """Merge the genomes of one or more donor databases into a recipient.

    Args:
        recipient: path to the recipient database.
        donors: comma-separated string of donor database paths. Replaced by
            the contents of ``donor_file`` when that is given.
        donor_file: optional path to a file listing donor paths, one per
            line; opened with agnostic_reader.
        verbose: when true, progress bars are printed for both phases.
        threads: number of worker processes used to merge accessions.

    Returns:
        None. Calls sys.exit() when no usable donor is found or when the
        recipient cannot be found or created. Any error raised while merging
        propagates to the caller after the temporary directory is removed.
    """
    if donor_file is not None:
        fh = agnostic_reader(donor_file)
        try:
            donors = [line.strip() for line in fh]
        finally:
            fh.close()
        #Blank lines are not database paths; drop them instead of reporting them as missing.
        donors = [d for d in donors if d]

    if donors is None or recipient is None:
        print("Either donor or target not given. FastAAI is exiting.")
        return None

    print("")

    if donor_file is None:
        donors = donors.split(",")

    valid_donors = []
    for d in donors:
        if not os.path.exists(d):
            print("Donor database", d, "not found! Are you sure the path is correct and this donor exists? This database will be skipped.")
            continue

        if d == recipient:
            print("Donor database", d, "is the same as the recipient. This database will be skipped.")
            continue

        check = assess_db(d)
        if check == "exists":
            if d not in valid_donors:
                valid_donors.append(d)
            else:
                print("It appears that database", d, "was already added to the list of donors. Did you type it twice in the list of donors? Skipping it.")
        elif check == "created":
            print("Donor database", d, "not found! Skipping.")
        else:
            print("Something was wrong with supplied database:", d+". A status check found:", check)

    if not valid_donors:
        print("None of the supplied donor databases were able to be accessed. FastAAI cannot continue if none of these databases are valid. Exiting.")
        sys.exit()

    recip_check = assess_db(recipient)

    if recip_check != "created" and recip_check != "exists":
        print("I couldn't find or create the recipient database at", recipient+".", "Does the folder you're trying to place this database in exist, and do you have permission to write files to it? FastAAI exiting.")
        sys.exit()

    for donor in valid_donors:
        print("Donor database:", donor, "will be added to recipient database:", recipient)

    recipient = fastaai_database(recipient)

    if recipient is None:
        print("I require both a valid donor and a recipient database. FastAAI exiting.")
        sys.exit()

    donor_dbs = [fastaai_database(d) for d in valid_donors]

    all_accessions = set()
    #Genome name -> genome index in the merged database.
    joint_genome_index = {}
    #Merged genome index -> protein count.
    joint_genome_counts = {}
    max_index = 0

    #Per database path: an array whose position is a genome index in that
    #database and whose value is the genome's index in the merged database.
    reverse_genome_indices = {}

    #Per database path: the accessions that database contains.
    accs_per_db = {}

    #Load recipient data, if any.
    if recip_check == "exists":
        recipient.activate_connection()
        recipient.just_accessions()
        recipient.load_genome_index()
        recipient.close_connection()

        all_accessions = all_accessions.union(recipient.accessions)
        accs_per_db[recipient.path] = recipient.accessions
        recipient.accessions = None
        max_index = len(recipient.genome_index)

        joint_genome_index = dict(recipient.genome_index)
        joint_genome_counts = dict(recipient.protein_counts_by_genome)

        #The recipient keeps its own indices; only the donors need to be translated.
        reverse_genome_indices[recipient.path] = np.array(sorted(recipient.genome_index.values()), dtype = np.int32)
        #Released so the database object stays small when it is sent to the workers.
        recipient.genome_index = None

    #Donors should always exist, never be created.
    for d in donor_dbs:
        d.activate_connection()
        d.just_accessions()
        d.load_genome_index()
        d.close_connection()
        accs_per_db[d.path] = d.accessions
        all_accessions = all_accessions.union(d.accessions)
        d.accessions = None

        translation = []
        #The translation array is looked up by donor genome index, so it must be
        #filled in donor-index order (assumed to run 0..COUNT-1), not in
        #genome-name order, which only coincides for some databases.
        for g in sorted(d.genome_index, key = lambda name: d.genome_index[name]):
            if g not in joint_genome_index:
                translation.append(max_index)
                joint_genome_index[g] = max_index
                #Map the counts on.
                joint_genome_counts[max_index] = d.protein_counts_by_genome[d.genome_index[g]]
                max_index += 1
            else:
                translation.append(joint_genome_index[g])

        reverse_genome_indices[d.path] = np.array(translation, dtype = np.int32)
        d.genome_index = None

    all_accessions = list(all_accessions)

    print("")
    print("Formatting data to add to database. Started at", curtime())

    temp_dir = tempfile.mkdtemp()
    try:
        acc_args = [(acc, donor_dbs, recipient, temp_dir) for acc in all_accessions]
        total_counts = len(acc_args)

        if verbose:
            print("")
            _merge_db_progress(0, total_counts, False)

        quiverfull = []
        pool = multiprocessing.Pool(threads, initializer=merge_db_thread_starter, initargs = (reverse_genome_indices, accs_per_db,))
        try:
            for count, result in enumerate(pool.imap_unordered(pull_and_merge_accession, acc_args), start = 1):
                quiverfull.append([result[0], result[1]])
                if verbose:
                    _merge_db_progress(count, total_counts, True)
            pool.close()
        except BaseException:
            #Do not leave workers running if a task failed or the run was interrupted.
            pool.terminate()
            raise
        finally:
            pool.join()

        print("")
        print("Adding data to final database. Started at", curtime())

        if verbose:
            print("")
            _merge_db_progress(0, total_counts, False)

        #NOTE(review): this connection is never closed here, matching the original
        #behaviour; confirm whether add_child_to_parent expects it to stay open.
        recipient.activate_connection()
        genome_list_update_sql = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
        genome_reindex = [(g, idx, joint_genome_counts[idx]) for g, idx in joint_genome_index.items()]

        recipient.cursor.executemany(genome_list_update_sql, genome_reindex)
        recipient.connection.commit()

        del genome_reindex

        for count, (acc, child) in enumerate(quiverfull, start = 1):
            recipient.add_child_to_parent(acc, child, genomes_too = True, update_gak = True)

            if verbose:
                _merge_db_progress(count, total_counts, True)
    finally:
        #Runs on success and on failure; failures still propagate so that a
        #broken merge is never reported as "Databases merged!".
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

    print("\nDatabases merged!")

    return None
2967
+
2968
def pull_and_merge_accession(args):
    """Merge one accession's tables from every source database into a temp DB.

    Runs inside a merge_db worker and reads the globals ``accs_per_db`` and
    ``reverse_genome_indicies`` installed by merge_db_thread_starter.

    Args:
        args: sequence of (accession index, donor databases, recipient
            database, temporary directory path).

    Returns:
        [table name of the accession, path of the temporary database].
    """
    name_lookup = generate_accessions_index()
    #Accession index -> SQL-safe table name.
    table_names = {name_lookup[raw]: raw.replace(".", "_") for raw in name_lookup}

    acc, donor_dbs, recipient, temp = args[0], args[1], args[2], args[3]

    acc_name = table_names[acc]
    acc_name_gens = acc_name + "_genomes"

    temp_db = fastaai_database(os.path.normpath(temp+"/"+acc_name+".db"))
    temp_db.activate_connection()

    table_definitions = (
        "CREATE TABLE IF NOT EXISTS " + acc_name + " (kmer INTEGER PRIMARY KEY, genomes array)",
        "CREATE TABLE IF NOT EXISTS " + acc_name + "_genomes (genome INTEGER PRIMARY KEY, kmers array)",
    )
    for definition in table_definitions:
        temp_db.cursor.execute(definition)
        temp_db.connection.commit()

    def holders():
        #Donors first, then the recipient; only databases that contain this accession.
        for candidate in donor_dbs:
            if acc in accs_per_db[candidate.path]:
                yield candidate
        #Recipient is not guaranteed to be in the accs per db - if it was created anew, it wouldn't be.
        if recipient.path in accs_per_db:
            if acc in accs_per_db[recipient.path]:
                yield recipient

    #Pass 1: kmer -> genomes, with genome indices translated to merged indices.
    genomes_by_kmer = {}
    for source in holders():
        source.activate_connection()

        for row in source.cursor.execute("SELECT * FROM " + acc_name).fetchall():
            kmer = row[0]
            translated_genomes = reverse_genome_indicies[source.path][row[1]]

            if kmer in genomes_by_kmer:
                translated_genomes = np.union1d(genomes_by_kmer[kmer], translated_genomes)
            genomes_by_kmer[kmer] = translated_genomes

        source.close_connection()

    #Byte-string these.
    kmer_rows = [(kmer, members.tobytes()) for kmer, members in genomes_by_kmer.items()]
    temp_db.cursor.executemany("INSERT INTO " + acc_name + " VALUES (?,?)", kmer_rows)
    temp_db.connection.commit()

    del genomes_by_kmer, kmer_rows

    #Pass 2: genome -> kmers.
    kmers_by_genome = {}
    for source in holders():
        source.activate_connection()

        for row in source.cursor.execute("SELECT * FROM " + acc_name_gens).fetchall():
            translated_genome = int(reverse_genome_indicies[source.path][row[0]])
            #Each genome gets added only once, no dupes; the first source seen wins.
            kmers_by_genome.setdefault(translated_genome, row[1])

        source.close_connection()

    #Byte-string these.
    genome_rows = [(genome, kmers.tobytes()) for genome, kmers in kmers_by_genome.items()]
    temp_db.cursor.executemany("INSERT INTO " + acc_name_gens + " VALUES (?,?)", genome_rows)
    temp_db.connection.commit()

    temp_db.close_connection()

    return [acc_name, temp_db.path]
3082
+
3083
+ #Query 1 genome vs. 1 target using Carlos' method - just needs query, target, threads
3084
def single_query_opts():
    """Create the single_query argument parser and parse the command line.

    Returns:
        (parser, args): the ArgumentParser and the namespace of recognised
        options. Unrecognised arguments are ignored.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='''
    This FastAAI module takes a single query genome, protein, or protein and HMM pair and a single target genome, protein, or protein and HMM pair as inputs and calculates AAI between the two.

    If you supply a genome as either query or target, a protein and HMM file will be made for the genome.
    If you supply a protein as either query or target, an HMM file will be made for it.
    If you supply both an HMM and protein, the search will start right away. You cannot provide only an HMM.

    No database will be built, and you cannot query multiple genomes with this module.

    If you wish to query multiple genomes against themselves in all vs. all AAI search, use aai_index instead.
    If you wish to query multiple genomes against multiple targets, use multi_query instead.
    ''')

    # (short flag, long flag, help text); dest is the long flag without dashes.
    input_options = (
        ('-qg', '--query_genome', 'Query genome'),
        ('-tg', '--target_genome', 'Target genome'),
        ('-qp', '--query_protein', 'Query protein'),
        ('-tp', '--target_protein', 'Target protein'),
        ('-qh', '--query_hmm', 'Query HMM'),
        ('-th', '--target_hmm', 'Target HMM'),
    )
    for short_flag, long_flag, help_text in input_options:
        parser.add_argument(short_flag, long_flag, dest=long_flag[2:], default=None, help=help_text)

    parser.add_argument(
        '-o', '--output', dest='output', default="FastAAI",
        help='The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    parser.add_argument(
        '--threads', dest='threads', type=int, default=1,
        help='The number of processors to use. Default 1.')
    parser.add_argument(
        '--verbose', dest='verbose', action='store_true',
        help='Print minor updates to console. Major updates are printed regardless.')

    parsed, _unrecognised = parser.parse_known_args()

    return parser, parsed
3117
+
3118
def do_single_query(input_file):
    """Run ``preprocess()`` on the given input and hand the same object back.

    Used as the worker function for the single_query process pool.
    """
    prepared = input_file
    prepared.preprocess()
    return prepared
3121
+
3122
def intersect_kmer_lists(pair):
    """Return the Jaccard index of two k-mer arrays.

    Args:
        pair: two-item sequence of one-dimensional numpy arrays. The union
            size is derived from the array lengths, so each array is
            presumably expected to hold unique values.

    Returns:
        intersection size divided by union size, or 0.0 when both arrays are
        empty (the original raised ZeroDivisionError in that case).
    """
    first, second = pair[0], pair[1]
    intersection = np.intersect1d(first, second).shape[0]
    union = first.shape[0] + second.shape[0] - intersection
    if union == 0:
        # Two empty k-mer sets share nothing; avoid dividing by zero.
        return 0.0
    return (intersection/union)
3126
+
3127
def kaai_to_aai(kaai):
    """Transform a kAAI (mean Jaccard) value into an estimated AAI percentage.

    Evaluates the fixed fitted curve
    ``(a + b * exp(-(c * ln(kaai)) ** (1 / d))) * 100`` with the constants
    below.
    """
    intercept = -0.3087057
    scale = 1.810741
    log_coefficient = -0.2607023
    root = 3.435

    decay = np.exp(-(log_coefficient * np.log(kaai))**(1/root))
    aai_hat = (intercept + scale * decay)*100

    return aai_hat
3132
+
3133
+ #This one's unique. It doesn't do anything with the DB, which means it doesn't access any other functionality outside of the input_file class. It just advances a pair of inputs in parallel and does intersections.
3134
def _single_query_input(genomes, proteins, hmms, output, verbose, role):
    """Build the input_file for one side of a single query, or exit on bad input.

    ``role`` is "query" or "target" and is only used in the error message
    printed when nothing was supplied for that side.
    """
    if genomes is None and proteins is None and hmms is None:
        print("Please supply a " + role + " genome, protein, or protein and HMM pair.")
        sys.exit()

    selected = None

    if genomes is not None:
        selected = input_file(genomes, output, verbose)
        selected.set_genome(genomes)
    if proteins is not None:
        if selected is not None:
            print("If you supply a genome for either query or target, you must supply ONLY the genome, not a genome and either a protein or HMM.")
            sys.exit()
        selected = input_file(proteins, output, verbose)
        selected.set_protein(proteins)
    if hmms is not None:
        if selected is None:
            print("If you supply an HMM for either query or target, you must also supply the protein from which the HMM was generated.")
            sys.exit()
        selected.set_hmm(hmms)

    return selected


def single_query(query_args, target_args, shared_args):
    """Estimate AAI between one query and one target without building a database.

    Args:
        query_args: (genome, protein, hmm) paths for the query; unused entries
            are None.
        target_args: (genome, protein, hmm) paths for the target.
        shared_args: (output directory, threads, verbose).

    Returns:
        None. The result line is written to
        ``<output>/results/<query>_vs_<target>.aai.txt``, the location
        announced on the console. Calls sys.exit() on invalid input or when
        the output directories cannot be prepared.
    """
    global kmer_index

    output, threads, verbose = shared_args[0], shared_args[1], shared_args[2]

    query = _single_query_input(query_args[0], query_args[1], query_args[2], output, verbose, "query")
    target = _single_query_input(target_args[0], target_args[1], target_args[2], output, verbose, "target")

    if query.basename == target.basename:
        print("You've selected the same query and target genome. The AAI is 100%.")
        print("FastAAI exiting.")
        return None

    statuses = ["genome", "protein", "protein and hmm"]
    query_stat = statuses.index(query.status)
    target_stat = statuses.index(target.status)
    # The earliest starting point of the two inputs is what gets passed to prepare_directories.
    minimum_status = statuses[min(query_stat, target_stat)]

    start_printouts = ["[Genome] Protein Protein+HMM", " Genome [Protein] Protein+HMM", "Genome Protein [Protein+HMM]"]

    print("")
    print("Query start: ", start_printouts[query_stat])
    print("Target start:", start_printouts[target_stat])
    print("")

    good_to_go = prepare_directories(output, minimum_status, "build")

    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    qname = query.basename
    tname = target.basename

    name = qname + "_vs_" + tname + ".aai.txt"
    results_dir = os.path.normpath(output) + "/results"
    result_path = results_dir + "/" + name
    print("Output will be located at", result_path)

    #Give the data for kmer indexing to the parallel processes
    kmer_index = create_kmer_index()

    #Only two inputs to advance, so two workers is all we need.
    pool = multiprocessing.Pool(max(1, min(threads, 2)))
    try:
        query, target = pool.map(do_single_query, [query, target])
    finally:
        pool.close()
        pool.join()

    accs_to_view = set(query.best_hits_kmers.keys()).intersection(set(target.best_hits_kmers.keys()))

    if not accs_to_view:
        # Without shared proteins there is nothing to average, and a pool of
        # zero workers cannot be created.
        print("The query and target share no proteins, so AAI cannot be estimated. FastAAI exiting.")
        return None

    seq_pairs = [[query.best_hits_kmers[acc], target.best_hits_kmers[acc]] for acc in accs_to_view]

    pool = multiprocessing.Pool(max(1, min(threads, len(seq_pairs))))
    try:
        results = np.array(pool.map(intersect_kmer_lists, seq_pairs))
    finally:
        pool.close()
        pool.join()

    jacc_mean = np.mean(results)
    jacc_std = np.std(results)
    actual_prots = len(results)
    aai_est = round(kaai_to_aai(jacc_mean), 2)

    # Estimates outside 30-90% are reported only as bounds.
    if aai_est > 90:
        aai_est = "> 90%"
    elif aai_est < 30:
        aai_est = "< 30%"

    # Write where the message above says the result will be; the directory is
    # created here because nothing visible in this function guarantees it exists.
    os.makedirs(results_dir, exist_ok=True)
    with open(result_path, "w") as result_file:
        print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, aai_est, file = result_file)

    print("FastAAI single query done! Estimated AAI:", aai_est)
3265
+
3266
def aai_index_opts():
    """Create the aai_index argument parser and parse the command line.

    Returns:
        (parser, args): the ArgumentParser and the namespace of recognised
        options. Unrecognised arguments are ignored.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='''
    This FastAAI module takes a set of genomes, proteins, or proteins and HMMs, creates a FastAAI database from them, and then executes an all vs. all AAI search of the genomes in the database
    ''')

    # Input directories: (short flag, long flag, dest, help text).
    directory_options = (
        ('-g', '--genomes', 'genomes', 'A directory containing genomes in FASTA format.'),
        ('-p', '--proteins', 'proteins', 'A directory containing protein amino acids in FASTA format.'),
        ('-m', '--hmms', 'hmms', 'A directory containing the results of an HMM search on a set of proteins.'),
    )
    for short_flag, long_flag, dest, help_text in directory_options:
        parser.add_argument(short_flag, long_flag, dest=dest, default=None, help=help_text)

    parser.add_argument(
        '-d', '--database', dest='db_name', default="FastAAI_database.sqlite.db",
        help='The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.')

    parser.add_argument(
        '-o', '--output', dest='output', default="FastAAI",
        help='The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    # Path-list files: (flag, dest, help text).
    list_file_options = (
        ('--genome_file', 'gf', 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.'),
        ('--protein_file', 'pf', 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.'),
        ('--hmm_file', 'hf', 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.'),
    )
    for flag, dest, help_text in list_file_options:
        parser.add_argument(flag, dest=dest, default=None, help=help_text)

    parser.add_argument(
        '--verbose', dest='verbose', action='store_true',
        help='Print minor updates to console. Major updates are printed regardless.')
    parser.add_argument(
        '--threads', dest='threads', type=int, default=1,
        help='The number of processors to use. Default 1.')

    parser.add_argument(
        '--do_stdev', dest="do_stdev", action='store_true',
        help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
    parser.add_argument(
        '--unlimited_resources', dest="large_mem", action='store_true',
        help='Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')
    parser.add_argument(
        '--mem', dest="precision", default="med",
        help='One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')

    parsed, _unrecognised = parser.parse_known_args()

    return parser, parsed
3295
+
3296
+ #Build a DB and query a dataset vs. self
3297
def aai_index(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose, do_stdev, memory_use, unlimited_resources):
    """Build a database with build_db, then query that database against itself.

    The database is expected at ``<output>/database/<db_name>``. If build_db
    reports failure, a message is printed and no query is run.

    Returns:
        None.
    """
    built = build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)
    if not built:
        print("Database could not be built. FastAAI exiting.")
        return None

    # The same path is used as both query and target for the all vs. all search.
    db_path = os.path.normpath(output + "/database/" + db_name)
    db_query(db_path, db_path, verbose, output, threads, do_stdev, memory_use, unlimited_resources)

    return None
3307
+
3308
+ #Build 2 DBs and query query DB vs target DB
3309
def multi_query_opts():
    """Create the multi_query argument parser and parse the command line.

    The query and target options are mirror images of each other, so they are
    generated from shared templates, in the order all query options of a
    group followed by all target options of that group.

    Returns:
        (parser, args): the ArgumentParser and the namespace of recognised
        options. Unrecognised arguments are ignored.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='''
    This FastAAI module takes a set of query genomes/proteins/proteins+HMMs and a set of target genomes/proteins/proteins+HMMs.
    Two FastAAI databases will be created, one for the query and one for the target, then the query database will have AAI calculated against the target database
    ''')

    roles = ("query", "target")

    # Input directories: (flag suffix, option noun, help template).
    directory_templates = (
        ('g', 'genomes', 'A directory containing {} genomes in FASTA format.'),
        ('p', 'proteins', 'A directory containing {} protein amino acids in FASTA format.'),
        ('m', 'hmms', 'A directory containing the results of an HMM search on the set of {} proteins.'),
    )
    for role in roles:
        for suffix, noun, help_template in directory_templates:
            parser.add_argument(
                '-' + role[0] + suffix, '--' + role + '_' + noun,
                dest=role + '_' + noun, default=None, help=help_template.format(role))

    for role in roles:
        parser.add_argument(
            '-' + role[0] + 'd', '--' + role + '_database',
            dest=role + '_db_name', default="FastAAI_" + role + "_database.sqlite.db",
            help='The name of the {} database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.'.format(role))

    parser.add_argument(
        '-o', '--output', dest='output', default="FastAAI",
        help='The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    # Path-list files: (option noun, dest suffix, help template).
    list_file_templates = (
        ('genome', 'gf', 'Alternative way to supply genomes. A file containing paths to your {} genome files, 1 per line.'),
        ('protein', 'pf', 'Alternative way to supply proteins. A file containing paths to your {} protein files, 1 per line.'),
        ('hmm', 'hf', 'Alternative way to supply HMMs. A file containing paths to your {} HMM files, 1 per line.'),
    )
    for role in roles:
        for noun, dest_suffix, help_template in list_file_templates:
            parser.add_argument(
                '--' + role + '_' + noun + '_file',
                dest=role[0] + dest_suffix, default=None, help=help_template.format(role))

    parser.add_argument(
        '--threads', dest='threads', type=int, default=1,
        help='The number of processors to use. Default 1.')
    parser.add_argument(
        '--verbose', dest='verbose', action='store_true',
        help='Print minor updates to console. Major updates are printed regardless.')

    parser.add_argument(
        '--do_stdev', dest="do_stdev", action='store_true',
        help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
    parser.add_argument(
        '--unlimited_resources', dest="large_mem", action='store_true',
        help='Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')
    parser.add_argument(
        '--mem', dest="precision", default="med",
        help='One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')

    parsed, _unrecognised = parser.parse_known_args()

    return parser, parsed
3348
+
3349
+ #Build 2 DBs and query query DB vs target DB
3350
def multi_query(query_arg_list, target_arg_list, shared_args):
    """Build a query database and a target database, then query one against the other.

    Args:
        query_arg_list: (genomes, proteins, hmms, gf, pf, hf, db_name) for the
            query database.
        target_arg_list: the same seven values for the target database.
        shared_args: (output, threads, verbose, do_stdev, mem, efficient).
    """
    output, threads, verbose, do_stdev, mem, efficient = [shared_args[i] for i in range(6)]

    # NOTE(review): the build_db results are not checked here, unlike in
    # aai_index; the query runs even if a build reported failure.
    database_paths = []
    for arg_list in (query_arg_list, target_arg_list):
        genomes, proteins, hmms, gf, pf, hf, db_name = [arg_list[i] for i in range(7)]
        database_paths.append(os.path.normpath(output + "/database/" + db_name))
        build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)

    accessible_name_query, accessible_name_target = database_paths
    db_query(accessible_name_query, accessible_name_target, verbose, output, threads, do_stdev, mem, efficient)
3363
+
3364
+ '''
3365
+ Main
3366
+ '''
3367
def _print_module_menu(intro_lines):
    """Print a blank line, the given intro lines, then the overview of all FastAAI modules."""
    menu_lines = [
        "",
        *intro_lines,
        "",
        "------------------------------------------- Quick Usage Options -------------------------------------------",
        "",
        " single_query | Quickly query ONE query genome against ONE target genome",
        " multi_query | Create a query DB and a target DB, then calculate query vs. target AAI",
        " aai_index | Create a database from multiple genomes and do an all vs. all AAI index of the genomes",
        "",
        "-------------------------------------- Database Construction Options --------------------------------------",
        "",
        " build_db | Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs",
        " merge_db | Add the contents of one FastAAI DB to another",
        "",
        "---------------------------------------------- Query Options ----------------------------------------------",
        "",
        " simple_query | Query a genome or protein (one or many) against an existing FastAAI database",
        " db_query | Query the genomes in one FastAAI database against the genomes in another FastAAI database",
        "",
        "-----------------------------------------------------------------------------------------------------------",
        "",
        " To select a module, enter 'FastAAI [module]' into the command line!",
        "",
    ]
    for line in menu_lines:
        print(line)


def _print_help_and_exit_if_bare(parser):
    """Show the module's help and exit when only the module name was given."""
    if len(sys.argv) < 3:
        # Assuming parser is an argparse.ArgumentParser (it is built outside
        # this block): print_help() writes the help itself and returns None,
        # so wrapping it in print() would emit a stray "None" line.
        parser.print_help()
        sys.exit()


def main():
    """Command-line entry point: pick the module named in sys.argv[1] and run it.

    With no module name, or an unrecognized one, the module overview is
    printed and the process exits. With a module name but no further
    arguments, that module's help is shown and the process exits.
    """
    #The currently supported modules.
    modules = ["build_db", "merge_db", "simple_query", "db_query", "single_query", "aai_index", "multi_query"]

    #Print modules if someone just types FastAAI
    if len(sys.argv) < 2:
        _print_module_menu([
            " Welcome to FastAAI",
            "",
            "",
            " Please select one of the following modules:",
        ])
        sys.exit()

    #This is the module selection
    selection = sys.argv[1]

    if selection not in modules:
        _print_module_menu([
            " I couldn't find the module you specified. Please select one of the following modules:",
        ])
        sys.exit()

    #################### Database build or add ########################
    if selection == "build_db":
        parser, opts = build_db_opts()
        _print_help_and_exit_if_bare(parser)

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
        #Input list based
        gf, pf, hf = opts.gf, opts.pf, opts.hf

        output = os.path.normpath(opts.output)

        build_db(genomes, proteins, hmms, opts.db_name, output, opts.threads, gf, pf, hf, opts.verbose)

    #################### Add two DBs ########################
    elif selection == "merge_db":
        parser, opts = merge_db_opts()
        _print_help_and_exit_if_bare(parser)

        merge_db(opts.recipient, opts.donors, opts.donor_file, opts.verbose, opts.threads)

    #################### Query files vs DB ########################
    elif selection == "simple_query":
        parser, opts = sql_query_opts()
        _print_help_and_exit_if_bare(parser)

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
        #Input list based
        gf, pf, hf = opts.gf, opts.pf, opts.hf

        sql_query(genomes, proteins, hmms, gf, pf, hf, opts.target, opts.output, opts.threads, opts.verbose, opts.do_stdev)

    #################### Query DB vs DB ###########################
    elif selection == "db_query":
        parser, opts = db_query_opts()
        _print_help_and_exit_if_bare(parser)

        # The CLI options "precision" and "large_mem" are passed on as the
        # mem and efficient arguments of db_query.
        mem = opts.precision
        efficient = opts.large_mem

        db_query(opts.query, opts.target, opts.verbose, opts.output, opts.threads, opts.do_stdev, mem, efficient)

    #################### One-pass functions #######################
    elif selection == "single_query":
        parser, opts = single_query_opts()
        _print_help_and_exit_if_bare(parser)

        shared_opts = [os.path.normpath(opts.output), opts.threads, opts.verbose]
        query_opts = [opts.query_genome, opts.query_protein, opts.query_hmm]
        target_opts = [opts.target_genome, opts.target_protein, opts.target_hmm]

        single_query(query_opts, target_opts, shared_opts)

    elif selection == "aai_index":
        parser, opts = aai_index_opts()
        _print_help_and_exit_if_bare(parser)

        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
        #Text file versions of genomes/proteins/hmms
        gf, pf, hf = opts.gf, opts.pf, opts.hf

        mem = opts.precision
        efficient = opts.large_mem

        aai_index(genomes, proteins, hmms, opts.db_name, opts.output, opts.threads, gf, pf, hf, opts.verbose, opts.do_stdev, mem, efficient)

    elif selection == "multi_query":
        parser, opts = multi_query_opts()
        _print_help_and_exit_if_bare(parser)

        # Positional layouts below must match what multi_query unpacks:
        # shared = (output, threads, verbose, do_stdev, mem, efficient)
        # query/target = (genomes, proteins, hmms, gf, pf, hf, db_name)
        shared_arg_list = [
            os.path.normpath(opts.output),
            opts.threads,
            opts.verbose,
            opts.do_stdev,
            opts.precision,
            opts.large_mem,
        ]
        query_arg_list = [
            opts.query_genomes,
            opts.query_proteins,
            opts.query_hmms,
            opts.qgf,
            opts.qpf,
            opts.qhf,
            opts.query_db_name,
        ]
        target_arg_list = [
            opts.target_genomes,
            opts.target_proteins,
            opts.target_hmms,
            opts.tgf,
            opts.tpf,
            opts.thf,
            opts.target_db_name,
        ]

        multi_query(query_arg_list, target_arg_list, shared_arg_list)

    return None
3654
+
3655
+
3656
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
3658
+
3659
+