miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,84 @@
1
+ # FastAAI
2
+ Fast estimation of Average Amino Acid Identities (AAI) for bacterial and viral genomes.
3
+ Includes a module for the classification of viral genomes.
4
+
5
+ ## Content Table
6
+ * [Features](#features)
7
+ * [Citation](#citation)
8
+ * [Requirements](#requirements)
9
+ * [Installation](#installation)
10
+ * [Usage](#usage)
11
+ * [FAQs](#faqs)
12
+ * [License](#license)
13
+
14
+ ## Features
15
+ Coming soon
16
+
17
+ ## Citation
18
+ Coming soon
19
+
20
+ ## Requirements:
21
+ - Programs:
22
+ - [HMMER](http://hmmer.org/) >= 3.1
23
+ - Python >=3.6,<3.9
24
+ - Base Python Modules:
25
+ - argparse
26
+ - datetime
27
+ - pathlib
28
+ - shutil
29
+ - subprocess
30
+ - gzip
31
+ - multiprocessing
32
+ - textwrap
33
+ - pickle
34
+ - tempfile
35
+ - sys
36
+ - functools
37
+ - Additional Python Modules:
38
+ - numpy
39
+
40
+ ## Installation
41
+ ### Conda Installation
42
+ It appears we need a bunch of pre-requisites to run FastAAI. No worries, their installation using Conda is quite easy. If you don't have Conda, you can install it as follows:
43
+ 1. Download Anaconda from https://www.anaconda.com/products/individual.
44
+ 2. Run `bash Anaconda-latest-Linux-x86_64.sh` and follow the installation instructions.
45
+ 3. Once installed you can run `conda -V`. You should get the version of conda that you installed.
46
+
47
+ Now, let's add the conda channels required to install the pre-requisites:
48
+
49
+ ```bash
50
+ conda config --add channels conda-forge
51
+ conda config --add channels bioconda
52
+ conda config --add channels cruizperez
53
+ ```
54
+
55
+ Then, create an environment for FastAAI:
56
+
57
+ ```bash
58
+ conda create -n fastaai hmmer prodigal numpy python=3.7 fastaai
59
+ ```
60
+
61
+ And activate it:
62
+
63
+ ```bash
64
+ conda activate fastaai
65
+ ```
66
+
67
+ Both main scripts (microbeannotator and microbeannotator_db_builder) should be in your path ready for use!
68
+ This should take care of most of the requirements except for Aspera Connect and KofamScan, which are a little more involved. Let's install those.
69
+
70
+ ### Pip Installation
71
+ #Once you have installed the pre-requisites to run MicrobeAnnotator, or if you already had them and you are not using Conda, you can install MicrobeAnnotator using pip:
72
+
73
+
74
+ ## Usage
75
+ ### Database creation
76
+
77
+
78
+ ## FAQs
79
+
80
+
81
+
82
+ ## License
83
+
84
+ See LICENSE
@@ -0,0 +1,1296 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ ########################################################################
5
+ # Author: Carlos Ruiz
6
+ # Institution: Georgia Institute of Technology
7
+ # Version: 0.8
8
+ # Date: March 02, 2020
9
+
10
+ # Description: Calculates the average amino acid identity using k-mers
11
+ from single copy genes. It is a faster version of the regular AAI (Blast
12
+ or Diamond) and the hAAI implemented in MiGA.
13
+ ########################################################################
14
+ """
15
+
16
+ ################################################################################
17
+ """---0.0 Import Modules---"""
18
+ import subprocess, argparse, multiprocessing, datetime, shutil
19
+ import textwrap, pickle, gzip
20
+ from random import randint
21
+ from pathlib import Path
22
+ from sys import argv
23
+ from sys import exit
24
+ from functools import partial
25
+ from os.path import realpath
26
+ import numpy
27
+ import tempfile
28
+
29
+
30
+ ################################################################################
31
+ """---1.0 Define Functions---"""
32
+ # --- Run prodigal ---
33
+ # ------------------------------------------------------
34
def run_prodigal(input_file):
    """
    Runs prodigal twice (default translation table and table 4), keeps the
    prediction set selected by total protein length and stores it as a
    '.faa' file next to the input genome.

    Arguments:
        input_file -- Path to genome FastA file

    Returns:
        output -- Path (str) to amino acid fasta result

    Raises:
        FileNotFoundError -- if prodigal did not produce its output files
                             (the prodigal exit status itself is not checked).
    """
    file_path = Path(input_file)
    filename = file_path.name
    folder = file_path.parent
    protein_output = folder / (filename + '.faa')
    output_11 = folder / (filename + '.faa.11')
    output_4 = folder / (filename + '.faa.4')
    # Both prodigal runs send their gene-coordinate output to the same
    # scratch file; only the protein translations (-a) are used afterwards.
    temp_output = folder / (filename + '.temp')

    # Predict proteins with the default translation table ...
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_11),
                     "-p", "meta", "-q", "-o", str(temp_output)])
    # ... and with translation table 4 (-g 4)
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_4),
                     "-p", "meta", "-g", "4", "-q", "-o", str(temp_output)])

    def _total_residues(fasta_path):
        # Sum of sequence-line lengths, ignoring FastA header lines.
        total = 0
        with open(fasta_path, 'r') as fasta:
            for line in fasta:
                if not line.startswith(">"):
                    total += len(line.strip())
        return total

    # Compare translation tables
    length_4 = _total_residues(output_4)
    length_11 = _total_residues(output_11)

    if length_11 == 0:
        # Avoid ZeroDivisionError when the default-table run predicted
        # nothing: table 4 wins only if it actually produced proteins.
        use_table_4 = length_4 > 0
    else:
        # Table 4 is kept only when it yields at least 10% more residues.
        use_table_4 = (length_4 / length_11) >= 1.1

    if use_table_4:
        shutil.copy(str(output_4), str(protein_output))
    else:
        shutil.copy(str(output_11), str(protein_output))

    # Remove intermediate files
    output_4.unlink()
    output_11.unlink()
    temp_output.unlink()

    # Remove stop '*' codons from protein sequences (headers are untouched)
    with open(protein_output, 'r') as final_protein, \
            open(temp_output, 'w') as temporal_file:
        for line in final_protein:
            if line.startswith(">"):
                temporal_file.write(line)
            else:
                temporal_file.write(line.replace('*', ''))
    shutil.copy(str(temp_output), str(protein_output))
    temp_output.unlink()

    return str(protein_output)
97
+ # ------------------------------------------------------
98
+
99
+ # --- Run prodigal for viruses ---
100
+ # ------------------------------------------------------
101
def run_prodigal_virus(input_file):
    """
    Runs prodigal once (meta mode) on a genome and stores the predicted
    proteins, with '*' stop symbols removed, as '<input>.faa'.

    Arguments:
        input_file -- Path to genome FastA file

    Returns:
        output -- Path (str) to amino acid fasta result
    """
    genome = Path(input_file)
    faa_path = genome.parent / (genome.name + '.faa')
    scratch_path = genome.parent / (genome.name + '.temp')

    # Single gene-prediction pass; the -o output is only a throwaway file.
    prodigal_cmd = ["prodigal", "-i", str(genome), "-a", str(faa_path),
                    "-p", "meta", "-q", "-o", str(scratch_path)]
    subprocess.call(prodigal_cmd)

    # Remove intermediate files
    scratch_path.unlink()

    # Rewrite the proteins without '*' characters; header lines pass through
    # unchanged. The scratch path is reused as the staging file.
    with open(faa_path, 'r') as source, open(scratch_path, 'w') as cleaned:
        for record_line in source:
            if not record_line.startswith(">"):
                record_line = record_line.replace('*', '')
            cleaned.write("{}".format(record_line))
    shutil.copy(str(scratch_path), str(faa_path))
    scratch_path.unlink()

    return str(faa_path)
135
+ # ------------------------------------------------------
136
+
137
+ # --- Run hmmsearch ---
138
+ # ------------------------------------------------------
139
def run_hmmsearch(input_file):
    """
    Searches a protein FastA file against the complete SCG HMM database
    shipped next to this script and writes the tabular hits to
    '<input>.hmm'.

    Arguments:
        input_file -- Path to protein FastA file

    Returns:
        output -- Path (str) to hmmsearch hits table
    """
    proteins = Path(input_file)
    hits_table = proteins.parent / (proteins.name + '.hmm')
    stdout_dump = proteins.parent / (proteins.name + '.temp')

    # The model database is resolved relative to the real location of this
    # script (symlinks followed), not the current working directory.
    model_db = (Path(realpath(__file__)).parent
                / "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm")

    command = ["hmmsearch",
               "--tblout", str(hits_table),
               "-o", str(stdout_dump),
               "--cut_tc",
               "--cpu", "1",
               str(model_db),
               str(proteins)]
    subprocess.call(command)

    # Only the --tblout table is kept; the regular report is discarded.
    stdout_dump.unlink()
    return str(hits_table)
162
+ # ------------------------------------------------------
163
+
164
+ # --- Filter HMM results for best matches ---
165
+ # ------------------------------------------------------
166
def hmm_filter(scg_hmm_file, keep):
    """
    Filters HMM results for best hits per protein

    Reads an hmmsearch tabular file and, for every protein (first column),
    keeps only the line with the highest score found in the ninth column.
    Ties are broken at random. The kept lines are written verbatim to
    '<scg_hmm_file>.filt'.

    Arguments:
        scg_hmm_file {file path} -- Path to HMM results file
        keep {bool} -- Keep HMM files (not used inside this function)

    Returns:
        outfile -- Path (str) to filtered file
    """
    hmm_path = Path(scg_hmm_file)
    outfile = hmm_path.parent / (hmm_path.name + '.filt')

    # protein name -> [best score so far, original line]
    hmm_hit_dict = {}
    with open(scg_hmm_file, 'r') as hit_file:
        for line in hit_file:
            if line.startswith("#"):
                continue
            hit = line.split()
            if not hit:
                # Blank lines carry no hit and would fail on hit[0].
                continue
            protein_name = hit[0]
            score = float(hit[8])
            current = hmm_hit_dict.get(protein_name)
            if current is None or score > current[0]:
                hmm_hit_dict[protein_name] = [score, line]
            elif score == current[0] and randint(0, 1) > 0:
                # random.randint needs both bounds; the former randint(2)
                # call raised TypeError whenever two hits tied.
                hmm_hit_dict[protein_name] = [score, line]

    with open(outfile, 'w') as output:
        for _, best_line in hmm_hit_dict.values():
            output.write(best_line)
    return str(outfile)
204
+ # ------------------------------------------------------
205
+
206
+ # --- Find Kmers from HMM results ---
207
+ # ------------------------------------------------------
208
def kmer_extract(input_files):
    """
    Extract kmers from protein files that have hits
    in the HMM searches.

    For every model (fourth column of the filtered HMM table) the protein
    with the highest score (ninth column) is selected, its kmers are read
    from the protein FastA file with a kmer size of 4, and the result is
    keyed by the model identifier.

    Arguments:
        input_files -- Sequence of three items:
            [0] name used as key of the returned dictionary
            [1] path to the protein FastA file
            [2] path to the filtered HMM results

    Returns:
        [genome_kmers] -- Dictionary {input_files[0]: {model: kmers}}.

    Raises:
        KeyError -- if a selected protein is absent from the FastA file.
    """
    final_filename = input_files[0]
    protein_file = input_files[1]
    scg_hmm_file = input_files[2]

    # model identifier -> [protein name, best score]
    positive_matches = {}
    with open(scg_hmm_file, 'r') as hmm_input:
        for line in hmm_input:
            if line.startswith("#"):
                continue
            fields = line.split()
            if not fields:
                continue
            protein_name = fields[0]
            model_name = fields[3]
            # Scores must be compared numerically; as strings "9.5" would
            # rank above "10.2".
            score = float(fields[8])
            previous = positive_matches.get(model_name)
            if previous is None or score > previous[1]:
                positive_matches[model_name] = [protein_name, score]

    positive_proteins = [match[0] for match in positive_matches.values()]
    protein_kmers = read_kmers_from_file(protein_file, positive_proteins, 4)

    # Build a new mapping instead of renaming keys in place, so a protein
    # selected for more than one model does not fail on a second lookup.
    scg_kmers = {}
    for accession, match in positive_matches.items():
        scg_kmers[accession] = protein_kmers[match[0]]

    genome_kmers = {final_filename: scg_kmers}
    return genome_kmers
244
+ # ------------------------------------------------------
245
+
246
+ # --- Extract kmers from protein sequences ---
247
+ # ------------------------------------------------------
248
def read_kmers_from_file(filename, positive_hits, ksize):
    """
    Read a protein FASTA file and build the kmers of the selected proteins.

    Arguments:
        filename {file path} -- Path to the protein FASTA file.
        positive_hits {iterable} -- Names of the proteins to keep. A name is
            the first whitespace-delimited token after ">" in the header.
        ksize {int} -- Kmer length handed to build_kmers.

    Returns:
        [scg_kmers] -- Dictionary {protein_name: kmers} with one entry per
            selected protein found in the file; kmers is the value
            returned by build_kmers.
    """
    # Hash lookups instead of scanning a list on every header line.
    wanted = set(positive_hits)
    scg_kmers = {}
    protein_name = ""
    sequence_parts = []
    store_sequence = False
    with open(filename) as fasta_in:
        for line in fasta_in:
            if line.startswith(">"):
                # A new record starts: flush the previous one if selected.
                if store_sequence:
                    scg_kmers[protein_name] = build_kmers("".join(sequence_parts), ksize)
                sequence_parts = []
                # Drop only the leading ">" so identifiers that contain a
                # ">" themselves are left intact.
                protein_name = line[1:].split()[0]
                store_sequence = protein_name in wanted
            elif store_sequence:
                sequence_parts.append(line.strip())
    # The last record has no following header to trigger the flush.
    if store_sequence:
        scg_kmers[protein_name] = build_kmers("".join(sequence_parts), ksize)
    return scg_kmers
274
+ # ------------------------------------------------------
275
+
276
+ # --- Extract kmers from viral protein sequences ---
277
+ # ------------------------------------------------------
278
def read_viral_kmers_from_file(input_information):
    """
    Collect the distinct kmers of every protein in a FASTA file.

    Arguments:
        input_information {list} -- Three items, in this order: the name
            used as the key of the returned dictionary, the path to the
            protein FASTA file, and the kmer size.

    Returns:
        [genome_kmers] -- Dictionary {final_filename: [kmers]} holding the
            distinct kmers found across all proteins of the file.
    """
    final_filename = input_information[0]
    protein_file = input_information[1]
    kmer_size = input_information[2]
    scg_kmers = set()
    sequence_parts = []
    seen_header = False

    def add_current_protein():
        # build_kmers returns "" for sequences shorter than the kmer size;
        # splitting that yields [""], which is not a kmer, so drop it.
        kmers = build_kmers("".join(sequence_parts), kmer_size)
        scg_kmers.update(kmer for kmer in kmers.split(",") if kmer)

    with open(protein_file) as fasta_in:
        for line in fasta_in:
            if line.startswith(">"):
                if seen_header:
                    add_current_protein()
                # Text before the first header is discarded.
                sequence_parts = []
                seen_header = True
            elif seen_header:
                sequence_parts.append(line.strip())
    # Flush the last protein: no header follows it, so without this step
    # its kmers would be silently lost.
    if seen_header:
        add_current_protein()
    genome_kmers = {final_filename: list(scg_kmers)}
    return genome_kmers
300
+ # ------------------------------------------------------
301
+
302
+ # --- Build Kmers ---
303
+ # ------------------------------------------------------
304
def build_kmers(sequence, ksize):
    """
    Return the distinct kmers of a sequence as one comma-separated string.

    Arguments:
        sequence {str} -- Sequence to slice into overlapping kmers.
        ksize {int} -- Length of each kmer.

    Returns:
        [kmers_set] -- Distinct kmers joined by ",", in no particular
            order. Empty string when the sequence is shorter than ksize.
    """
    last_start = len(sequence) - ksize
    distinct = {sequence[start:start + ksize] for start in range(last_start + 1)}
    return ','.join(distinct)
313
+ # ------------------------------------------------------
314
+
315
+ # --- Parse kAAI when query == reference ---
316
+ #Carlos, This function is not used with the new changes
317
+ # ------------------------------------------------------
318
def single_kaai_parser(query_id):
    """
    Calculates Jaccard similarities on kmers from the proteins shared
    between one query genome and every genome in the module-level
    query_kmer_dictionary (including the query itself).

    One tab-separated line is written per target: query, target, mean
    similarity, standard deviation (n - 1 denominator), number of shared
    proteins compared, size of the smaller protein set. The four value
    columns are "NA" when fewer than two proteins are shared.

    Arguments:
        query_id {str} -- Id of the query genome

    Returns:
        [Path to output] -- Path to output file
    """
    # mkdtemp, not TemporaryDirectory: a TemporaryDirectory object removes
    # its folder as soon as it is garbage collected, i.e. when this
    # function returns, which would delete the output before the caller
    # can read it. The caller owns the folder and its clean-up.
    running_folder = Path(tempfile.mkdtemp())
    temp_output = running_folder / Path(query_id).with_suffix('.aai.temp')

    query_scgs = query_kmer_dictionary[query_id]
    with open(temp_output, 'w') as out_file:
        for target_genome, target_scgs in query_kmer_dictionary.items():
            # Walk the smaller protein set; only shared proteins are used.
            if len(query_scgs) > len(target_scgs):
                final_scg_list = target_scgs
            else:
                final_scg_list = query_scgs
            jaccard_similarities = []
            for accession in final_scg_list:
                if accession not in query_scgs or accession not in target_scgs:
                    continue
                kmers_query = set(query_scgs[accession].split(','))
                kmers_target = target_scgs[accession].split(',')
                intersection = len(kmers_query.intersection(kmers_target))
                union = len(kmers_query.union(kmers_target))
                jaccard_similarities.append(intersection / union)
            n = len(jaccard_similarities)
            if n < 2:
                # Mean (n == 0) or sample variance (n == 1) is undefined.
                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    query_id, target_genome, "NA", "NA", "NA", "NA"))
                continue
            mean = sum(jaccard_similarities) / n
            var = sum((x - mean) ** 2 for x in jaccard_similarities) / (n - 1)
            out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                query_id, target_genome,
                round(mean, 4), round(var ** 0.5, 4),
                n, len(final_scg_list)))

    return temp_output
375
+ # ------------------------------------------------------
376
+
377
+ # --- Parse viral kAAI when query == reference ---
378
+ # ------------------------------------------------------
379
def single_virus_kaai_parser(query_id):
    """
    Calculates the Jaccard index between the kmers of one query genome and
    the kmers of every genome in the module-level query_kmer_dictionary
    (including the query itself).

    One tab-separated line is written per target: query, target, Jaccard
    index. The index is "NA" when both kmer collections are empty.

    Arguments:
        query_id {str} -- Id of the query genome

    Returns:
        [Path to output] -- Path to output file
    """
    # mkdtemp, not TemporaryDirectory: a TemporaryDirectory object removes
    # its folder as soon as it is garbage collected, i.e. when this
    # function returns, which would delete the output before the caller
    # can read it. The caller owns the folder and its clean-up.
    running_folder = Path(tempfile.mkdtemp())
    temp_output = running_folder / Path(query_id).with_suffix('.aai.temp')

    # The query kmers do not change between targets; build the set once.
    kmers_query = set(query_kmer_dictionary[query_id])
    with open(temp_output, 'w') as out_file:
        for target_genome, kmers_target in query_kmer_dictionary.items():
            intersection = len(kmers_query.intersection(kmers_target))
            union = len(kmers_query.union(kmers_target))
            if union == 0:
                # Nothing to compare: the index is undefined.
                out_file.write("{}\t{}\tNA\n".format(query_id, target_genome))
                continue
            jaccard_index = intersection / union
            out_file.write("{}\t{}\t{}\n".format(query_id, target_genome, jaccard_index))
    return temp_output
410
+ # ------------------------------------------------------
411
+
412
+ # --- Parse kAAI when query != reference ---
413
+ # ------------------------------------------------------
414
def double_kaai_parser(query_id):
    """
    Calculates Jaccard similarities on kmers from the proteins shared
    between one query genome (from the module-level query_kmer_dictionary)
    and every genome in the module-level ref_kmer_dictionary.

    One tab-separated line is written per reference: query, reference, mean
    similarity, standard deviation (n - 1 denominator), number of shared
    proteins compared, size of the smaller protein set. The four value
    columns are "NA" when fewer than two proteins are shared.

    Arguments:
        query_id {str} -- Id of the query genome

    Returns:
        [Path to output] -- Path to output file
    """
    # mkdtemp, not TemporaryDirectory: a TemporaryDirectory object removes
    # its folder as soon as it is garbage collected, i.e. when this
    # function returns, which would delete the output before the caller
    # can read it. The caller owns the folder and its clean-up.
    running_folder = Path(tempfile.mkdtemp())
    temp_output = running_folder / Path(query_id).with_suffix('.aai.temp')

    query_scgs = query_kmer_dictionary[query_id]
    with open(temp_output, 'w') as out_file:
        for target_genome, target_scgs in ref_kmer_dictionary.items():
            # Walk the smaller protein set; only shared proteins are used.
            if len(query_scgs) > len(target_scgs):
                final_scg_list = target_scgs
            else:
                final_scg_list = query_scgs
            jaccard_similarities = []
            for accession in final_scg_list:
                if accession not in query_scgs or accession not in target_scgs:
                    continue
                kmers_query = set(query_scgs[accession].split(','))
                kmers_target = target_scgs[accession].split(',')
                intersection = len(kmers_query.intersection(kmers_target))
                union = len(kmers_query.union(kmers_target))
                jaccard_similarities.append(intersection / union)
            n = len(jaccard_similarities)
            if n < 2:
                # Mean (n == 0) or sample variance (n == 1) is undefined.
                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    query_id, target_genome, "NA", "NA", "NA", "NA"))
                continue
            mean = sum(jaccard_similarities) / n
            var = sum((x - mean) ** 2 for x in jaccard_similarities) / (n - 1)
            out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                query_id, target_genome,
                round(mean, 4), round(var ** 0.5, 4),
                n, len(final_scg_list)))
    return temp_output
470
+ # ------------------------------------------------------
471
+
472
+ # --- Parse viral kAAI when query != reference ---
473
+ # ------------------------------------------------------
474
def double_viral_kaai_parser(query_id):
    """
    Calculates the Jaccard index between the kmers of one query genome
    (from the module-level query_kmer_dictionary) and the kmers of every
    genome in the module-level ref_kmer_dictionary.

    One tab-separated line is written per reference: query, reference,
    Jaccard index. The index is "NA" when both kmer collections are empty.

    Arguments:
        query_id {str} -- Id of the query genome

    Returns:
        [Path to output] -- Path to output file
    """
    # mkdtemp, not TemporaryDirectory: a TemporaryDirectory object removes
    # its folder as soon as it is garbage collected, i.e. when this
    # function returns, which would delete the output before the caller
    # can read it. The caller owns the folder and its clean-up.
    running_folder = Path(tempfile.mkdtemp())
    temp_output = running_folder / Path(query_id).with_suffix('.aai.temp')

    # The query kmers do not change between references; build the set once.
    kmers_query = set(query_kmer_dictionary[query_id])
    with open(temp_output, 'w') as out_file:
        for target_genome, kmers_target in ref_kmer_dictionary.items():
            intersection = len(kmers_query.intersection(kmers_target))
            union = len(kmers_query.union(kmers_target))
            if union == 0:
                # Nothing to compare: the index is undefined.
                out_file.write("{}\t{}\tNA\n".format(query_id, target_genome))
                continue
            jaccard_index = intersection / union
            out_file.write("{}\t{}\t{}\n".format(query_id, target_genome, jaccard_index))
    return temp_output
505
+ # ------------------------------------------------------
506
+
507
+ # --- Query == Reference initializer function ---
508
+ # ------------------------------------------------------
509
def single_dictionary_initializer(_dictionary):
    """
    Publish a kmer dictionary as the module-level query_kmer_dictionary,
    where the kAAI parser functions of this module look it up.

    Arguments:
        _dictionary {dict} -- Kmer dictionary to expose.
    """
    global query_kmer_dictionary
    query_kmer_dictionary = _dictionary
515
+ # ------------------------------------------------------
516
+
517
+ # --- Query != Reference initializer function ---
518
+ # ------------------------------------------------------
519
def two_dictionary_initializer(_query_dictionary, _ref_dictionary):
    """
    Publish the query and reference kmer dictionaries as the module-level
    query_kmer_dictionary and ref_kmer_dictionary, where the kAAI parser
    functions of this module look them up.

    Arguments:
        _query_dictionary {dict} -- Kmer dictionary of the queries.
        _ref_dictionary {dict} -- Kmer dictionary of the references.
    """
    global query_kmer_dictionary, ref_kmer_dictionary
    query_kmer_dictionary, ref_kmer_dictionary = _query_dictionary, _ref_dictionary
527
+ # ------------------------------------------------------
528
+
529
+ # --- Merge kmer dictionaries ---
530
+ # ------------------------------------------------------
531
def merge_dicts(dictionaries):
    """
    Shallow-merge any number of dicts into a new dict. When a key appears
    in several dicts, the value from the latest one wins.

    Arguments:
        dictionaries {iterable} -- Dicts to merge, in order of precedence.

    Returns:
        [result] -- New dictionary with all key/value pairs.
    """
    return {
        key: value
        for kmer_dictionary in dictionaries
        for key, value in kmer_dictionary.items()
    }
540
+ # ------------------------------------------------------
541
+
542
+
543
+ #My version 1 - numpy-ized
544
def single_kaai_parser_all_v_all(args):
    """
    Calculates Jaccard similarities on kmers from the proteins shared
    between one query genome and the genomes of the module-level
    query_kmer_dictionary, skipping the first skip_first_n of them.

    The kmers of every protein are expected to be sorted arrays of unique
    integers (the form intersect1d_searchsorted relies on).

    One tab-separated line is written per target: query, target, mean
    similarity, standard deviation, number of proteins compared, number of
    shared proteins. The query against itself is written as 1.0 / 0.0
    without computing anything. The four value columns are "NA" when no
    protein is shared.

    Arguments:
        args {list} -- Two items: the id of the query genome and the number
            of leading genomes of the dictionary to skip.

    Returns:
        [Path to output] -- Path to output file
    """
    query_id = args[0]
    skip_first_n = args[1]

    # mkdtemp, not TemporaryDirectory: a TemporaryDirectory object removes
    # its folder as soon as it is garbage collected, i.e. when this
    # function returns, which would delete the output before the caller
    # can read it. The caller owns the folder and its clean-up.
    running_folder = Path(tempfile.mkdtemp())
    temp_output = running_folder / Path(query_id).with_suffix('.aai.temp')

    query_scgs = query_kmer_dictionary[query_id]
    query_scg_list = numpy.array(list(query_scgs.keys()))

    with open(temp_output, 'w') as out_file:
        for target_genome in list(query_kmer_dictionary.keys())[skip_first_n:]:
            # If self, 1.0 similarity.
            if query_id == target_genome:
                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    query_id, target_genome, 1.0, 0.0,
                    len(query_scg_list), len(query_scg_list)))
                continue

            target_scgs = query_kmer_dictionary[target_genome]
            target_scg_list = numpy.array(list(target_scgs.keys()))
            # Only proteins detected in both genomes can be compared.
            final_scg_list = numpy.intersect1d(query_scg_list, target_scg_list)

            jaccard_similarities = []
            for accession in final_scg_list:
                kmers_query = query_scgs[accession]
                kmers_target = target_scgs[accession]
                # Search the shorter array inside the longer one.
                if len(kmers_query) < len(kmers_target):
                    shared = intersect1d_searchsorted(kmers_query, kmers_target)
                else:
                    shared = intersect1d_searchsorted(kmers_target, kmers_query)
                union = len(numpy.union1d(kmers_query, kmers_target))
                jaccard_similarities.append(len(shared) / union)

            if not jaccard_similarities:
                # numpy returns nan (with a warning) instead of raising on
                # empty input, so the no-shared-protein case is reported
                # explicitly as NA.
                out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    query_id, target_genome, "NA", "NA", "NA", "NA"))
                continue

            # numpy.float64 is the portable spelling; the numpy.float_
            # alias no longer exists in NumPy 2.
            jaccard_similarities = numpy.array(jaccard_similarities, dtype=numpy.float64)
            mean = numpy.mean(jaccard_similarities)
            # NOTE(review): numpy.std defaults to the population standard
            # deviation (ddof=0) -- confirm this is the intended estimator.
            std = numpy.std(jaccard_similarities)
            out_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                query_id, target_genome,
                round(mean, 4), round(std, 4),
                len(jaccard_similarities), len(final_scg_list)))
    return temp_output
651
+
652
+
653
def initializer_tracker(_dictionary1, _dictionary2):
    """
    Publish two dictionaries as the module-level kmer_dict and
    tracker_dict, where string_to_tup looks them up.

    Arguments:
        _dictionary1 {dict} -- Stored as kmer_dict.
        _dictionary2 {dict} -- Stored as tracker_dict.
    """
    global kmer_dict, tracker_dict
    kmer_dict, tracker_dict = _dictionary1, _dictionary2
661
+
662
+
663
def unique_kmers(kmer_dict):
    """
    Give every distinct kmer an integer index, numbered from 0 in order of
    first appearance.

    Arguments:
        kmer_dict {dict} -- Nested dictionary {genome: {protein: kmers}}
            where kmers is a comma-separated string.

    Returns:
        [tracker_dict] -- Dictionary {kmer: index}.
    """
    tracker_dict = {}
    for proteins in kmer_dict.values():
        for kmer_string in proteins.values():
            for kmer in kmer_string.split(','):
                # A plain membership test replaces the raise-and-catch on
                # every new kmer; the next free index is always the number
                # of kmers already registered.
                if kmer not in tracker_dict:
                    tracker_dict[kmer] = len(tracker_dict)
    return tracker_dict
681
+
682
+
683
def convert_kmers_to_indices(kmer_dict):
    """
    Replace, in place, the kmers of every protein by the matching array
    that string_to_tup returns for its genome.

    Arguments:
        kmer_dict {dict} -- Nested dictionary {genome: {protein: kmers}}.

    Returns:
        [kmer_dict] -- The same dictionary object, with converted values.
    """
    for genome, proteins in kmer_dict.items():
        # string_to_tup yields one array per protein, in dictionary order.
        index_arrays = string_to_tup(genome)
        for position, pf in enumerate(proteins):
            proteins[pf] = index_arrays[position]
    return kmer_dict
692
+
693
def string_to_tup(genome):
    """
    Translate the kmers of every protein of a genome into a sorted array
    of unique integer indices.

    Reads the module-level kmer_dict ({genome: {protein: comma-separated
    kmers}}) and tracker_dict ({kmer: index}).

    Arguments:
        genome {str} -- Key of the genome in kmer_dict.

    Returns:
        [sets] -- List with one int32 numpy array per protein, in the
            iteration order of kmer_dict[genome].
    """
    sets = []
    for kmer_string in kmer_dict[genome].values():
        indices = [tracker_dict[kmer] for kmer in kmer_string.split(",")]
        # numpy.unique already returns its result sorted, so no extra
        # sorting pass is needed.
        sets.append(numpy.unique(numpy.array(indices, dtype=numpy.int32)))
    return sets
704
+
705
def numpyize_kmers(kmer_dict):
    """
    Prepare a kmer dictionary for the all-vs-all comparison: index every
    distinct kmer, convert the dictionary values through
    convert_kmers_to_indices, and build one argument pair per genome.

    Arguments:
        kmer_dict {dict} -- Nested dictionary {genome: {protein: kmers}}.

    Returns:
        [kmer_dict, smartargs] -- The converted dictionary and a list of
            [genome_id, position] pairs, position being the index of the
            genome in the dictionary.
    """
    # Expose the dictionary as the module-level query_kmer_dictionary.
    single_dictionary_initializer(kmer_dict)

    print("Indexing unique kmers")
    kmer_index = unique_kmers(kmer_dict)

    # Expose dictionary and index to the conversion helpers.
    initializer_tracker(kmer_dict, kmer_index)

    print("Keying kmers")
    kmer_dict = convert_kmers_to_indices(kmer_dict)

    # Pair every genome with its own position in the dictionary.
    smartargs = [[genome_id, position] for position, genome_id in enumerate(kmer_dict)]

    print("Beginning AAI calculations now.")

    return kmer_dict, smartargs
726
+
727
+ #relies on assuming that the values in both of these arrays are unique and sorted, which I do in str_to_tup
728
def intersect1d_searchsorted(A, B):
    """
    Return the elements of A that are also present in B.

    Both inputs must be numpy arrays whose values are unique and sorted in
    ascending order; the result is not meaningful otherwise.

    Arguments:
        A {numpy array} -- Values to look up.
        B {numpy array} -- Values to search in.

    Returns:
        [numpy array] -- Elements of A found in B, in the order of A.
    """
    # Nothing can be shared with an empty array, and indexing into it
    # below would fail.
    if len(B) == 0:
        return A[:0]
    idx = numpy.searchsorted(B, A)
    # Values greater than every element of B get the out-of-range position
    # len(B); point them at slot 0 so the comparison below stays in bounds.
    # They can only match B[0] if they are equal to it.
    idx[idx == len(B)] = 0
    return A[B[idx] == A]
732
+
733
+
734
+ ################################################################################
735
+ """---2.0 Main Function---"""
736
+
737
+ def main():
738
+ # Setup parser for arguments.
739
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
740
+ description='''This script calculates the average amino acid identity using k-mers\n'''
741
+ '''from single copy genes. It is a faster version of the regular AAI '''
742
+ '''(Blast or Diamond) and the hAAI implemented in MiGA.'''
743
+ '''Usage: ''' + argv[0] + ''' -p [Protein Files] -t [Threads] -o [Output]\n'''
744
+ '''Global mandatory parameters: -g [Genome Files] OR -p [Protein Files] OR -s [SCG HMM Results] -o [AAI Table Output]\n'''
745
+ '''Optional Database Parameters: See ''' + argv[0] + ' -h')
746
+ mandatory_options = parser.add_argument_group('Mandatory i/o options. You must select an option for the queries and one for the references.')
747
+ mandatory_options.add_argument('--qg', dest='query_genomes', action='store', required=False,
748
+ help='File with list of query genomes.')
749
+ mandatory_options.add_argument('--qp', dest='query_proteins', action='store', required=False,
750
+ help='File with list of query proteins.')
751
+ mandatory_options.add_argument('--qh', dest='query_hmms', action='store', required=False,
752
+ help=textwrap.dedent('''
753
+ File with list of pre-computed query hmmsearch results.
754
+ If you select this option you must also provide a file with
755
+ a list of protein files for the queries (with --qp).
756
+ '''))
757
+ mandatory_options.add_argument('--qd', dest='query_database', action='store', required=False,
758
+ help='File with list of pre-indexed query databases.')
759
+ mandatory_options.add_argument('--rg', dest='reference_genomes', action='store', required=False,
760
+ help='File with list of reference genomes.')
761
+ mandatory_options.add_argument('--rp', dest='reference_proteins', action='store', required=False,
762
+ help='File with list of reference proteins.')
763
+ mandatory_options.add_argument('--rh', dest='reference_hmms', action='store', required=False,
764
+ help=textwrap.dedent('''
765
+ File with list of pre-computed reference hmmsearch results.
766
+ If you select this option you must also provide a file with
767
+ a list of protein files for the references (with --qp).
768
+ '''))
769
+ mandatory_options.add_argument('--rd', dest='reference_database', action='store', required=False,
770
+ help='File with list of pre-indexed reference databases.')
771
+ mandatory_options.add_argument('-o', '--output', dest='output', action='store', required=False, help='Output file. By default kaai_comparisons.txt')
772
+ additional_input_options = parser.add_argument_group('Behavior modification options.')
773
+ additional_input_options.add_argument('-e', '--ext', dest='extension', action='store', required=False,
774
+ help='Extension to remove from original filename, e.g. ".fasta"')
775
+ additional_input_options.add_argument('-i', '--index', dest='index_db', action='store_true', required=False,
776
+ help='Only index and store databases, i.e., do not perform comparisons.')
777
+ misc_options = parser.add_argument_group('Miscellaneous options')
778
+ misc_options.add_argument('--virus', dest='virus', action='store_true', required=False,
779
+ help='Toggle virus-virus comparisons. Use only with viral genomes or proteins.')
780
+ misc_options.add_argument('-t', '--threads', dest='threads', action='store', default=1, type=int, required=False,
781
+ help='Number of threads to use, by default 1')
782
+ misc_options.add_argument('-k', '--keep', dest='keep', action='store_false', required=False,
783
+ help='Keep intermediate files, by default true')
784
+
785
+ args = parser.parse_args()
786
+
787
+ query_genomes = args.query_genomes
788
+ reference_genomes = args.reference_genomes
789
+ query_proteins = args.query_proteins
790
+ reference_proteins = args.reference_proteins
791
+ query_hmms = args.query_hmms
792
+ reference_hmms = args.reference_hmms
793
+ query_database = args.query_database
794
+ reference_database = args.reference_database
795
+ output = args.output
796
+ if output == None:
797
+ output == "kaai_comparisons.txt"
798
+ extension = args.extension
799
+ index_db = args.index_db
800
+ threads = args.threads
801
+ keep = args.keep
802
+ virus = args.virus
803
+
804
+ print("kAAI started on {}".format(datetime.datetime.now()))
805
+ # Check user input
806
+ # ------------------------------------------------------
807
+ # Check if no query was provided
808
+ if query_genomes == None and query_proteins == None and query_hmms == None and query_database == None:
809
+ exit('Please prove a file with a list of queries, e.g., --qg, --qp, --qh, or --qd)')
810
+ # Check query inputs
811
+ query_input = None
812
+ if query_hmms != None:
813
+ if virus == True:
814
+ exit("If you are comparing viruses, please start from the genome or protein files.")
815
+ query_input = query_hmms
816
+ if query_proteins != None:
817
+ print("Starting from query hmmsearch results.")
818
+ print("You also provided the list of protein files used for hmmsearch.")
819
+ elif query_proteins == None:
820
+ print("You chose to start from pre-computed hmmsearch results for your queries (--qh).")
821
+ print("However, I also need the location of the query proteins used for hmmsearch.")
822
+ exit("Please provide them with --qp.")
823
+ elif query_proteins != None:
824
+ query_input = query_proteins
825
+ print("Starting from query proteins.")
826
+ elif query_genomes != None:
827
+ query_input = query_genomes
828
+ print("Starting from query genomes.")
829
+ elif query_database != None:
830
+ query_input = query_database
831
+ print("Starting from the pre-indexed query database.")
832
+ # Check if no reference was provided
833
+ if reference_genomes == None and reference_proteins == None and reference_hmms == None and reference_database == None:
834
+ exit('Please prove a file with a list of references, e.g., --rg, --rp, --rh, or --rd)')
835
+ # Check reference inputs
836
+ reference_input = None
837
+ if reference_hmms != None:
838
+ if virus == True:
839
+ exit("If you are comparing viruses, please start from the genome or protein files.")
840
+ reference_input = reference_hmms
841
+ if reference_proteins != None:
842
+ print("Starting from reference hmmsearch results.")
843
+ print("You also provided the list of protein files used for hmmsearch.")
844
+ elif reference_proteins == None:
845
+ print("You chose to start from pre-computed hmmsearch results for your references (--rh).")
846
+ print("However, I also need the location of the query proteins used for hmmsearch.")
847
+ exit("Please provide them with --rp.")
848
+ elif reference_proteins != None:
849
+ reference_input = reference_proteins
850
+ print("Starting from reference proteins.")
851
+ elif reference_genomes != None:
852
+ reference_input = reference_genomes
853
+ print("Starting from reference genomes.")
854
+ elif reference_database != None:
855
+ reference_input = reference_database
856
+ print("Starting from the pre-indexed reference database.")
857
+ # ------------------------------------------------------
858
+
859
+ # Check if queries are the same as references (an all-vs-all comparison)
860
+ # ------------------------------------------------------
861
+ same_inputs = False
862
+ if query_input == reference_input:
863
+ same_inputs = True
864
+ if same_inputs == True:
865
+ print('You specified the same query and reference files.')
866
+ print('I will perform an all vs all comparison :)')
867
+ # ------------------------------------------------------
868
+
869
+ #* Database Parsing is the same regardless of bacterial or viral genomes
870
+ # If using pre-indexed databases, check if they are valid files.
871
+ # ------------------------------------------------------
872
+ # If any of the starting points is from database, then store the
873
+ # kmer structures in the corresponding dictionaries.
874
+ # Otherwise read the file list and get the filenames
875
+ query_kmer_dict = None
876
+ query_kmer_dict_list = []
877
+ reference_kmer_dict = None
878
+ reference_kmer_dict_list = []
879
+ # If starting from database and query == reference
880
+ if same_inputs == True:
881
+ if query_database != None:
882
+ with open(query_database) as query_database_files:
883
+ for db_location in query_database_files:
884
+ if Path(db_location.strip()).is_file():
885
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
886
+ temp_dict = pickle.load(database_handle)
887
+ if isinstance(temp_dict,dict):
888
+ query_kmer_dict_list.append(temp_dict)
889
+ #Carlos, this line serves no purpose but does take a bunch of time and mem.
890
+ #print(query_kmer_dict_list)
891
+ else:
892
+ exit("One of the database files appear to have the wrong format. Please provide a correctly formated databases.")
893
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
894
+ else:
895
+ # If the inputs are not the same:
896
+ # If query and ref are provided
897
+ if query_database != None and reference_database != None:
898
+ with open(query_database, 'r') as query_database_files:
899
+ for db_location in query_database_files:
900
+ if Path(db_location.strip()).is_file():
901
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
902
+ temp_dict = pickle.load(database_handle)
903
+ if isinstance(temp_dict,dict):
904
+ query_kmer_dict_list.append(temp_dict)
905
+ else:
906
+ exit("One of the query database files appear to have the wrong format. Please provide a correctly formated databases.")
907
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
908
+ with open(reference_database) as reference_database_files:
909
+ for db_location in reference_database_files:
910
+ if Path(db_location.strip()).is_file():
911
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
912
+ temp_dict = pickle.load(database_handle)
913
+ if isinstance(temp_dict,dict):
914
+ reference_kmer_dict_list.append(temp_dict)
915
+ else:
916
+ exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated databases.")
917
+ reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
918
+ # If only the query has a db
919
+ elif query_database != None and reference_database == None:
920
+ with open(query_database) as query_database_files:
921
+ for db_location in query_database_files:
922
+ if Path(db_location.strip()).is_file():
923
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
924
+ temp_dict = pickle.load(database_handle)
925
+ if isinstance(temp_dict,dict):
926
+ query_kmer_dict_list.append(temp_dict)
927
+ else:
928
+ exit("One of the query database files appear to have the wrong format. Please provide a correctly formated databases.")
929
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
930
+ # If only the reference has a db
931
+ elif query_database == None and reference_database != None:
932
+ with open(reference_database) as reference_database_files:
933
+ for db_location in reference_database_files:
934
+ if Path(db_location.strip()).is_file():
935
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
936
+ temp_dict = pickle.load(database_handle)
937
+ if isinstance(temp_dict,dict):
938
+ reference_kmer_dict_list.append(temp_dict)
939
+ else:
940
+ exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated databases.")
941
+ reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
942
+ # ------------------------------------------------------
943
+
944
+ # Get files from the query and reference lists and then
945
+ # create a dictionary with resulting filenames and a list with dictionary keys
946
+ # The structure of the dictionary is:
947
+ # original_query, proteins, hmms, filtered_hmms
948
+ # ------------------------------------------------------
949
+ # First parse the query:
950
+ query_list = []
951
+ query_file_names = {}
952
+ # For bacterial genomes
953
+ if virus == False:
954
+ if query_database != None:
955
+ pass
956
+ else:
957
+ with open(query_input, 'r') as query_input_fh:
958
+ for line in query_input_fh:
959
+ query_list.append(line.strip())
960
+ for index, query in enumerate(query_list):
961
+ query_name = str(Path(query).name)
962
+ if extension != None:
963
+ query_name = query_name.replace(extension, "")
964
+ if query_hmms != None:
965
+ query_protein_list = []
966
+ with open(query_proteins, 'r') as query_protein_fh:
967
+ for line in query_protein_fh:
968
+ query_protein_list.append(line.strip())
969
+ query_file_names[query_name] = [None, query_protein_list[index], query, query + '.filt']
970
+ elif query_proteins != None:
971
+ query_file_names[query_name] = [None, query, query + '.hmm', query + '.hmm.filt']
972
+ elif query_genomes != None:
973
+ query_file_names[query_name] = [query, query + '.faa', query + '.faa.hmm', query + '.faa.hmm.filt']
974
+ # For viral genomes
975
+ else:
976
+ if query_database != None:
977
+ pass
978
+ else:
979
+ with open(query_input, 'r') as query_input_fh:
980
+ for line in query_input_fh:
981
+ query_list.append(line.strip())
982
+ for index, query in enumerate(query_list):
983
+ query_name = str(Path(query).name)
984
+ if extension != None:
985
+ query_name = query_name.replace(extension, "")
986
+ if query_proteins != None:
987
+ query_file_names[query_name] = [None, query]
988
+ elif query_genomes != None:
989
+ query_file_names[query_name] = [query, query + '.faa']
990
+
991
+ # Then parse the references:
992
+ reference_list = []
993
+ reference_file_names = {}
994
+ if same_inputs == True:
995
+ pass
996
+ else:
997
+ # For bacterial genomes
998
+ if virus == False:
999
+ if reference_database != None:
1000
+ pass
1001
+ else:
1002
+ with open(reference_input, 'r') as reference_input_fh:
1003
+ for line in reference_input_fh:
1004
+ reference_list.append(line.strip())
1005
+ for index, reference in enumerate(reference_list):
1006
+ reference_name = str(Path(reference).name)
1007
+ if extension != None:
1008
+ reference_name = reference_name.replace(extension, "")
1009
+ if reference_hmms != None:
1010
+ reference_protein_list = []
1011
+ with open(reference_proteins, 'r') as reference_protein_fh:
1012
+ for line in reference_protein_fh:
1013
+ reference_protein_list.append(line.strip())
1014
+ reference_file_names[reference_name] = [None, reference_protein_list[index], reference, reference + '.filt']
1015
+ elif reference_proteins != None:
1016
+ reference_file_names[reference_name] = [None, reference, reference + '.hmm', reference + '.hmm.filt']
1017
+ elif query_genomes != None:
1018
+ reference_file_names[reference_name] = [reference, reference + '.faa', reference + '.faa.hmm', reference + '.faa.hmm.filt']
1019
+ # For viral genomes
1020
+ else:
1021
+ if reference_database != None:
1022
+ pass
1023
+ else:
1024
+ with open(reference_input, 'r') as reference_input_fh:
1025
+ for line in reference_input_fh:
1026
+ reference_list.append(line.strip())
1027
+ for index, reference in enumerate(reference_list):
1028
+ reference_name = str(Path(reference).name)
1029
+ if extension != None:
1030
+ reference_name = reference_name.replace(extension, "")
1031
+ if reference_proteins != None:
1032
+ reference_file_names[reference_name] = [None, reference]
1033
+ elif query_genomes != None:
1034
+ reference_file_names[reference_name] = [reference, reference + '.faa']
1035
+ # ------------------------------------------------------
1036
+
1037
+ # Pre-index and store databases
1038
+ # ------------------------------------------------------
1039
+ # Pre-index queries
1040
+ if query_kmer_dict == None:
1041
+ print("Processing queries...")
1042
+ # If using bacterial genomes
1043
+ if virus == False:
1044
+ if query_hmms != None:
1045
+ query_hmm_results = query_list
1046
+ elif query_proteins != None:
1047
+ query_protein_files = query_list
1048
+ print("Searching against HMM models...")
1049
+ try:
1050
+ pool = multiprocessing.Pool(threads)
1051
+ query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
1052
+ finally:
1053
+ pool.close()
1054
+ pool.join()
1055
+ elif query_genomes != None:
1056
+ print("Predicting proteins...")
1057
+ # Predict query proteins
1058
+ try:
1059
+ pool = multiprocessing.Pool(threads)
1060
+ query_protein_files = pool.map(run_prodigal, query_list)
1061
+ finally:
1062
+ pool.close()
1063
+ pool.join()
1064
+ print("Done!")
1065
+ print("Searching against HMM models...")
1066
+ # Run hmmsearch against proteins predicted
1067
+ try:
1068
+ pool = multiprocessing.Pool(threads)
1069
+ query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
1070
+ finally:
1071
+ pool.close()
1072
+ pool.join()
1073
+ print("Done!")
1074
+ print("Filtering query hmmsearch results...")
1075
+ # Filter query HMM search results
1076
+ try:
1077
+ pool = multiprocessing.Pool(threads)
1078
+ pool.map(partial(hmm_filter, keep=keep), query_hmm_results)
1079
+ finally:
1080
+ pool.close()
1081
+ pool.join()
1082
+ print("Extracting kmers from query proteins...")
1083
+ # Finding kmers for all queries
1084
+ query_information = []
1085
+ for name, values in query_file_names.items():
1086
+ query_information.append((name, values[1], values[3]))
1087
+ try:
1088
+ pool = multiprocessing.Pool(threads)
1089
+ kmer_results = pool.map(kmer_extract, query_information)
1090
+ finally:
1091
+ pool.close()
1092
+ pool.join()
1093
+ query_kmer_dict = merge_dicts(kmer_results)
1094
+ del kmer_results
1095
+ # If using viral genomes
1096
+ else:
1097
+ if query_genomes != None:
1098
+ print("Predicting proteins...")
1099
+ # Predict query proteins
1100
+ try:
1101
+ pool = multiprocessing.Pool(threads)
1102
+ query_protein_files = pool.map(run_prodigal_virus, query_list)
1103
+ finally:
1104
+ pool.close()
1105
+ pool.join()
1106
+ print("Done!")
1107
+ elif query_proteins != None:
1108
+ query_protein_files = query_list
1109
+ print("Extracting kmers from query proteins...")
1110
+ query_information = []
1111
+ for name, values in query_file_names.items():
1112
+ query_information.append((name, values[1], 4))
1113
+ try:
1114
+ pool = multiprocessing.Pool(threads)
1115
+ kmer_results = pool.map(read_viral_kmers_from_file, query_information)
1116
+ finally:
1117
+ pool.close()
1118
+ pool.join()
1119
+ query_kmer_dict = merge_dicts(kmer_results)
1120
+ del kmer_results
1121
+
1122
+ # Pre-index references (if different from queries)
1123
+ if same_inputs == False and reference_kmer_dict == None:
1124
+ print("Processing references...")
1125
+ # If using bacterial genomes
1126
+ if virus == False:
1127
+ if reference_hmms != None:
1128
+ reference_hmm_results = reference_list
1129
+ elif reference_proteins != None:
1130
+ reference_protein_files = reference_list
1131
+ print("Searching against HMM models... ")
1132
+ try:
1133
+ pool = multiprocessing.Pool(threads)
1134
+ reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
1135
+ finally:
1136
+ pool.close()
1137
+ pool.join()
1138
+ if reference_genomes != None:
1139
+ print("Predicting proteins...")
1140
+ # Predict reference proteins
1141
+ try:
1142
+ pool = multiprocessing.Pool(threads)
1143
+ reference_protein_files = pool.map(run_prodigal, reference_list)
1144
+ finally:
1145
+ pool.close()
1146
+ pool.join()
1147
+ print("Done!")
1148
+ print("Searching against HMM models...")
1149
+ # Run hmmsearch against proteins predicted
1150
+ try:
1151
+ pool = multiprocessing.Pool(threads)
1152
+ reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
1153
+ finally:
1154
+ pool.close()
1155
+ pool.join()
1156
+ print("Done!")
1157
+ print("Filtering reference hmmsearch results...")
1158
+ # Filter reference HMM search results
1159
+ try:
1160
+ pool = multiprocessing.Pool(threads)
1161
+ pool.map(partial(hmm_filter, keep=keep), reference_hmm_results)
1162
+ finally:
1163
+ pool.close()
1164
+ pool.join()
1165
+ print("Extracting kmers from reference proteins...")
1166
+ # Finding kmers for all references
1167
+ reference_information = []
1168
+ for name, values in reference_file_names.items():
1169
+ reference_information.append((name, values[1], values[3]))
1170
+ try:
1171
+ pool = multiprocessing.Pool(threads)
1172
+ kmer_results = pool.map(kmer_extract, reference_information)
1173
+ finally:
1174
+ pool.close()
1175
+ pool.join()
1176
+ reference_kmer_dict = merge_dicts(kmer_results)
1177
+ del kmer_results
1178
+ # If using viral genomes
1179
+ else:
1180
+ if query_genomes != None:
1181
+ print("Predicting proteins...")
1182
+ # Predict query proteins
1183
+ try:
1184
+ pool = multiprocessing.Pool(threads)
1185
+ query_protein_files = pool.map(run_prodigal, query_list)
1186
+ finally:
1187
+ pool.close()
1188
+ pool.join()
1189
+ print("Done!")
1190
+ elif query_proteins != None:
1191
+ query_protein_files = query_list
1192
+ print("Extracting kmers from query proteins...")
1193
+ reference_information = []
1194
+ for name, values in reference_file_names.items():
1195
+ reference_information.append((name, values[1], 4))
1196
+ try:
1197
+ pool = multiprocessing.Pool(threads)
1198
+ kmer_results = pool.map(read_viral_kmers_from_file, reference_information)
1199
+ finally:
1200
+ pool.close()
1201
+ pool.join()
1202
+ query_kmer_dict = merge_dicts(kmer_results)
1203
+ del kmer_results
1204
+ # ------------------------------------------------------
1205
+
1206
+ # Create the database(s) and compress it (them)
1207
+ # ------------------------------------------------------
1208
+ if same_inputs == True and query_database == None:
1209
+ print("Saving pre-indexed database...")
1210
+ query_database_name = query_input + '.db.gz'
1211
+ with gzip.open(query_database_name, 'wb') as database_handle:
1212
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1213
+ if same_inputs == False and query_database == None and reference_database == None:
1214
+ print("Saving pre-indexed databases...")
1215
+ query_database_name = query_input + '.db.gz'
1216
+ reference_database_name = reference_input + '.db.gz'
1217
+ with gzip.open(query_database_name, 'wb') as database_handle:
1218
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1219
+ with gzip.open(reference_database_name, 'wb') as database_handle:
1220
+ pickle.dump(reference_kmer_dict, database_handle, protocol=4)
1221
+ elif same_inputs == False and query_database == None:
1222
+ print("Saving pre-indexed query database...")
1223
+ query_database_name = query_input + '.db.gz'
1224
+ with gzip.open(query_database_name, 'wb') as database_handle:
1225
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1226
+ elif same_inputs == False and reference_database == None:
1227
+ print("Saving pre-indexed reference database...")
1228
+ reference_database_name = reference_input + '.db.gz'
1229
+ with gzip.open(reference_database_name, 'wb') as database_handle:
1230
+ pickle.dump(reference_kmer_dict, database_handle, protocol=4)
1231
+ # ------------------------------------------------------
1232
+ # Calculate Jaccard distances
1233
+ # ------------------------------------------------------
1234
+ if index_db == True:
1235
+ print("Finished pre-indexing databases.")
1236
+ print("Next time you can run the program using only these files with --qd and(or) --rd.")
1237
+ else:
1238
+ print("Calculating shared Kmer fraction...")
1239
+ if virus == False:
1240
+ if same_inputs == True:
1241
+ query_id_list = query_kmer_dict.keys()
1242
+ try:
1243
+
1244
+ fixed_dict, smart_args = numpyize_kmers(query_kmer_dict)
1245
+ #single_dictionary_initializer(fixed_dict)
1246
+
1247
+ pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (fixed_dict,))
1248
+ Fraction_Results = pool.map(single_kaai_parser_all_v_all, smart_args)
1249
+ finally:
1250
+ pool.close()
1251
+ pool.join()
1252
+ else:
1253
+ query_id_list = query_kmer_dict.keys()
1254
+ try:
1255
+ pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
1256
+ Fraction_Results = pool.map(double_kaai_parser, query_id_list)
1257
+ finally:
1258
+ pool.close()
1259
+ pool.join()
1260
+ else:
1261
+ if same_inputs == True:
1262
+ query_id_list = query_kmer_dict.keys()
1263
+ try:
1264
+ pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (query_kmer_dict,))
1265
+ Fraction_Results = pool.map(single_virus_kaai_parser, query_id_list)
1266
+ finally:
1267
+ pool.close()
1268
+ pool.join()
1269
+ else:
1270
+ query_id_list = query_kmer_dict.keys()
1271
+ try:
1272
+ pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
1273
+ Fraction_Results = pool.map(double_viral_kaai_parser, query_id_list)
1274
+ finally:
1275
+ pool.close()
1276
+ pool.join()
1277
+ # ------------------------------------------------------
1278
+
1279
+ # Merge results into a single output
1280
+ # ------------------------------------------------------
1281
+ print("Merging results...")
1282
+ with open(output, 'w') as outfile:
1283
+ for file in Fraction_Results:
1284
+ with open(file) as Temp:
1285
+ shutil.copyfileobj(Temp, outfile)
1286
+ file.unlink()
1287
+ print("kAAI finishied correctly on {}".format(datetime.datetime.now()))
1288
+ # ------------------------------------------------------
1289
+ # If comparing viral genomes
1290
+
1291
+
1292
+
1293
+
1294
+
1295
+ if __name__ == "__main__":  # Entry-point guard: true only when this file is executed as a script, not when it is imported.
1296
+ main()  # Call the program's main() routine.