miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,1336 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ ########################################################################
5
+ # Author: Carlos Ruiz
6
+ # Institution: Georgia Institute of Technology
7
+ # Version: 1.0
8
+ # Date: Dec 10, 2020
9
+
10
+ # Description: Calculates the average amino acid identity using k-mers
11
+ from single copy genes. It is a faster version of the regular AAI (Blast
12
+ or Diamond) and the hAAI implemented in MiGA.
13
+ ########################################################################
14
+ """
15
+
16
+ ################################################################################
17
+ """---0.0 Import Modules---"""
18
+ import subprocess, argparse, multiprocessing, datetime, shutil
19
+ import textwrap, pickle, gzip
20
+ import numpy as np
21
+ from tempfile import TemporaryDirectory
22
+ from random import randint
23
+ from pathlib import Path
24
+ from sys import argv
25
+ from sys import exit
26
+ from functools import partial
27
+ import time
28
+
29
+
30
+ ################################################################################
31
+ """---1.0 Define Functions---"""
32
+ # --- Run prodigal ---
33
+ # ------------------------------------------------------
34
def run_prodigal(input_file):
    """
    Runs prodigal twice, compares both predictions and stores one faa file

    Prodigal is executed in "meta" mode once without "-g" (prediction stored
    as <input_file>.faa.11) and once with "-g 4" (stored as
    <input_file>.faa.4). The "-g 4" prediction is kept only when its total
    amino acid length is at least 1.1 times the length of the other one.
    Stop '*' symbols are removed from the sequences that are kept and all
    intermediate files are deleted.

    Arguments:
        input_file -- Path to genome FastA file

    Returns:
        output -- Path to amino acid fasta result (<input_file>.faa)
    """
    def total_residues(faa_path):
        # Sum the length of every non-header line of a protein FastA file
        total = 0
        with open(faa_path, 'r') as faa:
            for line in faa:
                if not line.startswith(">"):
                    total += len(line.strip())
        return total

    file_path = Path(input_file)
    filename = file_path.name
    folder = file_path.parent
    protein_output = folder / (filename + '.faa')
    output_11 = folder / (filename + '.faa.11')
    output_4 = folder / (filename + '.faa.4')
    temp_output = folder / (filename + '.temp')

    # Predict proteins with both settings; the "-o" output is not used
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_11),
                     "-p", "meta", "-q", "-o", str(temp_output)])
    subprocess.call(["prodigal", "-i", str(file_path), "-a", str(output_4),
                     "-p", "meta", "-g", "4", "-q", "-o", str(temp_output)])

    # Compare translation tables
    length_4 = total_residues(output_4)
    length_11 = total_residues(output_11)
    if length_11 == 0:
        # Avoid dividing by zero when the table 11 prediction is empty:
        # any non-empty table 4 prediction is then the better one
        use_table_4 = length_4 > 0
    else:
        use_table_4 = (length_4 / length_11) >= 1.1
    selected = output_4 if use_table_4 else output_11
    shutil.copy(str(selected), str(protein_output))

    # Remove intermediate files
    output_4.unlink()
    output_11.unlink()
    temp_output.unlink()

    # Remove stop '*' codons from protein sequences (headers are untouched)
    with open(protein_output, 'r') as final_protein, open(temp_output, 'w') as temporal_file:
        for line in final_protein:
            if not line.startswith(">"):
                line = line.replace('*', '')
            temporal_file.write(line)
    shutil.copy(str(temp_output), str(protein_output))
    temp_output.unlink()

    return str(protein_output)
97
+ # ------------------------------------------------------
98
+
99
+ # --- Run prodigal for viruses ---
100
+ # ------------------------------------------------------
101
def run_prodigal_virus(input_file):
    """
    Runs prodigal once in "meta" mode and stores the faa file

    The predicted proteins are written to <input_file>.faa and the
    stop '*' symbols are removed from their sequences.

    Arguments:
        input_file -- Path to genome FastA file

    Returns:
        output -- Path to amino acid fasta result
    """
    genome = Path(input_file)
    faa_path = genome.parent / (genome.name + '.faa')
    scratch = genome.parent / (genome.name + '.temp')

    prodigal_cmd = ["prodigal", "-i", str(genome), "-a", str(faa_path),
                    "-p", "meta", "-q", "-o", str(scratch)]
    subprocess.call(prodigal_cmd)

    # The "-o" output of prodigal is not needed, only the protein file
    scratch.unlink()

    # Write a copy without '*' to the scratch file, then copy it back
    with open(faa_path, 'r') as raw_proteins, open(scratch, 'w') as cleaned:
        for record_line in raw_proteins:
            if not record_line.startswith(">"):
                record_line = record_line.replace('*', '')
            cleaned.write(record_line)
    shutil.copy(str(scratch), str(faa_path))
    scratch.unlink()

    return str(faa_path)
135
+ # ------------------------------------------------------
136
+
137
+ # --- Run hmmsearch ---
138
+ # ------------------------------------------------------
139
def run_hmmsearch(input_file):
    """
    Runs hmmsearch with the complete SCG model database
    against a protein file and stores the hits table

    Arguments:
        input_file -- Path to protein FastA file

    Returns:
        output -- Path to hmmsearch hits table (<input_file>.hmm)
    """
    proteins = Path(input_file)
    hits_table = proteins.parent / (proteins.name + '.hmm')
    stdout_dump = proteins.parent / (proteins.name + '.temp')

    # The model database is located relative to this script
    scg_models = Path(__file__).parent / "../00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"

    command = ["hmmsearch", "--tblout", str(hits_table), "-o", str(stdout_dump),
               "--cut_tc", "--cpu", "1", str(scg_models), str(proteins)]
    subprocess.call(command)

    # Only the tabular output is kept
    stdout_dump.unlink()
    return str(hits_table)
162
+ # ------------------------------------------------------
163
+
164
+ # --- Filter HMM results for best matches ---
165
+ # ------------------------------------------------------
166
def hmm_filter(scg_hmm_file, keep):
    """
    Filters HMM results for the best hit per protein

    For every protein (first column) only the line with the highest
    score (ninth column) is kept. Ties are broken at random. Comment
    lines (starting with '#') and blank lines are ignored.

    Arguments:
        scg_hmm_file {file path} -- Path to HMM results file
        keep {bool} -- Keep HMM files (currently unused by this function)

    Returns:
        outfile -- Path to filtered file (<scg_hmm_file>.filt)
    """
    hmm_path = Path(scg_hmm_file)
    outfile = hmm_path.parent / (hmm_path.name + '.filt')
    # protein name -> [best score, original line]
    hmm_hit_dict = {}
    with open(scg_hmm_file, 'r') as hit_file:
        for line in hit_file:
            if line.startswith("#"):
                continue
            hit = line.strip().split()
            if not hit:
                continue
            protein_name = hit[0]
            score = float(hit[8])
            best = hmm_hit_dict.get(protein_name)
            if best is None or score > best[0]:
                hmm_hit_dict[protein_name] = [score, line]
            elif score == best[0] and randint(0, 1) > 0:
                # random.randint needs both bounds; flip a coin on ties
                hmm_hit_dict[protein_name] = [score, line]
    with open(outfile, 'w') as output:
        for hits in hmm_hit_dict.values():
            output.write(hits[1])
    return str(outfile)
204
+ # ------------------------------------------------------
205
+
206
+ # --- Find Kmers from HMM results ---
207
+ # ------------------------------------------------------
208
def kmer_extract(input_files):
    """
    Extract kmers from protein files that have hits
    in the HMM searches.

    For every model (fourth column of the HMM results) the protein with
    the highest score (ninth column) is selected, and its kmers of
    size 4 are obtained with read_kmers_from_file.

    Arguments:
        input_files {list} -- Three elements, in this order: name used as
                              key of the result, path to the protein FastA
                              file, path to the filtered HMM results.

    Returns:
        [genome_kmers] -- Dictionary {name: {model: kmers of its best protein}}.
    """
    final_filename = input_files[0]
    protein_file = input_files[1]
    scg_hmm_file = input_files[2]
    # model name -> [protein name, score]
    positive_matches = {}
    with open(scg_hmm_file, 'r') as hmm_input:
        for line in hmm_input:
            if line.startswith("#"):
                continue
            fields = line.strip().split()
            if not fields:
                continue
            protein_name = fields[0]
            model_name = fields[3]
            # Compare scores as numbers; as strings "9.5" would beat "100.2"
            score = float(fields[8])
            if (model_name not in positive_matches
                    or score > positive_matches[model_name][1]):
                positive_matches[model_name] = [protein_name, score]
    positive_proteins = [match[0] for match in positive_matches.values()]
    scg_kmers = read_kmers_from_file(protein_file, positive_proteins, 4)
    # Re-key the kmers from protein name to model name
    for accession, protein in positive_matches.items():
        scg_kmers[accession] = scg_kmers.pop(protein[0])
    genome_kmers = {final_filename: scg_kmers}
    return genome_kmers
244
+ # ------------------------------------------------------
245
+
246
+ # --- Extract kmers from protein sequences ---
247
+ # ------------------------------------------------------
248
def read_kmers_from_file(filename, positive_hits, ksize):
    """
    Reads a protein FastA file and builds the kmers of the
    proteins whose name is listed in positive_hits.

    The protein name is the first whitespace-delimited word of the
    header line once the '>' characters are removed.

    Arguments:
        filename -- Path to protein FastA file
        positive_hits -- Names of the proteins to process
        ksize {int} -- Kmer size passed to build_kmers

    Returns:
        [scg_kmers] -- Dictionary {protein name: result of build_kmers}
    """
    # Build the lookup set once instead of scanning a list per header
    wanted = set(positive_hits)
    scg_kmers = {}
    store_sequence = False
    protein_name = ""
    sequence_parts = []
    with open(filename) as fasta_in:
        for line in fasta_in:
            if line.startswith(">"):
                # A new record starts: flush the previous one if selected
                if store_sequence:
                    scg_kmers[protein_name] = build_kmers("".join(sequence_parts), ksize)
                sequence_parts = []
                header_words = line.replace(">", "").strip().split()
                # A header without a name can never be a positive hit
                protein_name = header_words[0] if header_words else ""
                store_sequence = protein_name in wanted
            elif store_sequence:
                sequence_parts.append(line.strip())
    # Flush the last record of the file
    if store_sequence:
        scg_kmers[protein_name] = build_kmers("".join(sequence_parts), ksize)
    return scg_kmers
274
+ # ------------------------------------------------------
275
+
276
+ # --- Extract kmers from viral protein sequences ---
277
+ # ------------------------------------------------------
278
def read_viral_kmers_from_file(input_information):
    """
    Collects the unique kmers of all the proteins in a FastA file

    Arguments:
        input_information {list} -- Three elements, in this order: name
                                    used as key of the result, path to the
                                    protein FastA file, kmer size.

    Returns:
        [genome_kmers] -- Dictionary {name: [number of header lines,
                          comma-separated unique kmers]}
    """
    final_filename = input_information[0]
    protein_file = input_information[1]
    kmer_size = input_information[2]

    unique_kmers = set()
    chunks = []
    seen_header = False
    protein_count = 0
    with open(protein_file) as fasta_in:
        for line in fasta_in:
            if not line.startswith(">"):
                chunks.append(line.strip())
                continue
            protein_count += 1
            # Flush the previous protein; text before the first header is dropped
            if seen_header:
                unique_kmers.update(build_viral_kmers("".join(chunks), kmer_size))
            seen_header = True
            chunks = []
    # Flush the last protein of the file
    if seen_header:
        unique_kmers.update(build_viral_kmers("".join(chunks), kmer_size))
    return {final_filename: [protein_count, ','.join(unique_kmers)]}
305
+ # ------------------------------------------------------
306
+
307
+ # --- Build Kmers ---
308
+ # ------------------------------------------------------
309
def build_kmers(sequence, ksize):
    """
    Build the unique kmers of a sequence as a comma-joined string

    Arguments:
        sequence -- String to slice into overlapping windows
        ksize -- Window (kmer) length

    Returns:
        [kmers_set] -- Comma-joined unique kmers in set iteration order;
                       empty string when the sequence is shorter than ksize
    """
    window_count = len(sequence) - ksize + 1
    unique_windows = {sequence[start:start + ksize] for start in range(window_count)}
    return ','.join(unique_windows)
318
+ # ------------------------------------------------------
319
+
320
+ # --- Build Viral Kmers ---
321
+ # ------------------------------------------------------
322
def build_viral_kmers(sequence, ksize):
    """
    Build the set of unique kmers of a sequence

    Arguments:
        sequence -- String to slice into overlapping windows
        ksize -- Window (kmer) length

    Returns:
        [kmers_set] -- Set of unique kmers; empty when the sequence is
                       shorter than ksize
    """
    window_count = len(sequence) - ksize + 1
    return {sequence[start:start + ksize] for start in range(window_count)}
331
+ # ------------------------------------------------------
332
+
333
+ # --- Create global dictionary with unique kmers and indices for each one ---
334
+ # ------------------------------------------------------
335
def global_unique_kmers(kmer_dictionaries):
    """
    Extract every kmer in the whole dataset
    Create global dictionary with unique kmers and indices for each one

    Arguments:
        kmer_dictionaries -- Iterable of dictionaries shaped as
                             {genome: {marker protein: comma-joined kmers}}

    Returns:
        Nothing. The index is stored in the module-level
        global_kmer_index_dictionary (kmer -> unique integer index,
        numbered from 0 in order of first appearance), which is rebuilt
        from scratch on every call.
    """
    # Make this dictionary global regardless of query == reference or not
    print("Indexing unique kmers")
    global global_kmer_index_dictionary
    global_kmer_index_dictionary = {}
    for kmer_dict in kmer_dictionaries:
        for marker_proteins in kmer_dict.values():
            for kmer_csv in marker_proteins.values():
                for kmer in kmer_csv.split(','):
                    # Explicit membership test instead of a bare except, so
                    # that unrelated errors and interrupts are not swallowed.
                    if kmer not in global_kmer_index_dictionary:
                        # Indices are dense, so the next free one is the size.
                        global_kmer_index_dictionary[kmer] = len(global_kmer_index_dictionary)
361
+ # ------------------------------------------------------
362
+
363
+ # --- Create global viral dictionary with unique kmers and indices for each one ---
364
+ # ------------------------------------------------------
365
def global_unique_viral_kmers(kmer_dictionaries):
    """
    Extract every kmer in the whole dataset
    Create global dictionary with unique kmers and indices for each one

    Arguments:
        kmer_dictionaries -- Iterable of dictionaries whose values are
                             sequences with the comma-joined kmers at
                             position 1

    Returns:
        Nothing. The index is stored in the module-level
        global_kmer_index_dictionary (kmer -> unique integer index,
        numbered from 0 in order of first appearance), which is rebuilt
        from scratch on every call.
    """
    # Make this dictionary global regardless of query == reference or not
    print("Indexing unique kmers")
    global global_kmer_index_dictionary
    global_kmer_index_dictionary = {}
    for kmer_dict in kmer_dictionaries:
        for genome_entry in kmer_dict.values():
            for kmer in genome_entry[1].split(','):
                # Explicit membership test instead of a bare except, so
                # that unrelated errors and interrupts are not swallowed.
                if kmer not in global_kmer_index_dictionary:
                    # Indices are dense, so the next free one is the size.
                    global_kmer_index_dictionary[kmer] = len(global_kmer_index_dictionary)
389
+ # ------------------------------------------------------
390
+
391
+ # --- Convert kmers to indices ---
392
+ # ------------------------------------------------------
393
def convert_kmers_to_indices(kmer_dict):
    """
    Replace comma-joined kmer strings with sorted arrays of kmer indices

    Arguments:
        kmer_dict -- Dictionary {genome: {marker protein: comma-joined kmers}};
                     it is modified in place

    Returns:
        [kmer_dict] -- The same dictionary, each kmer string replaced by a
                       sorted, de-duplicated int32 array of the indices found
                       in global_kmer_index_dictionary
    """
    print("Converting kmers to indices")
    for markers in kmer_dict.values():
        for marker_id in markers:
            indices = [global_kmer_index_dictionary[kmer]
                       for kmer in markers[marker_id].split(',')]
            as_array = np.array(indices, dtype=np.int32)
            markers[marker_id] = np.sort(np.unique(as_array))

    return kmer_dict
404
+ # ------------------------------------------------------
405
+
406
+ # --- Convert viral kmers to indices ---
407
+ # ------------------------------------------------------
408
def convert_viral_kmers_to_indices(kmer_dict):
    """
    Replace each genome's comma-joined kmer string with a sorted index array

    Arguments:
        kmer_dict -- Dictionary {genome: [protein count, comma-joined kmers]};
                     position 1 of every entry is overwritten in place

    Returns:
        [kmer_dict] -- The same dictionary, position 1 of each entry holding a
                       sorted, de-duplicated int32 array of the indices found
                       in global_kmer_index_dictionary
    """
    print("Converting kmers to indices")
    for entry in kmer_dict.values():
        indices = [global_kmer_index_dictionary[kmer] for kmer in entry[1].split(',')]
        as_array = np.array(indices, dtype=np.int32)
        entry[1] = np.sort(np.unique(as_array))

    return kmer_dict
418
+ # ------------------------------------------------------
419
+
420
+ # --- Transform kmer dictionaries to index dictionaries ---
421
+ # ------------------------------------------------------
422
def transform_kmer_dicts_to_arrays(kmer_dict, temporal_working_directory, single_dataset):
    """
    Convert kmers to index arrays and build the per-genome argument tuples

    Arguments:
        kmer_dict -- Dictionary passed through convert_kmers_to_indices
        temporal_working_directory -- Value placed first in every tuple
        single_dataset -- When equal to True, each tuple also carries the
                          genome's position in the dictionary

    Returns:
        [kmer_dict, smartargs] -- The converted dictionary and a list with one
                                  tuple per genome, in dictionary key order
    """
    kmer_dict = convert_kmers_to_indices(kmer_dict)
    # The position is used as the skip index by the single-dataset parsers.
    if single_dataset == True:
        smartargs = [(temporal_working_directory, genome_id, position)
                     for position, genome_id in enumerate(kmer_dict.keys())]
    else:
        smartargs = [(temporal_working_directory, genome_id)
                     for genome_id in kmer_dict.keys()]

    return kmer_dict, smartargs
434
+ # ------------------------------------------------------
435
+
436
+ # --- Transform viral kmer dictionaries to index dictionaries ---
437
+ # ------------------------------------------------------
438
def transform_viral_kmer_dicts_to_arrays(kmer_dict, temporal_working_directory, single_dataset):
    """
    Convert viral kmers to index arrays and build the per-genome argument tuples

    Arguments:
        kmer_dict -- Dictionary passed through convert_viral_kmers_to_indices
        temporal_working_directory -- Value placed first in every tuple
        single_dataset -- When equal to True, each tuple also carries the
                          genome's position in the dictionary

    Returns:
        [kmer_dict, smartargs] -- The converted dictionary and a list with one
                                  tuple per genome, in dictionary key order
    """
    kmer_dict = convert_viral_kmers_to_indices(kmer_dict)
    # The position is used as the skip index by the single-dataset parsers.
    if single_dataset == True:
        smartargs = [(temporal_working_directory, genome_id, position)
                     for position, genome_id in enumerate(kmer_dict.keys())]
    else:
        smartargs = [(temporal_working_directory, genome_id)
                     for genome_id in kmer_dict.keys()]

    return kmer_dict, smartargs
450
+ # ------------------------------------------------------
451
+
452
+ # --- Parse kAAI when query == reference ---
453
+ # ------------------------------------------------------
454
def single_kaai_parser(arguments):
    """
    Calculates the Jaccard similarities using single protein markers shared by two genomes

    The query is compared against the genomes of the global
    query_kmer_dictionary, starting at position skip_first_n. One
    tab-separated row is written per target: query id, target id, mean
    Jaccard index, its standard deviation (both rounded to 4 decimals),
    number of shared markers, marker count of the genome with fewer
    markers, and the AAI estimate. Targets without shared markers, or whose
    summary fails, get "NA" in the last five columns.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (an object whose
                             .name is the directory path), the query id and
                             the index of said query_id

    Returns:
        [Path to output] -- Path to output file
    """
    temporal_folder = arguments[0]
    query_id = arguments[1]
    skip_first_n = arguments[2]

    temporal_folder = Path(str(temporal_folder.name))
    temporal_file = Path(query_id).name + '.faai.temp'
    temporal_output = temporal_folder / temporal_file

    row = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
    query_markers = query_kmer_dictionary[query_id]
    query_scg_list = np.array(list(query_markers.keys()))
    with open(temporal_output, 'w') as out_file:
        for target_genome in list(query_kmer_dictionary.keys())[skip_first_n:]:
            # Get number and list of SCG detected in reference
            target_markers = query_kmer_dictionary[target_genome]
            target_scg_list = np.array(list(target_markers.keys()))
            shorter_genome = min(len(query_scg_list), len(target_scg_list))
            # If self, 1.0 similarity.
            if query_id == target_genome:
                out_file.write(row.format(query_id, target_genome,
                               1.0, 0.0, len(query_scg_list), len(target_scg_list), 100))
                continue

            # Get shared proteins (scgs)
            final_scg_list = np.intersect1d(query_scg_list, target_scg_list)
            # Calculate the jaccard index of each shared marker
            jaccard_similarities = []
            for scg in final_scg_list:
                query_kmers = query_markers.get(scg)
                target_kmers = target_markers.get(scg)
                union = len(np.union1d(query_kmers, target_kmers))
                # |A ∩ B| = |A| + |B| - |A ∪ B|; the arrays hold unique indices.
                intersection = len(query_kmers) + len(target_kmers) - union
                jaccard_similarities.append(intersection / union)

            if len(jaccard_similarities) == 0:
                out_file.write(row.format(query_id, target_genome,
                               "NA", "NA", "NA", "NA", "NA"))
                continue

            # Allow for numpy in-builts; they're a little faster.
            # np.float64 replaces the np.float_ alias removed in NumPy 2.0.
            jaccard_similarities = np.array(jaccard_similarities, dtype=np.float64)
            try:
                mean = np.mean(jaccard_similarities)
                std = np.std(jaccard_similarities)
                if mean >= 0.9:
                    aai_est = ">90%"
                elif mean == 0:
                    aai_est = "<30%"
                else:
                    aai_est = kaai_to_aai(mean)
                out_file.write(row.format(query_id, target_genome,
                               round(mean, 4), round(std, 4),
                               len(jaccard_similarities), shorter_genome, aai_est))
            except Exception:
                # Best effort: keep the row, but let KeyboardInterrupt and
                # SystemExit propagate instead of being swallowed.
                out_file.write(row.format(query_id, target_genome,
                               "NA", "NA", "NA", "NA", "NA"))
    return temporal_output
519
+ # ------------------------------------------------------
520
+
521
+ # --- Parse viral kAAI when query == reference ---
522
+ # ------------------------------------------------------
523
def single_virus_kaai_parser(arguments):
    """
    Calculates Jaccard indices on kmers from viral proteins

    The query is compared against the genomes of the global
    query_kmer_dictionary, starting at position skip_first_n. One
    tab-separated row is written per target: query id, target id, Jaccard
    index, protein count of the query and protein count of the target.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (an object whose
                             .name is the directory path), the query id and
                             the index of said query_id

    Returns:
        [Path to output] -- Path to output file
    """
    workdir = Path(str(arguments[0].name))
    query_id = arguments[1]
    skip_first_n = arguments[2]
    temporal_output = workdir / (Path(query_id).name + '.faai.temp')

    # Each dictionary entry is [protein count, kmer index array]
    query_entry = query_kmer_dictionary[query_id]
    query_protein_count = query_entry[0]
    query_kmers = query_entry[1]

    row = "{}\t{}\t{}\t{}\t{}\n"
    targets = list(query_kmer_dictionary.keys())[skip_first_n:]
    with open(temporal_output, 'w') as out_file:
        for target_genome in targets:
            if query_id != target_genome:
                target_entry = query_kmer_dictionary[target_genome]
                union_size = len(np.union1d(query_kmers, target_entry[1]))
                # |A ∩ B| = |A| + |B| - |A ∪ B|
                shared = len(query_kmers) + len(target_entry[1]) - union_size
                out_file.write(row.format(query_id, target_genome,
                               shared / union_size, query_protein_count, target_entry[0]))
            else:
                # Self comparison is reported as 1.0 without computing it
                out_file.write(row.format(query_id, target_genome,
                               1.0, query_protein_count, query_protein_count))
    return temporal_output
564
+ # ------------------------------------------------------
565
+
566
+ # --- Parse kAAI when query != reference ---
567
+ # ------------------------------------------------------
568
def double_kaai_parser(arguments):
    """
    Calculates the Jaccard similarities using single protein markers shared by two genomes

    The query (from the global query_kmer_dictionary) is compared against
    every genome of the global reference_kmer_dictionary. One tab-separated
    row is written per target: query id, target id, mean Jaccard index, its
    standard deviation (both rounded to 4 decimals), number of shared
    markers, marker count of the genome with fewer markers, and the AAI
    estimate. Targets without shared markers, or whose summary fails, get
    "NA" in the last five columns.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (an object whose
                             .name is the directory path) and the query id

    Returns:
        [Path to output] -- Path to output file
    """
    temporal_folder = arguments[0]
    query_id = arguments[1]

    temporal_folder = Path(str(temporal_folder.name))
    temporal_file = Path(query_id).name + '.faai.temp'
    temporal_output = temporal_folder / temporal_file

    row = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
    query_markers = query_kmer_dictionary[query_id]
    query_scg_list = np.array(list(query_markers.keys()))

    with open(temporal_output, 'w') as out_file:
        for target_genome in list(reference_kmer_dictionary.keys()):
            # Get number and list of SCG detected in reference
            target_markers = reference_kmer_dictionary[target_genome]
            target_scg_list = np.array(list(target_markers.keys()))
            shorter_genome = min(len(query_scg_list), len(target_scg_list))
            # If self, 1.0 similarity.
            if query_id == target_genome:
                out_file.write(row.format(query_id, target_genome,
                               1.0, 0.0, len(query_scg_list), len(target_scg_list), 100))
                continue

            # Get shared proteins (scgs)
            final_scg_list = np.intersect1d(query_scg_list, target_scg_list)
            # Calculate the jaccard index of each shared marker
            jaccard_similarities = []
            for scg in final_scg_list:
                query_kmers = query_markers.get(scg)
                target_kmers = target_markers.get(scg)
                union = len(np.union1d(query_kmers, target_kmers))
                # |A ∩ B| = |A| + |B| - |A ∪ B|; the arrays hold unique indices.
                intersection = len(query_kmers) + len(target_kmers) - union
                jaccard_similarities.append(intersection / union)

            if len(jaccard_similarities) == 0:
                out_file.write(row.format(query_id, target_genome,
                               "NA", "NA", "NA", "NA", "NA"))
                continue

            # Allow for numpy in-builts; they're a little faster.
            # np.float64 replaces the np.float_ alias removed in NumPy 2.0.
            jaccard_similarities = np.array(jaccard_similarities, dtype=np.float64)
            try:
                mean = np.mean(jaccard_similarities)
                std = np.std(jaccard_similarities)
                if mean >= 0.9:
                    aai_est = ">90%"
                elif mean == 0:
                    aai_est = "<30%"
                else:
                    aai_est = kaai_to_aai(mean)
                out_file.write(row.format(query_id, target_genome,
                               round(mean, 4), round(std, 4),
                               len(jaccard_similarities), shorter_genome, aai_est))
            except Exception:
                # Best effort: keep the row, but let KeyboardInterrupt and
                # SystemExit propagate instead of being swallowed.
                out_file.write(row.format(query_id, target_genome,
                               "NA", "NA", "NA", "NA", "NA"))
    return temporal_output
632
+ # ------------------------------------------------------
633
+
634
+ # --- Parse viral kAAI when query != reference ---
635
+ # ------------------------------------------------------
636
def double_viral_kaai_parser(arguments):
    """
    Calculates Jaccard indices on kmers from viral proteins

    The query (from the global query_kmer_dictionary) is compared against
    every genome of the global reference_kmer_dictionary. One tab-separated
    row is written per target: query id, target id, Jaccard index, protein
    count of the query and protein count of the target.

    Arguments:
        arguments {tuple} -- Tuple with the temporal folder (an object whose
                             .name is the directory path) and the query id

    Returns:
        [Path to output] -- Path to output file
    """
    workdir = Path(str(arguments[0].name))
    query_id = arguments[1]
    temporal_output = workdir / (Path(query_id).name + '.faai.temp')

    # Each dictionary entry is [protein count, kmer index array]
    query_entry = query_kmer_dictionary[query_id]
    query_protein_count = query_entry[0]
    query_kmers = query_entry[1]

    row = "{}\t{}\t{}\t{}\t{}\n"
    with open(temporal_output, 'w') as out_file:
        for target_genome, target_entry in reference_kmer_dictionary.items():
            if query_id != target_genome:
                union_size = len(np.union1d(query_kmers, target_entry[1]))
                # |A ∩ B| = |A| + |B| - |A ∪ B|
                shared = len(query_kmers) + len(target_entry[1]) - union_size
                out_file.write(row.format(query_id, target_genome,
                               shared / union_size, query_protein_count, target_entry[0]))
            else:
                # Self comparison is reported as 1.0 without computing it
                out_file.write(row.format(query_id, target_genome,
                               1.0, query_protein_count, query_protein_count))
    return temporal_output
675
+ # ------------------------------------------------------
676
+
677
+ # --- Query == Reference initializer function ---
678
+ # ------------------------------------------------------
679
def single_dictionary_initializer(_dictionary):
    """
    Publish the kmer dictionary as the module-level query_kmer_dictionary,
    where the parser functions look it up
    """
    globals()["query_kmer_dictionary"] = _dictionary
685
+ # ------------------------------------------------------
686
+
687
+ # --- Query != Reference initializer function ---
688
+ # ------------------------------------------------------
689
def two_dictionary_initializer(_query_dictionary, _reference_dictionary):
    """
    Publish both kmer dictionaries as the module-level query_kmer_dictionary
    and reference_kmer_dictionary, where the parser functions look them up
    """
    module_scope = globals()
    module_scope["query_kmer_dictionary"] = _query_dictionary
    module_scope["reference_kmer_dictionary"] = _reference_dictionary
697
+ # ------------------------------------------------------
698
+
699
+ # --- Merge kmer dictionaries ---
700
+ # ------------------------------------------------------
701
def merge_dicts(dictionaries):
    """
    Shallow-merge any number of dicts into a brand new dict.
    When a key appears more than once, the value from the latter dict wins.
    """
    return {key: value
            for kmer_dictionary in dictionaries
            for key, value in kmer_dictionary.items()}
710
+ # ------------------------------------------------------
711
+
712
+ # --- Transform kAAI into estimated AAI ---
713
+ # ------------------------------------------------------
714
def kaai_to_aai(kaai):
    """
    Transform the kAAI into an estimated AAI value

    Arguments:
        kaai -- kAAI value (the callers in this file pass a mean Jaccard
                index strictly between 0 and 0.9)

    Returns:
        [aai_hat] -- Estimated AAI, scaled by 100
    """
    # NOTE(review): the constants look like fitted model coefficients — confirm their origin.
    scaled_log = -0.2607023 * np.log(kaai)
    decay = np.exp(-(scaled_log ** (1 / 3.435)))
    return (-0.3087057 + 1.810741 * decay) * 100
718
+ # ------------------------------------------------------
719
+
720
+
721
+ ################################################################################
722
+ """---2.0 Main Function---"""
723
+
724
+ def main():
725
+ # Setup parser for arguments.
726
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
727
+ description='''This script calculates the average amino acid identity using k-mers\n'''
728
+ '''from single copy genes. It is a faster version of the regular AAI '''
729
+ '''(Blast or Diamond) and the hAAI implemented in MiGA.'''
730
+ '''Usage: ''' + argv[0] + ''' -p [Protein Files] -t [Threads] -o [Output]\n'''
731
+ '''Global mandatory parameters: -g [Genome Files] OR -p [Protein Files] OR -s [SCG HMM Results] -o [AAI Table Output]\n'''
732
+ '''Optional Database Parameters: See ''' + argv[0] + ' -h')
733
+ mandatory_options = parser.add_argument_group('Mandatory i/o options. You must select an option for the queries and one for the references.')
734
+ mandatory_options.add_argument('--qg', dest='query_genomes', action='store', required=False,
735
+ help='File with list of query genomes.')
736
+ mandatory_options.add_argument('--qp', dest='query_proteins', action='store', required=False,
737
+ help='File with list of query proteins.')
738
+ mandatory_options.add_argument('--qh', dest='query_hmms', action='store', required=False,
739
+ help=textwrap.dedent('''
740
+ File with list of pre-computed query hmmsearch results.
741
+ If you select this option you must also provide a file with
742
+ a list of protein files for the queries (with --qp).
743
+ '''))
744
+ mandatory_options.add_argument('--qd', dest='query_database', action='store', required=False,
745
+ help='File with list of pre-indexed query databases.')
746
+ mandatory_options.add_argument('--rg', dest='reference_genomes', action='store', required=False,
747
+ help='File with list of reference genomes.')
748
+ mandatory_options.add_argument('--rp', dest='reference_proteins', action='store', required=False,
749
+ help='File with list of reference proteins.')
750
+ mandatory_options.add_argument('--rh', dest='reference_hmms', action='store', required=False,
751
+ help=textwrap.dedent('''
752
+ File with list of pre-computed reference hmmsearch results.
753
+ If you select this option you must also provide a file with
754
+ a list of protein files for the references (with --qp).
755
+ '''))
756
+ mandatory_options.add_argument('--rd', dest='reference_database', action='store', required=False,
757
+ help='File with list of pre-indexed reference databases.')
758
+ mandatory_options.add_argument('-o', '--output', dest='output', action='store', required=False, help='Output file. By default kaai_comparisons.txt')
759
+ additional_input_options = parser.add_argument_group('Behavior modification options.')
760
+ additional_input_options.add_argument('-e', '--ext', dest='extension', action='store', required=False,
761
+ help='Extension to remove from original filename, e.g. ".fasta"')
762
+ additional_input_options.add_argument('-i', '--index', dest='index_db', action='store_true', required=False,
763
+ help='Only index and store databases, i.e., do not perform comparisons.')
764
+ additional_input_options.add_argument('-a', '--all-vs-all', dest='all_vs_all',
765
+ action='store_true', required=False,
766
+ help='Perform all-vs-all comparison, using only query input.')
767
+ additional_input_options.add_argument('--input-paths', dest='input_paths',
768
+ action='store_true', required=False,
769
+ help='The input files are direct paths to the data, not lists of files.')
770
+ misc_options = parser.add_argument_group('Miscellaneous options')
771
+ misc_options.add_argument('--virus', dest='virus', action='store_true', required=False,
772
+ help='Toggle virus-virus comparisons. Use only with viral genomes or proteins.')
773
+ misc_options.add_argument('-t', '--threads', dest='threads', action='store', default=1, type=int, required=False,
774
+ help='Number of threads to use, by default 1')
775
+ misc_options.add_argument('-k', '--keep', dest='keep', action='store_false', required=False,
776
+ help='Keep intermediate files, by default true')
777
+
778
+ args = parser.parse_args()
779
+
780
+ query_genomes = args.query_genomes
781
+ query_proteins = args.query_proteins
782
+ query_hmms = args.query_hmms
783
+ query_database = args.query_database
784
+ if args.all_vs_all:
785
+ reference_genomes = query_genomes
786
+ reference_proteins = query_proteins
787
+ reference_hmms = query_hmms
788
+ reference_database = query_database
789
+ else:
790
+ reference_genomes = args.reference_genomes
791
+ reference_proteins = args.reference_proteins
792
+ reference_hmms = args.reference_hmms
793
+ reference_database = args.reference_database
794
+ output = args.output
795
+ if output == None:
796
+ output == "kaai_comparisons.txt"
797
+ extension = args.extension
798
+ index_db = args.index_db
799
+ threads = args.threads
800
+ keep = args.keep
801
+ virus = args.virus
802
+ input_paths = args.input_paths
803
+
804
+ print("FastAAI started on {}".format(datetime.datetime.now()))
805
+ # Check user input
806
+ # ------------------------------------------------------
807
+ # Check if no query was provided
808
+ if query_genomes == None and query_proteins == None and query_hmms == None and query_database == None:
809
+ exit('Please prove a file with a list of queries, e.g., --qg, --qp, --qh, or --qd)')
810
+ # Check query inputs
811
+ query_input = None
812
+ if query_hmms != None:
813
+ if virus == True:
814
+ exit("If you are comparing viruses, please start from the genome or protein files.")
815
+ query_input = query_hmms
816
+ if query_proteins != None:
817
+ print("Starting from query hmmsearch results.")
818
+ print("You also provided the list of protein files used for hmmsearch.")
819
+ elif query_proteins == None:
820
+ print("You chose to start from pre-computed hmmsearch results for your queries (--qh).")
821
+ print("However, I also need the location of the query proteins used for hmmsearch.")
822
+ exit("Please provide them with --qp.")
823
+ elif query_proteins != None:
824
+ query_input = query_proteins
825
+ print("Starting from query proteins.")
826
+ elif query_genomes != None:
827
+ query_input = query_genomes
828
+ print("Starting from query genomes.")
829
+ elif query_database != None:
830
+ query_input = query_database
831
+ print("Starting from the pre-indexed query database.")
832
+ # Check if no reference was provided
833
+ if reference_genomes == None and reference_proteins == None and reference_hmms == None and reference_database == None:
834
+ exit('Please prove a file with a list of references, e.g., --rg, --rp, --rh, or --rd)')
835
+ # Check reference inputs
836
+ reference_input = None
837
+ if reference_hmms != None:
838
+ if virus == True:
839
+ exit("If you are comparing viruses, please start from the genome or protein files.")
840
+ reference_input = reference_hmms
841
+ if reference_proteins != None:
842
+ print("Starting from reference hmmsearch results.")
843
+ print("You also provided the list of protein files used for hmmsearch.")
844
+ elif reference_proteins == None:
845
+ print("You chose to start from pre-computed hmmsearch results for your references (--rh).")
846
+ print("However, I also need the location of the query proteins used for hmmsearch.")
847
+ exit("Please provide them with --rp.")
848
+ elif reference_proteins != None:
849
+ reference_input = reference_proteins
850
+ print("Starting from reference proteins.")
851
+ elif reference_genomes != None:
852
+ reference_input = reference_genomes
853
+ print("Starting from reference genomes.")
854
+ elif reference_database != None:
855
+ reference_input = reference_database
856
+ print("Starting from the pre-indexed reference database.")
857
+ # ------------------------------------------------------
858
+
859
+ # Create temporal working directory
860
+ temporal_working_directory = TemporaryDirectory()
861
+ # ------------------------------------------------------
862
+
863
+ # Check if queries are the same as references (an all-vs-all comparison)
864
+ # ------------------------------------------------------
865
+ same_inputs = False
866
+ if query_input == reference_input:
867
+ same_inputs = True
868
+ if same_inputs == True:
869
+ print('You specified the same query and reference files.')
870
+ print('I will perform an all vs all comparison :)')
871
+ # ------------------------------------------------------
872
+
873
+ #* Database Parsing is the same regardless of bacterial or viral genomes
874
+ # If using pre-indexed databases, check if they are valid files.
875
+ # ------------------------------------------------------
876
+ # If any of the starting points is from database, then store the
877
+ # kmer structures in the corresponding dictionaries.
878
+ # Otherwise read the file list and get the filenames
879
+ query_kmer_dict = None
880
+ query_kmer_dict_list = []
881
+ reference_kmer_dict = None
882
+ reference_kmer_dict_list = []
883
+ query_database_files = []
884
+ reference_database_files = []
885
+ if query_database != None:
886
+ if input_paths == True:
887
+ query_database_files.append(query_database)
888
+ else:
889
+ with open(query_database) as database_files:
890
+ for db_location in database_files:
891
+ query_database_files.append(db_location)
892
+ if reference_database != None:
893
+ if input_paths == True:
894
+ reference_database_files.append(reference_database)
895
+ else:
896
+ with open(reference_database) as database_files:
897
+ for db_location in database_files:
898
+ reference_database_files.append(db_location)
899
+
900
+ # If starting from database and query == reference
901
+ if same_inputs == True:
902
+ if query_database != None:
903
+ for db_location in query_database_files:
904
+ if Path(db_location.strip()).is_file():
905
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
906
+ temp_dict = pickle.load(database_handle)
907
+ if isinstance(temp_dict,dict):
908
+ query_kmer_dict_list.append(temp_dict)
909
+ #Carlos, this line serves no purpose but does take a bunch of time and mem.
910
+ #print(query_kmer_dict_list)
911
+ else:
912
+ exit("One of the database files appear to have the wrong format. Please provide a correctly formated database.")
913
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
914
+ else:
915
+ # If the inputs are not the same:
916
+ # If query and ref are provided
917
+ if query_database != None and reference_database != None:
918
+ for db_location in query_database_files:
919
+ if Path(db_location.strip()).is_file():
920
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
921
+ temp_dict = pickle.load(database_handle)
922
+ if isinstance(temp_dict,dict):
923
+ query_kmer_dict_list.append(temp_dict)
924
+ else:
925
+ exit("One of the query database files appear to have the wrong format. Please provide a correctly formated database.")
926
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
927
+ for db_location in reference_database_files:
928
+ if Path(db_location.strip()).is_file():
929
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
930
+ temp_dict = pickle.load(database_handle)
931
+ if isinstance(temp_dict,dict):
932
+ reference_kmer_dict_list.append(temp_dict)
933
+ else:
934
+ exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated database.")
935
+ reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
936
+ # If only the query has a db
937
+ elif query_database != None and reference_database == None:
938
+ for db_location in query_database_files:
939
+ if Path(db_location.strip()).is_file():
940
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
941
+ temp_dict = pickle.load(database_handle)
942
+ if isinstance(temp_dict,dict):
943
+ query_kmer_dict_list.append(temp_dict)
944
+ else:
945
+ exit("One of the query database files appear to have the wrong format. Please provide a correctly formated database.")
946
+ query_kmer_dict = merge_dicts(query_kmer_dict_list)
947
+ # If only the reference has a db
948
+ elif query_database == None and reference_database != None:
949
+ for db_location in reference_database_files:
950
+ if Path(db_location.strip()).is_file():
951
+ with gzip.open(db_location.strip(), 'rb') as database_handle:
952
+ temp_dict = pickle.load(database_handle)
953
+ if isinstance(temp_dict,dict):
954
+ reference_kmer_dict_list.append(temp_dict)
955
+ else:
956
+ exit("One of the reference database files appear to have the wrong format. Please provide a correctly formated database.")
957
+ reference_kmer_dict = merge_dicts(reference_kmer_dict_list)
958
+ # ------------------------------------------------------
959
+
960
+ # Get files from the query and reference lists and then
961
+ # create a dictionary with resulting filenames and a list with dictionary keys
962
+ # The structure of the dictionary is:
963
+ # original_query, proteins, hmms, filtered_hmms
964
+ # ------------------------------------------------------
965
+ # First parse the query:
966
+ query_list = []
967
+ query_file_names = {}
968
+ # For bacterial genomes
969
+ if virus == False:
970
+ if query_database != None:
971
+ pass
972
+ else:
973
+ if input_paths == True:
974
+ query_list.append(query_input)
975
+ else:
976
+ with open(query_input, 'r') as query_input_fh:
977
+ for line in query_input_fh:
978
+ query_list.append(line.strip())
979
+ for index, query in enumerate(query_list):
980
+ query_name = str(Path(query).name)
981
+ if extension != None:
982
+ query_name = query_name.replace(extension, "")
983
+ if query_hmms != None:
984
+ query_protein_list = []
985
+ with open(query_proteins, 'r') as query_protein_fh:
986
+ for line in query_protein_fh:
987
+ query_protein_list.append(line.strip())
988
+ query_file_names[query_name] = [None, query_protein_list[index], query, query + '.filt']
989
+ elif query_proteins != None:
990
+ query_file_names[query_name] = [None, query, query + '.hmm', query + '.hmm.filt']
991
+ elif query_genomes != None:
992
+ query_file_names[query_name] = [query, query + '.faa', query + '.faa.hmm', query + '.faa.hmm.filt']
993
+ # For viral genomes
994
+ else:
995
+ if query_database != None:
996
+ pass
997
+ else:
998
+ if input_paths == True:
999
+ query_list.append(query_input)
1000
+ else:
1001
+ with open(query_input, 'r') as query_input_fh:
1002
+ for line in query_input_fh:
1003
+ query_list.append(line.strip())
1004
+ for index, query in enumerate(query_list):
1005
+ query_name = str(Path(query).name)
1006
+ if extension != None:
1007
+ query_name = query_name.replace(extension, "")
1008
+ if query_proteins != None:
1009
+ query_file_names[query_name] = [None, query]
1010
+ elif query_genomes != None:
1011
+ query_file_names[query_name] = [query, query + '.faa']
1012
+
1013
+ # Then parse the references:
1014
+ reference_list = []
1015
+ reference_file_names = {}
1016
+ if same_inputs == True:
1017
+ pass
1018
+ else:
1019
+ # For bacterial genomes
1020
+ if virus == False:
1021
+ if reference_database != None:
1022
+ pass
1023
+ else:
1024
+ if input_paths == True:
1025
+ reference_list.append(reference_input)
1026
+ else:
1027
+ with open(reference_input, 'r') as reference_input_fh:
1028
+ for line in reference_input_fh:
1029
+ reference_list.append(line.strip())
1030
+ for index, reference in enumerate(reference_list):
1031
+ reference_name = str(Path(reference).name)
1032
+ if extension != None:
1033
+ reference_name = reference_name.replace(extension, "")
1034
+ if reference_hmms != None:
1035
+ reference_protein_list = []
1036
+ with open(reference_proteins, 'r') as reference_protein_fh:
1037
+ for line in reference_protein_fh:
1038
+ reference_protein_list.append(line.strip())
1039
+ reference_file_names[reference_name] = [None, reference_protein_list[index], reference, reference + '.filt']
1040
+ elif reference_proteins != None:
1041
+ reference_file_names[reference_name] = [None, reference, reference + '.hmm', reference + '.hmm.filt']
1042
+ elif query_genomes != None:
1043
+ reference_file_names[reference_name] = [reference, reference + '.faa', reference + '.faa.hmm', reference + '.faa.hmm.filt']
1044
+ # For viral genomes
1045
+ else:
1046
+ if reference_database != None:
1047
+ pass
1048
+ else:
1049
+ if input_paths == True:
1050
+ reference_list.append(reference_input)
1051
+ else:
1052
+ with open(reference_input, 'r') as reference_input_fh:
1053
+ for line in reference_input_fh:
1054
+ reference_list.append(line.strip())
1055
+ for index, reference in enumerate(reference_list):
1056
+ reference_name = str(Path(reference).name)
1057
+ if extension != None:
1058
+ reference_name = reference_name.replace(extension, "")
1059
+ if reference_proteins != None:
1060
+ reference_file_names[reference_name] = [None, reference]
1061
+ elif query_genomes != None:
1062
+ reference_file_names[reference_name] = [reference, reference + '.faa']
1063
+ # ------------------------------------------------------
1064
+
1065
+ # Pre-index and store databases
1066
+ # ------------------------------------------------------
1067
+ # Pre-index queries
1068
+ if query_kmer_dict == None:
1069
+ print("Processing queries...")
1070
+ # If using bacterial genomes
1071
+ if virus == False:
1072
+ if query_hmms != None:
1073
+ query_hmm_results = query_list
1074
+ elif query_proteins != None:
1075
+ query_protein_files = query_list
1076
+ print("Searching against HMM models...")
1077
+ try:
1078
+ pool = multiprocessing.Pool(threads)
1079
+ query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
1080
+ finally:
1081
+ pool.close()
1082
+ pool.join()
1083
+ elif query_genomes != None:
1084
+ print("Predicting proteins...")
1085
+ # Predict query proteins
1086
+ try:
1087
+ pool = multiprocessing.Pool(threads)
1088
+ query_protein_files = pool.map(run_prodigal, query_list)
1089
+ finally:
1090
+ pool.close()
1091
+ pool.join()
1092
+ print("Done!")
1093
+ print("Searching against HMM models...")
1094
+ # Run hmmsearch against proteins predicted
1095
+ try:
1096
+ pool = multiprocessing.Pool(threads)
1097
+ query_hmm_results = pool.map(run_hmmsearch, query_protein_files)
1098
+ finally:
1099
+ pool.close()
1100
+ pool.join()
1101
+ print("Done!")
1102
+ print("Filtering query hmmsearch results...")
1103
+ # Filter query HMM search results
1104
+ try:
1105
+ pool = multiprocessing.Pool(threads)
1106
+ pool.map(partial(hmm_filter, keep=keep), query_hmm_results)
1107
+ finally:
1108
+ pool.close()
1109
+ pool.join()
1110
+ print("Extracting kmers from query proteins...")
1111
+ # Finding kmers for all queries
1112
+ query_information = []
1113
+ for name, values in query_file_names.items():
1114
+ query_information.append((name, values[1], values[3]))
1115
+ try:
1116
+ pool = multiprocessing.Pool(threads)
1117
+ kmer_results = pool.map(kmer_extract, query_information)
1118
+ finally:
1119
+ pool.close()
1120
+ pool.join()
1121
+ query_kmer_dict = merge_dicts(kmer_results)
1122
+ del kmer_results
1123
+ # If using viral genomes
1124
+ else:
1125
+ if query_genomes != None:
1126
+ print("Predicting proteins...")
1127
+ # Predict query proteins
1128
+ try:
1129
+ pool = multiprocessing.Pool(threads)
1130
+ query_protein_files = pool.map(run_prodigal_virus, query_list)
1131
+ finally:
1132
+ pool.close()
1133
+ pool.join()
1134
+ print("Done!")
1135
+ elif query_proteins != None:
1136
+ query_protein_files = query_list
1137
+ print("Extracting kmers from query proteins...")
1138
+ query_information = []
1139
+ for name, values in query_file_names.items():
1140
+ query_information.append((name, values[1], 4))
1141
+ try:
1142
+ pool = multiprocessing.Pool(threads)
1143
+ kmer_results = pool.map(read_viral_kmers_from_file, query_information)
1144
+ finally:
1145
+ pool.close()
1146
+ pool.join()
1147
+ query_kmer_dict = merge_dicts(kmer_results)
1148
+ del kmer_results
1149
+
1150
+ # Pre-index references (if different from queries)
1151
+ if same_inputs == False and reference_kmer_dict == None:
1152
+ print("Processing references...")
1153
+ # If using bacterial genomes
1154
+ if virus == False:
1155
+ if reference_hmms != None:
1156
+ reference_hmm_results = reference_list
1157
+ elif reference_proteins != None:
1158
+ reference_protein_files = reference_list
1159
+ print("Searching against HMM models... ")
1160
+ try:
1161
+ pool = multiprocessing.Pool(threads)
1162
+ reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
1163
+ finally:
1164
+ pool.close()
1165
+ pool.join()
1166
+ if reference_genomes != None:
1167
+ print("Predicting proteins...")
1168
+ # Predict reference proteins
1169
+ try:
1170
+ pool = multiprocessing.Pool(threads)
1171
+ reference_protein_files = pool.map(run_prodigal, reference_list)
1172
+ finally:
1173
+ pool.close()
1174
+ pool.join()
1175
+ print("Done!")
1176
+ print("Searching against HMM models...")
1177
+ # Run hmmsearch against proteins predicted
1178
+ try:
1179
+ pool = multiprocessing.Pool(threads)
1180
+ reference_hmm_results = pool.map(run_hmmsearch, reference_protein_files)
1181
+ finally:
1182
+ pool.close()
1183
+ pool.join()
1184
+ print("Done!")
1185
+ print("Filtering reference hmmsearch results...")
1186
+ # Filter reference HMM search results
1187
+ try:
1188
+ pool = multiprocessing.Pool(threads)
1189
+ pool.map(partial(hmm_filter, keep=keep), reference_hmm_results)
1190
+ finally:
1191
+ pool.close()
1192
+ pool.join()
1193
+ print("Extracting kmers from reference proteins...")
1194
+ # Finding kmers for all references
1195
+ reference_information = []
1196
+ for name, values in reference_file_names.items():
1197
+ reference_information.append((name, values[1], values[3]))
1198
+ try:
1199
+ pool = multiprocessing.Pool(threads)
1200
+ kmer_results = pool.map(kmer_extract, reference_information)
1201
+ finally:
1202
+ pool.close()
1203
+ pool.join()
1204
+ reference_kmer_dict = merge_dicts(kmer_results)
1205
+ del kmer_results
1206
+ # If using viral genomes
1207
+ else:
1208
+ if query_genomes != None:
1209
+ print("Predicting proteins...")
1210
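+ # Predict proteins. NOTE(review): this viral *reference* branch tests query_genomes/query_proteins, maps over query_list and calls run_prodigal (the viral query branch uses run_prodigal_virus) -- looks like a copy-paste from the query branch; confirm.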
1211
+ try:
1212
+ pool = multiprocessing.Pool(threads)
1213
+ query_protein_files = pool.map(run_prodigal, query_list)
1214
+ finally:
1215
+ pool.close()
1216
+ pool.join()
1217
+ print("Done!")
1218
+ elif query_proteins != None:
1219
+ query_protein_files = query_list
1220
+ print("Extracting kmers from query proteins...")
1221
+ reference_information = []
1222
+ for name, values in reference_file_names.items():
1223
+ reference_information.append((name, values[1], 4))
1224
+ try:
1225
+ pool = multiprocessing.Pool(threads)
1226
+ kmer_results = pool.map(read_viral_kmers_from_file, reference_information)
1227
+ finally:
1228
+ pool.close()
1229
+ pool.join()
1230
+ reference_kmer_dict = merge_dicts(kmer_results)
1231
+ del kmer_results
1232
+ # ------------------------------------------------------
1233
+
1234
+ # Create the database(s) and compress it (them)
1235
+ # ------------------------------------------------------
1236
+ if same_inputs == True and query_database == None:
1237
+ print("Saving pre-indexed database...")
1238
+ query_database_name = query_input + '.db.gz'
1239
+ with gzip.open(query_database_name, 'wb') as database_handle:
1240
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1241
+ if same_inputs == False and query_database == None and reference_database == None:
1242
+ print("Saving pre-indexed databases...")
1243
+ query_database_name = query_input + '.db.gz'
1244
+ reference_database_name = reference_input + '.db.gz'
1245
+ with gzip.open(query_database_name, 'wb') as database_handle:
1246
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1247
+ with gzip.open(reference_database_name, 'wb') as database_handle:
1248
+ pickle.dump(reference_kmer_dict, database_handle, protocol=4)
1249
+ elif same_inputs == False and query_database == None:
1250
+ print("Saving pre-indexed query database...")
1251
+ query_database_name = query_input + '.db.gz'
1252
+ with gzip.open(query_database_name, 'wb') as database_handle:
1253
+ pickle.dump(query_kmer_dict, database_handle, protocol=4)
1254
+ elif same_inputs == False and reference_database == None:
1255
+ print("Saving pre-indexed reference database...")
1256
+ reference_database_name = reference_input + '.db.gz'
1257
+ with gzip.open(reference_database_name, 'wb') as database_handle:
1258
+ pickle.dump(reference_kmer_dict, database_handle, protocol=4)
1259
+ # ------------------------------------------------------
1260
+ # Calculate Jaccard distances
1261
+ # ------------------------------------------------------
1262
+ if index_db == True:
1263
+ print("Finished pre-indexing databases.")
1264
+ print("Next time you can run the program using only these files with --qd and(or) --rd.")
1265
+ else:
1266
+ print("Calculating shared kmer fraction...")
1267
+ if virus == False:
1268
+ if same_inputs == True:
1269
+ # Create global kmer index dictionary "global_kmer_index_dictionary"
1270
+ print(temporal_working_directory)
1271
+ global_unique_kmers([query_kmer_dict])
1272
+ query_kmer_dict, query_smart_args_tempdir = transform_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=True)
1273
+ print("Beginning FastAAI pairwise calculations now.")
1274
+ try:
1275
+ pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (query_kmer_dict,))
1276
+ Fraction_Results = pool.map(single_kaai_parser, query_smart_args_tempdir)
1277
+ finally:
1278
+ pool.close()
1279
+ pool.join()
1280
+ else:
1281
+ print(temporal_working_directory)
1282
+ global_unique_kmers([query_kmer_dict, reference_kmer_dict])
1283
+ query_kmer_dict, query_smart_args_tempdir = transform_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=False)
1284
+ reference_kmer_dict, _ref_smart_args_tempdir = transform_kmer_dicts_to_arrays(reference_kmer_dict, temporal_working_directory, single_dataset=False)
1285
+ print("Beginning FastAAI pairwise calculations now.")
1286
+ try:
1287
+ pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
1288
+ Fraction_Results = pool.map(double_kaai_parser, query_smart_args_tempdir)
1289
+ finally:
1290
+ pool.close()
1291
+ pool.join()
1292
+ else:
1293
+ if same_inputs == True:
1294
+ print(temporal_working_directory)
1295
+ global_unique_viral_kmers([query_kmer_dict])
1296
+ query_kmer_dict, query_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=True)
1297
+ print("Beginning FastAAI pairwise calculations now.")
1298
+ try:
1299
+ pool = multiprocessing.Pool(threads, initializer = single_dictionary_initializer, initargs = (query_kmer_dict,))
1300
+ Fraction_Results = pool.map(single_virus_kaai_parser, query_smart_args_tempdir)
1301
+ finally:
1302
+ pool.close()
1303
+ pool.join()
1304
+ else:
1305
+ print(temporal_working_directory)
1306
+ global_unique_viral_kmers([query_kmer_dict, reference_kmer_dict])
1307
+ query_kmer_dict, query_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(query_kmer_dict, temporal_working_directory, single_dataset=False)
1308
+ reference_kmer_dict, _ref_smart_args_tempdir = transform_viral_kmer_dicts_to_arrays(reference_kmer_dict, temporal_working_directory, single_dataset=False)
1309
+ print("Beginning FastAAI pairwise calculations now.")
1310
+ try:
1311
+ pool = multiprocessing.Pool(threads, initializer = two_dictionary_initializer, initargs = (query_kmer_dict, reference_kmer_dict))
1312
+ Fraction_Results = pool.map(double_viral_kaai_parser, query_smart_args_tempdir)
1313
+ finally:
1314
+ pool.close()
1315
+ pool.join()
1316
+ # ------------------------------------------------------
1317
+
1318
+ # Merge results into a single output
1319
+ # ------------------------------------------------------
1320
+ print("Merging results...")
1321
+ print(temporal_working_directory)
1322
+ with open(output, 'w') as outfile:
1323
+ for file in Fraction_Results:
1324
+ with open(file) as Temp:
1325
+ shutil.copyfileobj(Temp, outfile)
1326
+ file.unlink()
1327
+ print("FastAAI finishied correctly on {}".format(datetime.datetime.now()))
1328
+ # ------------------------------------------------------
1329
+ # If comparing viral genomes
1330
+
1331
+
1332
+
1333
+
1334
+
1335
+ if __name__ == "__main__":  # run the pipeline only when executed as a script, not when imported
1336
+ main()