miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,419 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license Artistic-2.0
5
+
6
+ require 'optparse'
7
+ require 'tmpdir'
8
+ require 'zlib'
9
+ has_rest_client = true
10
+ has_sqlite3 = true
11
+ begin
12
+ require 'rubygems'
13
+ require 'restclient'
14
+ rescue LoadError
15
+ has_rest_client = false
16
+ end
17
+ begin
18
+ require 'sqlite3'
19
+ rescue LoadError
20
+ has_sqlite3 = false
21
+ end
22
+
# Default values for every command-line option. The handlers registered on the
# parser below overwrite individual entries as flags are read from ARGV.
o = {
  bits: 0,
  id: 20,
  len: 0,
  hits: 50,
  q: false,
  bin: '',
  program: 'blast+',
  thr: 1,
  dec: 2,
  auto: false,
  lookupfirst: false,
  dbrbm: true,
  nucl: false,
  len_fraction: 0.0,
  max_actg: 0.95
}

# Calling the script without arguments is treated as a request for help.
ARGV.push('-h') if ARGV.empty?

# Declare the command-line interface and parse ARGV in place. Options are
# registered in the order in which they appear on the help screen.
OptionParser.new do |parser|
  parser.banner =
    "\nCalculates the Average Amino Acid Identity between two genomes\n" \
    "\nUsage: #{$0} [options]"

  parser.separator ''
  parser.separator 'Mandatory'
  parser.on(
    '-1', '--seq1 FILE',
    'Path to the FastA file (.gz allowed) containing the genome 1 (proteins)'
  ) { |path| o[:seq1] = path }
  parser.on(
    '-2', '--seq2 FILE',
    'Path to the FastA file (.gz allowed) containing the genome 2 (proteins)'
  ) { |path| o[:seq2] = path }
  # The NCBI hint depends on whether the optional rest-client gem was loaded.
  ncbi_note =
    if has_rest_client
      ' Alternatively, you can supply the NCBI-acc of a ' \
        'genome (nucleotides) with the format ncbi:CP014272 instead of files'
    else
      ' Install rest-client to enable NCBI-acc support'
    end
  parser.separator ncbi_note

  parser.separator ''
  parser.separator 'Search Options'
  parser.on(
    '-l', '--len INT', Integer,
    "Minimum alignment length (in residues). By default: #{o[:len]}"
  ) { |value| o[:len] = value }
  parser.on(
    '-L', '--len-fraction NUM', Float,
    'Minimum alignment length as a fraction of the shorter sequence',
    "(range 0-1). By default: #{o[:len_fraction]}"
  ) { |value| o[:len_fraction] = value }
  parser.on(
    '-i', '--id FLOAT', Float,
    "Minimum alignment identity (in %). By default: #{o[:id]}"
  ) { |value| o[:id] = value }
  parser.on(
    '-s', '--bitscore FLOAT', Float,
    "Minimum bit score (in bits). By default: #{o[:bits]}"
  ) { |value| o[:bits] = value }
  parser.on(
    '-n', '--hits INT', Integer,
    "Minimum number of hits. By default: #{o[:hits]}"
  ) { |value| o[:hits] = value }
  parser.on(
    '-N', '--nucl',
    'The input sequences are nucleotides (genes), not proteins'
  ) { |flag| o[:nucl] = flag }
  parser.on(
    '--max-actg FLOAT', Float,
    'Maximum fraction of ACTGN in the sequences before assuming nucleotides',
    "By default: #{o[:max_actg]}"
  ) { |value| o[:max_actg] = value }

  parser.separator ''
  parser.separator 'Software Options'
  parser.on(
    '-b', '--bin DIR',
    'Path to the directory containing the binaries of the search program'
  ) { |path| o[:bin] = path }
  parser.on(
    '-p', '--program STR',
    'Search program to be used. One of: blast+ (default), blast, blat, diamond'
  ) { |name| o[:program] = name }
  parser.on(
    '-t', '--threads INT', Integer,
    "Number of parallel threads to be used. By default: #{o[:thr]}"
  ) { |value| o[:thr] = value }

  parser.separator ''
  parser.separator 'SQLite3 Options'
  parser.separator ' Install sqlite3 gem to enable database support' unless
    has_sqlite3
  parser.on(
    '-S', '--sqlite3 FILE',
    'Path to the SQLite3 database to create (or update) with the results'
  ) { |path| o[:sqlite3] = path }
  parser.on(
    '--name1 STR',
    'Name of --seq1 to use in --sqlite3. By default determined by filename'
  ) { |name| o[:seq1name] = name }
  parser.on(
    '--name2 STR',
    'Name of --seq2 to use in --sqlite3. By default determined by filename'
  ) { |name| o[:seq2name] = name }
  parser.on(
    '--[no-]save-rbm',
    "Save (or don't save) the reciprocal best matches in the --sqlite3 db",
    "By default: #{o[:dbrbm]}"
  ) { |flag| o[:dbrbm] = flag }
  parser.on(
    '--lookup-first',
    'Indicates if the AAI should be looked up first in the database',
    'Requires --sqlite3, --auto, --name1, and --name2',
    'Incompatible with --res, --tab, --out, and --rbm'
  ) { |flag| o[:lookupfirst] = flag }

  parser.separator ''
  parser.separator 'Other Output Options'
  parser.on(
    '-d', '--dec INT', Integer,
    "Decimal positions to report. By default: #{o[:dec]}"
  ) { |value| o[:dec] = value }
  parser.on(
    '-R', '--rbm FILE',
    'Saves a file with the reciprocal best matches'
  ) { |path| o[:rbm] = path }
  parser.on(
    '-o', '--out FILE',
    'Saves a file describing the alignments used for two-way AAI'
  ) { |path| o[:out] = path }
  parser.on(
    '-r', '--res FILE', 'Saves a file with the final results'
  ) { |path| o[:res] = path }
  parser.on(
    '-T', '--tab FILE',
    'Saves a file with the final two-way results in a tab-delimited form',
    'The columns are (in that order):',
    'AAI, standard deviation, proteins used, proteins in the smallest genome'
  ) { |path| o[:tab] = path }
  parser.on(
    '-a', '--auto',
    'ONLY outputs the AAI value in STDOUT (or nothing, if calculation fails)'
  ) { o[:auto] = true }
  parser.on('-q', '--quiet', 'Run quietly (no STDERR output)') do
    o[:q] = true
  end
  # Print the generated help screen and stop.
  parser.on('-h', '--help', 'Display this screen') do
    puts parser
    exit
  end
  parser.separator ''
end.parse!
158
+
# Check input
# Both sequence sources are required.
abort '-1 is mandatory' if o[:seq1].nil?
abort '-2 is mandatory' if o[:seq2].nil?

# The search program is matched case-insensitively where it is dispatched
# (`o[:program].downcase`), so the same normalization is applied here.
# Otherwise a value such as `-p Diamond` combined with -N would pass this
# check and run diamond on nucleotide input.
if o[:nucl] && o[:program].downcase == 'diamond'
  abort '-p diamond is incompatible with -N'
end

# A database was requested but the sqlite3 gem could not be loaded.
if !o[:sqlite3].nil? && !has_sqlite3
  abort 'SQLite3 requested (-S) but sqlite3 not supported: gem install sqlite3'
end

# The binaries directory is used as a plain prefix for executable names, so it
# must end in exactly one path separator (none is added if already present).
o[:bin] += '/' unless o[:bin].empty? || o[:bin].end_with?('/')

# --lookup-first only reads a stored value, so it needs the database, both
# dataset names and --auto, and cannot be combined with file outputs.
if o[:lookupfirst]
  abort '--lookup-first requires --name1' if o[:seq1name].nil?
  abort '--lookup-first requires --name2' if o[:seq2name].nil?
  abort '--lookup-first needs --sqlite3' if o[:sqlite3].nil?
  abort '--lookup-first requires --auto' unless o[:auto]
  %i[res tab out rbm].each do |key|
    abort "--lookup-first conflicts with --#{key}" unless o[key].nil?
  end
end
178
+
# Create SQLite3 file
# Opens the database given by --sqlite3 and makes sure that both result tables
# exist: `rbm` (reciprocal best matches) and `aai` (summary values).
unless o[:sqlite3].nil?
  $stderr.puts "Accessing SQLite3 file: #{o[:sqlite3]}." unless o[:q]
  sqlite_db = SQLite3::Database.new(o[:sqlite3])
  {
    'rbm' => [
      'seq1 varchar(256)', 'seq2 varchar(256)', 'id1 varchar(256)',
      'id2 varchar(256)', 'id float', 'evalue float', 'bitscore float'
    ],
    'aai' => [
      'seq1 varchar(256)', 'seq2 varchar(256)', 'aai float', 'sd float',
      'n int', 'omega int'
    ]
  }.each do |table, columns|
    sqlite_db.execute(
      "create table if not exists #{table}( #{columns.join(', ')} )"
    )
  end
end
189
+
# Look-up first
# With --lookup-first, try to answer from the `aai` table before computing
# anything. The pair is searched in the given order and, if absent, in the
# reverse order. A stored value is printed and the script ends.
if o[:lookupfirst]
  val = sqlite_db.execute(
    'select aai from aai where seq1=? and seq2=?',
    [o[:seq1name], o[:seq2name]]
  )
  if val.empty?
    val = sqlite_db.execute(
      'select aai from aai where seq1=? and seq2=?',
      [o[:seq2name], o[:seq1name]]
    )
  end
  if !val.empty?
    puts val.first.first
    exit
  end
end
201
+
202
+ Dir.mktmpdir do |dir|
203
+ $stderr.puts "Temporal directory: #{dir}." unless o[:q]
204
+
205
+ # Create databases.
206
+ $stderr.puts "Creating databases." unless o[:q]
207
+ minfrg = nil
208
+ seq_names = []
209
+ seq_len = {}
210
+ actg_cnt = {}
211
+ ori_ids = {}
212
+ [:seq1, :seq2].each do |seq|
213
+ abort "GIs are no longer supported by NCBI. Please use NCBI-acc instead." if
214
+ /^gi:/.match(o[seq])
215
+ acc = /^ncbi:(\S+)/.match(o[seq])
216
+ unless acc.nil?
217
+ abort "NCBI-acc requested, but rest-client not supported. First " +
218
+ "install gem rest-client." unless has_rest_client
219
+ abort "NCBI-acc are currently not supported with --nucl. Please use " +
220
+ "ani.rb instead." if o[:nucl]
221
+ $stderr.puts " Downloading dataset from NCBI:#{acc[1]}." unless o[:q]
222
+ responseLink = RestClient.get(
223
+ "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
224
+ {params:{db:"protein",dbfrom:"nuccore",id:acc[1],idtype:"acc"}})
225
+ abort "Unable to reach NCBI EUtils, error code " +
226
+ responseLink.code.to_s + "." unless responseLink.code == 200
227
+ fromId = true
228
+ protIds = []
229
+ o[seq] = "#{dir}/ncbi-#{seq.to_s}.fa"
230
+ fo = File.open(o[seq], "w")
231
+ responseLink.to_str.each_line.grep(/\s<Id>/) do |ln|
232
+ idMatch = /<Id>(\S+)<\/Id>/.match(ln)
233
+ unless idMatch.nil?
234
+ protIds.push(idMatch[1]) unless fromId
235
+ fromId = false
236
+ end
237
+ end
238
+ response = RestClient.post(
239
+ 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
240
+ db: 'protein', rettype: 'fasta', id: protIds.join(','), idtype: 'acc'
241
+ )
242
+ abort "Unable to reach NCBI EUtils, error code " +
243
+ response.code.to_s + "." unless response.code == 200
244
+ fo.puts response.to_str
245
+ fo.close
246
+ seq_names << ( o[ "#{seq}name".to_sym ].nil? ?
247
+ "ncbi:#{acc[1]}" :
248
+ o[ "#{seq}name".to_sym ])
249
+ else
250
+ seq_names << ( o[ "#{seq}name".to_sym ].nil? ?
251
+ File.basename(o[seq], ".*") :
252
+ o[ "#{seq}name".to_sym ])
253
+ end
254
+ $stderr.puts " Reading FastA file: #{o[seq]}" unless o[:q]
255
+ unless o[:sqlite3].nil?
256
+ sqlite_db.execute "delete from rbm where seq1=? and seq2=?", seq_names
257
+ sqlite_db.execute "delete from aai where seq1=? and seq2=?", seq_names
258
+ end
259
+ ori_ids[seq] = [nil]
260
+ seq_len[seq] = [0]
261
+ actg_cnt[seq] = 0
262
+ seqs = 0
263
+ fi = File.extname(o[seq]) == '.gz' ?
264
+ Zlib::GzipReader.open(o[seq]) :
265
+ File.open(o[seq], 'r')
266
+ File.open("#{dir}/#{seq.to_s}.fa", 'w') do |fo|
267
+ fi.each_line do |ln|
268
+ if ln =~ /^>(\S+)/
269
+ seqs += 1
270
+ ori_ids[seq] << $1 unless o[:rbm].nil? and o[:sqlite3].nil?
271
+ seq_len[seq][seqs] = 0
272
+ fo.puts ">#{seqs}"
273
+ else
274
+ fo.puts ln
275
+ seq_len[seq][seqs] += ln.chomp.gsub(/[^A-Za-z]/,"").length
276
+ actg_cnt[seq] += ln.chomp.gsub(/[^ACTGNactgn]/,"").length
277
+ end
278
+ end
279
+ end
280
+ fi.close
281
+ unless o[:nucl]
282
+ actg_frx = actg_cnt[seq].to_f/seq_len[seq].inject(:+).to_f
283
+ abort "Input sequences appear to be nucleotides " +
284
+ "(ACTGN fraction: %.2f%%)." % (actg_frx*100) if actg_frx > o[:max_actg]
285
+ end
286
+ $stderr.puts " File contains #{seqs} sequences." unless o[:q]
287
+ minfrg ||= seqs
288
+ minfrg = seqs if minfrg > seqs
289
+ case o[:program].downcase
290
+ when "blast"
291
+ `"#{o[:bin]}formatdb" -i "#{dir}/#{seq}.fa" \
292
+ -p #{o[:nucl] ? "F" : "T"}`
293
+ when "blast+"
294
+ `"#{o[:bin]}makeblastdb" -in "#{dir}/#{seq}.fa" \
295
+ -dbtype #{o[:nucl] ? "nucl" : "prot"}`
296
+ when "blat"
297
+ # Nothing to do
298
+ when "diamond"
299
+ `"#{o[:bin]}diamond" makedb --in "#{dir}/#{seq}.fa" \
300
+ --db "#{dir}/#{seq}.fa.dmnd" --threads "#{o[:thr]}" \
301
+ --quiet`
302
+ else
303
+ abort "Unsupported program: #{o[:program]}."
304
+ end
305
+ end
306
+
307
+ # Best-hits.
308
+ $stderr.puts "Running one-way comparisons." unless o[:q]
309
+ rbh = []
310
+ id2 = 0
311
+ sq2 = 0
312
+ n2 = 0
313
+ unless o[:out].nil?
314
+ fo = File.open(o[:out], "w")
315
+ fo.puts %w(identity aln.len mismatch gap.open evalue bitscore).join("\t")
316
+ end
317
+ res = File.open(o[:res], "w") unless o[:res].nil?
318
+ rbm = File.open(o[:rbm], "w") unless o[:rbm].nil?
319
+ [1,2].each do |i|
320
+ qry_seen = []
321
+ q = "#{dir}/seq#{i}.fa"
322
+ s = "#{dir}/seq#{i==1?2:1}.fa"
323
+ case o[:program].downcase
324
+ when "blast"
325
+ `"#{o[:bin]}blastall" -p blast#{o[:nucl] ? "n": "p"} -d "#{s}" \
326
+ -i "#{q}" -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{dir}/#{i}.tab"`
327
+ when "blast+"
328
+ `"#{o[:bin]}blast#{o[:nucl] ? "n" : "p"}" -db "#{s}" -query "#{q}" \
329
+ -max_target_seqs 1 -num_threads #{o[:thr]} -outfmt 6 \
330
+ -out "#{dir}/#{i}.tab"`
331
+ when "blat"
332
+ `"#{o[:bin]}blat" "#{s}" "#{q}" #{"-prot" unless o[:nucl]} -out=blast8 \
333
+ "#{dir}/#{i}.tab.uns"`
334
+ `sort -k 1 "#{dir}/#{i}.tab.uns" > "#{dir}/#{i}.tab"`
335
+ when "diamond"
336
+ `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
337
+ --query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" --quiet \
338
+ && "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt 6 \
339
+ --out "#{dir}/#{i}.tab" --quiet`
340
+ else
341
+ abort "Unsupported program: #{o[:program]}."
342
+ end
343
+ fh = File.open("#{dir}/#{i}.tab", "r")
344
+ id = 0
345
+ sq = 0
346
+ n = 0
347
+ fh.each_line do |ln|
348
+ ln.chomp!
349
+ row = ln.split(/\t/)
350
+ next unless qry_seen[ row[0].to_i ].nil?
351
+ next if row[3].to_i < o[:len] and
352
+ next if row[2].to_f < o[:id]
353
+ next if row[11].to_f < o[:bits]
354
+ next if row[3].to_f/[
355
+ seq_len[i==1 ? :seq1 : :seq2][row[0].to_i],
356
+ seq_len[i==1 ? :seq2 : :seq1][row[1].to_i]
357
+ ].min < o[:len_fraction]
358
+ qry_seen[ row[0].to_i ] = 1
359
+ id += row[2].to_f
360
+ sq += row[2].to_f ** 2
361
+ n += 1
362
+ if i==1
363
+ rbh[ row[0].to_i ] = row[1].to_i
364
+ else
365
+ if !rbh[ row[1].to_i ].nil? and rbh[ row[1].to_i ]==row[0].to_i
366
+ id2 += row[2].to_f
367
+ sq2 += row[2].to_f**2
368
+ n2 += 1
369
+ fo.puts [row[2..5],row[10..11]].join("\t") unless o[:out].nil?
370
+ rbm.puts [ori_ids[:seq1][row[1].to_i],
371
+ ori_ids[:seq2][row[0].to_i], row[2..5], row[8..9],
372
+ row[6..7], row[10..11]].join("\t") unless o[:rbm].nil?
373
+ sqlite_db.execute("insert into rbm values(?,?,?,?,?,?,?)",
374
+ seq_names + [ori_ids[:seq1][row[1].to_i],
375
+ ori_ids[:seq2][row[0].to_i], row[2], row[10], row[11]]
376
+ ) if not o[:sqlite3].nil? and o[:dbrbm]
377
+ end
378
+ end
379
+ end
380
+ fh.close
381
+ if n < o[:hits]
382
+ puts "Insuffient hits to estimate one-way AAI: #{n}." unless o[:auto]
383
+ res.puts "Insufficient hits to estimate one-way AAI: #{n}" unless
384
+ o[:res].nil?
385
+ else
386
+ printf "! One-way AAI %d: %.#{o[:dec]}f%% (SD: %.#{o[:dec]}f%%), " +
387
+ "from %i proteins.\n", i, id/n, (sq/n - (id/n)**2)**0.5, n unless
388
+ o[:auto]
389
+ res.puts sprintf "<b>One-way AAI %d:</b> %.#{o[:dec]}f%% " +
390
+ "(SD: %.#{o[:dec]}f%%), from %i proteins.<br/>", i, id/n,
391
+ (sq/n - (id/n)**2)**0.5, n unless o[:res].nil?
392
+ end
393
+ end
394
+ rbm.close unless o[:rbm].nil?
395
+ if n2 < o[:hits]
396
+ puts "Insufficient hits to estimate two-way AAI: #{n2}" unless o[:auto]
397
+ res.puts "Insufficient hits to estimate two-way AAI: #{n2}" unless
398
+ o[:res].nil?
399
+ else
400
+ printf "! Two-way AAI : %.#{o[:dec]}f%% (SD: %.#{o[:dec]}f%%), from %i" +
401
+ " proteins.\n", id2/n2, (sq2/n2 - (id2/n2)**2)**0.5, n2 unless o[:auto]
402
+ res.puts sprintf "<b>Two-way AAI:</b> %.#{o[:dec]}f%% (SD: " +
403
+ "%.#{o[:dec]}f%%), from %i proteins.<br/>", id2/n2,
404
+ (sq2/n2 - (id2/n2)**2)**0.5, n2 unless o[:res].nil?
405
+ unless o[:tab].nil?
406
+ tab = File.open(o[:tab], "w")
407
+ tab.printf "%.#{o[:dec]}f\t%.#{o[:dec]}f\t%i\t%i\n", id2/n2,
408
+ (sq2/n2 - (id2/n2)**2)**0.5, n2, minfrg
409
+ tab.close
410
+ end
411
+ sqlite_db.execute("insert into aai values(?,?,?,?,?,?)",
412
+ seq_names + [id2/n2, (sq2/n2 - (id2/n2)**2)**0.5, n2, minfrg]) unless
413
+ o[:sqlite3].nil?
414
+ puts id2/n2 if o[:auto]
415
+ end
416
+ res.close unless o[:res].nil?
417
+ fo.close unless o[:out].nil?
418
+ end
419
+
@@ -0,0 +1,362 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license Artistic-2.0
5
+
6
+ require "optparse"
7
+ require "tmpdir"
8
+ has_rest_client = true
9
+ has_sqlite3 = true
10
+ begin
11
+ require "rubygems"
12
+ require "restclient"
13
+ rescue LoadError
14
+ has_rest_client = false
15
+ end
16
+ begin
17
+ require "sqlite3"
18
+ rescue LoadError
19
+ has_sqlite3 = false
20
+ end
21
+
22
# Default settings. Each command-line switch defined below overwrites one of
# these entries; keys without a default (:seq1, :seq2, :sqlite3, :out, ...)
# stay unset (nil) unless the user provides them.
o = {
  win: 1000, step: 200, id: 70, len: 700, correct: true, hits: 50,
  q: false, bin: "", program: "blast+", thr: 1, dec: 2, auto: false,
  lookupfirst: false, dbregions: true, dbrbm: true, min_actg: 0.95
}

# Called without any argument: behave as if help had been requested.
ARGV << "-h" if ARGV.empty?

parser = OptionParser.new
parser.banner = "\nCalculates the Average Nucleotide Identity between two " \
  "genomes.\n\nUsage: #{$0} [options]"

# --- Input genomes ---------------------------------------------------------
parser.separator ""
parser.separator "Mandatory"
parser.on(
  "-1", "--seq1 FILE", "Path to the FastA file containing the genome 1."
) do |v|
  o[:seq1] = v
end
parser.on(
  "-2", "--seq2 FILE", "Path to the FastA file containing the genome 2."
) do |v|
  o[:seq2] = v
end
# The hint shown depends on whether the optional rest-client gem was loaded.
if has_rest_client
  parser.separator " Alternatively, you can supply a NCBI-acc with the " \
    "format ncbi:CP014272 instead of files."
else
  parser.separator " Install rest-client to enable NCBI-acc support."
end

# --- Search thresholds -----------------------------------------------------
parser.separator ""
parser.separator "Search Options"
parser.on(
  "-w", "--win INT",
  "Window size in the ANI calculation (in bp). By default: #{o[:win]}."
) do |v|
  o[:win] = v.to_i
end
parser.on(
  "-s", "--step INT",
  "Step size in the ANI calculation (in bp). By default: #{o[:step]}."
) do |v|
  o[:step] = v.to_i
end
parser.on(
  "-l", "--len INT",
  "Minimum alignment length (in bp). By default: #{o[:len]}."
) do |v|
  o[:len] = v.to_i
end
parser.on(
  "-i", "--id NUM",
  "Minimum alignment identity (in %). By default: #{o[:id]}."
) do |v|
  o[:id] = v.to_f
end
parser.on(
  "-n", "--hits INT",
  "Minimum number of hits. By default: #{o[:hits]}."
) do |v|
  o[:hits] = v.to_i
end
parser.on(
  "-N", "--nocorrection", "Report values without post-hoc correction."
) do
  o[:correct] = false
end
parser.on(
  "--min-actg FLOAT",
  "Minimum fraction of ACTGN in the sequences before assuming proteins.",
  "By default: #{o[:min_actg]}."
) do |v|
  o[:min_actg] = v.to_f
end

# --- External search software ----------------------------------------------
parser.separator ""
parser.separator "Software Options"
parser.on(
  "-b", "--bin DIR",
  "Path to the directory containing the binaries of the search program."
) do |v|
  o[:bin] = v
end
parser.on(
  "-p", "--program STR",
  "Search program to be used. One of: blast+ (default), blast, blat."
) do |v|
  o[:program] = v
end
parser.on(
  "-t", "--threads INT",
  "Number of parallel threads to be used. By default: #{o[:thr]}."
) do |v|
  o[:thr] = v.to_i
end

# --- SQLite3 storage -------------------------------------------------------
parser.separator ""
parser.separator "SQLite3 Options"
parser.on(
  "-S", "--sqlite3 FILE",
  "Path to the SQLite3 database to create (or update) with the results."
) do |v|
  o[:sqlite3] = v
end
unless has_sqlite3
  parser.separator " Install sqlite3 gem to enable database support."
end
parser.on(
  "--name1 STR",
  "Name of --seq1 to use in --sqlite3. By default determined by filename."
) do |v|
  o[:seq1name] = v
end
parser.on(
  "--name2 STR",
  "Name of --seq2 to use in --sqlite3. By default determined by filename."
) do |v|
  o[:seq2name] = v
end
parser.on(
  "--[no-]save-regions",
  "Save (or don't save) the fragments in the --sqlite3 database.",
  "By default: #{o[:dbregions]}."
) do |v|
  o[:dbregions] = v ? true : false
end
parser.on(
  "--[no-]save-rbm",
  "Save (or don't save) the reciprocal best matches in the --sqlite3 db.",
  "By default: #{o[:dbrbm]}."
) do |v|
  o[:dbrbm] = v ? true : false
end
parser.on(
  "--lookup-first",
  "Indicates if the ANI should be looked up first in the database.",
  "Requires --sqlite3, --auto, --name1, and --name2.",
  "Incompatible with --res, --tab, and --out."
) do |v|
  o[:lookupfirst] = v
end

# --- Reporting -------------------------------------------------------------
parser.separator ""
parser.separator "Other Output Options"
parser.on(
  "-d", "--dec INT", "Decimal positions to report. By default: #{o[:dec]}"
) do |v|
  o[:dec] = v.to_i
end
parser.on(
  "-o", "--out FILE",
  "Saves a file describing the alignments used for two-way ANI."
) do |v|
  o[:out] = v
end
parser.on("-r", "--res FILE", "Saves a file with the final results.") do |v|
  o[:res] = v
end
parser.on(
  "-T", "--tab FILE",
  "Saves a file with the final two-way results in a tab-delimited form.",
  "The columns are (in that order):",
  "ANI, standard deviation, fragments used, fragments in the smallest genome."
) do |v|
  o[:tab] = v
end
parser.on(
  "-a", "--auto",
  "ONLY outputs the ANI value in STDOUT (or nothing, if calculation fails)."
) do
  o[:auto] = true
end
parser.on("-q", "--quiet", "Run quietly (no STDERR output)") do
  o[:q] = true
end
parser.on("-h", "--help", "Display this screen") do
  puts parser
  exit
end
parser.separator ""

# Consume the recognised switches from ARGV, filling in `o`.
parser.parse!
125
# Sanity checks on the parsed options. Each failed check terminates the script
# through `abort` (message on STDERR, non-zero exit status).
abort "-1 is mandatory" if o[:seq1].nil?
abort "-2 is mandatory" if o[:seq2].nil?
abort "SQLite3 requested (-S) but sqlite3 not supported. First install gem " \
  "sqlite3." unless o[:sqlite3].nil? || has_sqlite3

# Window and step are parsed with to_i, so a non-numeric value becomes 0.
# A step of zero would never shorten the sequence buffer while fragments are
# being cut, leaving the fragmentation loop spinning forever; a non-positive
# window is equally meaningless. Reject both up front.
abort "Window size must be a positive integer." unless o[:win] > 0
abort "Step size must be a positive integer." unless o[:step] > 0
abort "Step size must be smaller than window size." if o[:step] > o[:win]

# The binaries directory is used as a plain string prefix of each executable
# name, so it must end in a slash (unless it is empty or already has one).
o[:bin] += "/" unless o[:bin].empty? || o[:bin].end_with?("/")

# --lookup-first only prints a cached value, so it needs the database, both
# names to query it, --auto output, and no per-run report files.
if o[:lookupfirst]
  abort "--lookup-first needs --sqlite3" if o[:sqlite3].nil?
  abort "--lookup-first requires --auto" unless o[:auto]
  abort "--lookup-first requires --name1" if o[:seq1name].nil?
  abort "--lookup-first requires --name2" if o[:seq2name].nil?
  abort "--lookup-first conflicts with --res" unless o[:res].nil?
  abort "--lookup-first conflicts with --tab" unless o[:tab].nil?
  abort "--lookup-first conflicts with --out" unless o[:out].nil?
end
140
+
141
# Open the SQLite3 results database when one was requested and make sure the
# three tables this script reads and writes (regions, rbm, ani) exist.
if o[:sqlite3]
  $stderr.puts "Accessing SQLite3 file: #{o[:sqlite3]}." unless o[:q]
  sqlite_db = SQLite3::Database.new(o[:sqlite3])
  schema = [
    "create table if not exists regions( seq varchar(256), id int, " \
      "source varchar(256), `start` int, `end` int )",
    "create table if not exists rbm( seq1 varchar(256), seq2 varchar(256), " \
      "id1 int, id2 int, id float, evalue float, bitscore float )",
    "create table if not exists ani( seq1 varchar(256), seq2 varchar(256), " \
      "ani float, sd float, n int, omega int )"
  ]
  schema.each { |ddl| sqlite_db.execute(ddl) }
end
154
+
155
# Look-up first: when requested, try to answer from the database and stop.
# The pair is searched as (name1, name2) and, only if that finds nothing, as
# (name2, name1). The first stored value found is printed and the script
# exits; if neither order is present, execution simply continues.
if o[:lookupfirst]
  lookup_sql = "select ani from ani where seq1=? and seq2=?"
  names = [o[:seq1name], o[:seq2name]]
  [names, names.reverse].each do |pair|
    rows = sqlite_db.execute(lookup_sql, pair)
    next if rows.empty?

    puts rows.first.first
    exit
  end
end
166
+
167
# Main pipeline. All intermediate files live in a scratch directory that
# Dir.mktmpdir removes when the block finishes.
#
#   1. Each input genome is cut into overlapping windows (o[:win] long, moved
#      forward o[:step] at a time), written as numbered FastA records, and
#      indexed for the selected search program.
#   2. The windows of each genome are searched against the other genome; the
#      first hit per window passing the length/identity thresholds feeds the
#      one-way ANI.
#   3. Hits that are reciprocal between both directions feed the two-way ANI,
#      which is printed and optionally stored (--res, --tab, --out, --sqlite3).
Dir.mktmpdir do |dir|
  $stderr.puts "Temporal directory: #{dir}." unless o[:q]

  # Mean and standard deviation from a running sum, sum of squares and count.
  # The variance is clamped at zero: rounding can leave it marginally negative
  # when all identities are (almost) equal, and a negative Float raised to 0.5
  # is a Complex, which printf/sprintf cannot render with %f.
  mean_sd = lambda do |sum, sum_sq, count|
    mean = sum / count
    variance = sum_sq / count - mean**2
    [mean, Math.sqrt([variance, 0.0].max)]
  end

  use_db = !o[:sqlite3].nil?
  save_regions = use_db && o[:dbregions]
  save_rbm = use_db && o[:dbrbm]
  program = o[:program].downcase
  dec = o[:dec]

  # Create databases.
  $stderr.puts "Creating databases." unless o[:q]
  minfrg = nil     # Fragments in the genome yielding the fewest of them
  seq_names = []   # Names of genome 1 and 2, used as database keys
  seq_len = {}     # Letters read per genome
  actg_cnt = {}    # Letters per genome that are one of A, C, T, G or N
  [:seq1, :seq2].each do |seq|
    abort "GIs are no longer supported by NCBI. Please use NCBI-acc instead" if
      /^gi:/.match(o[seq])

    custom_name = o["#{seq}name".to_sym]
    acc = /^ncbi:(\S+)/.match(o[seq])
    if acc
      # Download the accession into the scratch directory and use that file
      # as the input from here on.
      abort "NCBI-acc requested but rest-client not supported. First " \
        "install gem rest-client." unless has_rest_client
      response = RestClient.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        { params: { db: "nuccore", rettype: "fasta", id: acc[1],
                    idtype: "acc" } }
      )
      abort "Unable to reach NCBI EUtils, error code #{response.code}." unless
        response.code == 200
      o[seq] = "#{dir}/ncbi-#{seq}.fa"
      File.open(o[seq], "w") { |download| download.puts response.to_str }
      seq_names << (custom_name || "ncbi:#{acc[1]}")
    else
      seq_names << (custom_name || File.basename(o[seq], ".*"))
    end

    $stderr.puts " Reading FastA file: #{o[seq]}" unless o[:q]
    sqlite_db.execute("delete from regions where seq=?",
      [seq_names.last]) if use_db

    buffer = ""   # Letters of the current sequence not yet moved past
    frgs = 0      # Fragments written for this genome
    seqs = 0      # FastA records seen
    disc = 0      # Letters dropped (all-N windows, sequence leftovers)
    seqn = ""     # Identifier of the current FastA record
    from = 1      # 1-based start of the next window within that record
    seq_len[seq] = 0
    actg_cnt[seq] = 0
    File.open(o[seq], "r") do |genome|
      File.open("#{dir}/#{seq}.fa", "w") do |fragments|
        genome.each_line do |ln|
          if ln =~ /^>(\S+)/
            # New record: whatever is left of the previous one is too short
            # for a window and is dropped.
            seqs += 1
            disc += buffer.size
            buffer = ""
            seqn = $1
            from = 1
          else
            ln.gsub!(/[^A-Za-z]/, '')
            seq_len[seq] += ln.length
            actg_cnt[seq] += ln.count("ACTGNactgn")
            buffer += ln
            while buffer.size > o[:win]
              seq_i = buffer[0, o[:win]]
              if seq_i =~ /^N+$/
                disc += seq_i.size
              else
                frgs += 1
                fragments.puts ">#{frgs}"
                fragments.puts seq_i
                # The end coordinate is recorded as start + window size.
                sqlite_db.execute("insert into regions values(?,?,?,?,?)",
                  [seq_names.last, frgs, seqn, from, from + o[:win]]
                ) if save_regions
              end
              # The window slides by the step size, so the start coordinate of
              # the next fragment has to move by that same amount.
              buffer = buffer[o[:step]..-1]
              from += o[:step]
            end
          end
        end
      end
    end
    # The leftover of the last record is dropped like that of any other one.
    disc += buffer.size

    # 0/0 (empty input) yields NaN, which never compares below the threshold.
    actg_frx = actg_cnt[seq].to_f / seq_len[seq].to_f
    abort format(
      "Input sequences appear to be proteins (ACTGN fraction: %.2f%%).",
      actg_frx * 100
    ) if actg_frx < o[:min_actg]
    $stderr.puts " Created #{frgs} fragments from #{seqs} sequences, " \
      "discarded #{disc} bp." unless o[:q]
    minfrg = [minfrg, frgs].compact.min

    case program
    when "blast"
      `"#{o[:bin]}formatdb" -i "#{dir}/#{seq}.fa" -p F`
    when "blast+"
      `"#{o[:bin]}makeblastdb" -in "#{dir}/#{seq}.fa" -dbtype nucl`
    when "blat"
      # Nothing to do
    else
      abort "Unsupported program: #{o[:program]}."
    end
  end # [:seq1, :seq2].each

  # Best-hits.
  $stderr.puts "Running one-way comparisons." unless o[:q]
  rbh = []   # rbh[fragment of genome 1] = its hit in genome 2
  id2 = 0    # Two-way running sum of identities
  sq2 = 0    # Two-way running sum of squared identities
  n2 = 0     # Two-way number of reciprocal matches
  if use_db
    sqlite_db.execute "delete from rbm where seq1=? and seq2=?", seq_names
    sqlite_db.execute "delete from ani where seq1=? and seq2=?", seq_names
  end
  aln_out = o[:out].nil? ? nil : File.open(o[:out], "w")
  res = o[:res].nil? ? nil : File.open(o[:res], "w")
  begin
    unless aln_out.nil?
      aln_out.puts %w(identity aln.len mismatch gap.open evalue
        bitscore).join("\t")
    end

    [1, 2].each do |i|
      qry_seen = []   # Queries that already contributed a hit
      q = "#{dir}/seq#{i}.fa"
      s = "#{dir}/seq#{i == 1 ? 2 : 1}.fa"
      hits_tab = "#{dir}/#{i}.tab"
      case program
      when "blast"
        `"#{o[:bin]}blastall" -p blastn -d "#{s}" -i "#{q}" \
          -F F -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{hits_tab}"`
      when "blast+"
        `"#{o[:bin]}blastn" -db "#{s}" -query "#{q}" \
          -dust no -max_target_seqs 1 \
          -num_threads #{o[:thr]} -outfmt 6 -out "#{hits_tab}"`
      when "blat"
        `"#{o[:bin]}blat" "#{s}" "#{q}" -out=blast8 "#{hits_tab}"`
      else
        abort "Unsupported program: #{o[:program]}."
      end

      id = 0   # One-way running sum of identities
      sq = 0   # One-way running sum of squared identities
      n = 0    # One-way number of hits used
      # Tabular columns used: 0 query, 1 subject, 2 identity (%), 3 alignment
      # length, 4-5 mismatches and gap openings, 10 e-value, 11 bit score.
      File.foreach(hits_tab) do |ln|
        row = ln.chomp.split(/\t/)
        qry = row[0].to_i
        sbj = row[1].to_i
        next unless qry_seen[qry].nil? && row[3].to_i >= o[:len] &&
          row[2].to_f >= o[:id]

        qry_seen[qry] = 1
        identity_corr =
          100 - (100 - row[2].to_f) / (o[:correct] ? 0.8621 : 1.0)
        id += identity_corr
        sq += identity_corr**2
        n += 1
        if i == 1
          rbh[qry] = sbj
        elsif rbh[sbj] == qry
          # Reciprocal best match: the genome-1 fragment hit in this direction
          # had this very query as its own hit in the first direction.
          id2 += identity_corr
          sq2 += identity_corr**2
          n2 += 1
          aln_out.puts [identity_corr, row[3..5],
            row[10..11]].join("\t") unless aln_out.nil?
          sqlite_db.execute("insert into rbm values(?,?,?,?,?,?,?)",
            seq_names + [row[1], row[0], row[2], row[10], row[11]]
          ) if save_rbm
        end
      end

      # A zero count is always insufficient, whatever --hits says: there is
      # nothing to average.
      if n.zero? || n < o[:hits]
        puts "Insufficient hits to estimate one-way ANI: #{n}." unless o[:auto]
        res.puts "Insufficient hits to estimate one-way ANI: #{n}" unless
          res.nil?
      else
        one_way, one_way_sd = mean_sd.call(id, sq, n)
        printf "! One-way ANI %d: %.#{dec}f%% (SD: %.#{dec}f%%), " \
          "from %i fragments.\n", i, one_way, one_way_sd, n unless o[:auto]
        res.puts sprintf("<b>One-way ANI %d:</b> %.#{dec}f%% " \
          "(SD: %.#{dec}f%%), from %i fragments.<br/>",
          i, one_way, one_way_sd, n) unless res.nil?
      end
    end # [1, 2].each

    if n2.zero? || n2 < o[:hits]
      puts "Insufficient hits to estimate two-way ANI: #{n2}" unless o[:auto]
      res.puts "Insufficient hits to estimate two-way ANI: #{n2}" unless
        res.nil?
    else
      ani, ani_sd = mean_sd.call(id2, sq2, n2)
      printf "! Two-way ANI : %.#{dec}f%% (SD: %.#{dec}f%%), " \
        "from %i fragments.\n", ani, ani_sd, n2 unless o[:auto]
      res.puts sprintf("<b>Two-way ANI:</b> %.#{dec}f%% " \
        "(SD: %.#{dec}f%%), from %i fragments.<br/>",
        ani, ani_sd, n2) unless res.nil?
      unless o[:tab].nil?
        File.open(o[:tab], "w") do |tab|
          tab.printf "%.#{dec}f\t%.#{dec}f\t%i\t%i\n", ani, ani_sd, n2, minfrg
        end
      end
      sqlite_db.execute("insert into ani values(?,?,?,?,?,?)",
        seq_names + [ani, ani_sd, n2, minfrg]) if use_db
      puts ani if o[:auto]
    end
  ensure
    # Runs on normal completion and on abort/exit alike.
    res.close unless res.nil?
    aln_out.close unless aln_out.nil?
  end
end
362
+