miga-base 0.7.26.0 → 0.7.26.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +1 -1
  3. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  4. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  5. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  6. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  7. data/utils/FastAAI/README.md +84 -0
  8. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  9. data/utils/enveomics/Docs/recplot2.md +244 -0
  10. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  11. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  12. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  13. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  14. data/utils/enveomics/LICENSE.txt +73 -0
  15. data/utils/enveomics/Makefile +52 -0
  16. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  17. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  18. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  19. data/utils/enveomics/Manifest/Tasks/fasta.json +766 -0
  20. data/utils/enveomics/Manifest/Tasks/fastq.json +243 -0
  21. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  22. data/utils/enveomics/Manifest/Tasks/mapping.json +67 -0
  23. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  24. data/utils/enveomics/Manifest/Tasks/other.json +829 -0
  25. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  26. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +501 -0
  27. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  28. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  29. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  30. data/utils/enveomics/Manifest/categories.json +156 -0
  31. data/utils/enveomics/Manifest/examples.json +154 -0
  32. data/utils/enveomics/Manifest/tasks.json +4 -0
  33. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  34. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  35. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  36. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  37. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  38. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  39. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  40. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  41. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  42. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  43. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  44. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  45. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  46. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  47. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  48. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  49. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  50. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  51. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  52. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  53. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  54. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  55. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  56. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  57. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  58. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  59. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  60. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  61. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  62. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  63. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  64. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  65. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  66. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  67. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  68. data/utils/enveomics/README.md +42 -0
  69. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  70. data/utils/enveomics/Scripts/Aln.cat.rb +163 -0
  71. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  72. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  73. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  74. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  75. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  76. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  77. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  78. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  79. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  80. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  81. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  82. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  83. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  84. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  85. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  86. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  87. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  88. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  89. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  90. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  91. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  92. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  93. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  94. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  95. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  96. data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
  97. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  98. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  99. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  100. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  101. data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
  102. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  103. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  104. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  105. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  106. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  107. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  108. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  109. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  110. data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
  111. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  112. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  113. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  114. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  115. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  116. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  117. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  118. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  119. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  120. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  121. data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
  122. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  123. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  124. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  125. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  126. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  127. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  128. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  129. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  130. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  131. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  132. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  133. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  134. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  135. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  136. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  137. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  138. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  139. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  140. data/utils/enveomics/Scripts/SRA.download.bash +57 -0
  141. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  142. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  143. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  144. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  145. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  146. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  147. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  148. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  149. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  150. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  151. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  152. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  153. data/utils/enveomics/Scripts/aai.rb +418 -0
  154. data/utils/enveomics/Scripts/ani.rb +362 -0
  155. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  156. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  157. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  158. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  159. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  160. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  161. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  162. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  163. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  164. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  165. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  166. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
  167. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  168. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  169. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  170. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  171. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  172. data/utils/enveomics/Scripts/ogs.rb +104 -0
  173. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  174. data/utils/enveomics/Scripts/rbm.rb +146 -0
  175. data/utils/enveomics/Tests/Makefile +10 -0
  176. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  177. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  178. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  179. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  180. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  181. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  182. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  183. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  184. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  185. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  186. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  187. data/utils/enveomics/Tests/alkB.nwk +1 -0
  188. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  189. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  190. data/utils/enveomics/Tests/hiv1.faa +59 -0
  191. data/utils/enveomics/Tests/hiv1.fna +134 -0
  192. data/utils/enveomics/Tests/hiv2.faa +70 -0
  193. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  194. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  195. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  196. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  197. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  198. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  199. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  200. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  201. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  202. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  203. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  204. data/utils/enveomics/build_enveomics_r.bash +45 -0
  205. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  206. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  207. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  208. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  209. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  210. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  211. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  212. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  213. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  214. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  215. data/utils/enveomics/enveomics.R/R/utils.R +50 -0
  216. data/utils/enveomics/enveomics.R/README.md +80 -0
  217. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  218. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  219. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  220. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  221. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  222. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  223. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  224. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  225. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  226. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  227. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  228. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -0
  229. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -0
  230. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -0
  231. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  232. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  233. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -0
  234. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -0
  235. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -0
  236. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -0
  237. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -0
  238. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -0
  239. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  240. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  241. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -0
  242. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  243. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  244. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  245. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  246. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -0
  247. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  248. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  249. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -0
  250. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  251. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  252. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  253. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  254. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -0
  255. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  256. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -0
  257. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +37 -0
  258. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -0
  259. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  260. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  261. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -0
  262. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -0
  263. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  264. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  265. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  266. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  267. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -0
  268. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -0
  269. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -0
  270. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -0
  271. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  272. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  273. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  274. data/utils/enveomics/globals.mk +8 -0
  275. data/utils/enveomics/manifest.json +9 -0
  276. metadata +277 -4
@@ -0,0 +1,244 @@
1
+ # Recruitment plots
2
+
3
+ ## Aims
4
+
5
+ This document aims to cover the technical aspects of the recruitment plot functions in the
6
+ `enveomics.R` package, focusing on the peak finder and gene-content diversity analyses.
7
+
8
+ ## Caveats
9
+
10
+ This is a __*working document*__, describing unstable and/or experimental code. The material
11
+ here is subject to change without warning; pay attention to the modification date and (if
12
+ in doubt) the commit history. The definitions and default parameters of the functions described
13
+ here may change in the near future as a result of further experimentation or more stable
14
+ implementations.
15
+
16
+ The current document was generated and tested with the `enveomics.R` package version 1.3. To
17
+ check your current version in R, use `packageVersion('enveomics.R')`.
18
+
19
+ > **IMPORTANT**: Some of the functions described here may return unexpected results with your data.
20
+ > Carefully evaluate all your results.
21
+
22
+ ---
23
+
24
+ ## Package: `enveomics.R`
25
+
26
+ The functionalities described here are provided by the `enveomics.R` package. Some features
27
+ described here are updated more frequently than the official
28
+ [CRAN releases](https://CRAN.R-project.org/package=enveomics.R). In order to have the latest
29
+ updates (package HEAD), download (or update), and install this git repository.
30
+
31
+ ### Quick installation guide
32
+
33
+ :globe_with_meridians: To install the latest stable version available in CRAN, use in R:
34
+
35
+ ```R
36
+ install.packages(c('enveomics.R','optparse'))
37
+ ```
38
+
39
+ :octocat: To install the latest HEAD version (potentially unstable) available in GitHub, use in R:
40
+
41
+ ```R
42
+ install.packages('devtools')
43
+ library('devtools')
44
+ install_github('lmrodriguezr/enveomics', subdir='enveomics.R')
45
+ ```
46
+
47
+ ---
48
+
49
+ ## Recruitment plots: `enve.recplot2`
50
+
51
+ The first step in this analysis is the mapping of reads to the genome, processed with
52
+ [BlastTab.catsbj.pl](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.catsbj.pl).
53
+ We'll assume the mapping is saved in the file `my-mapping.tab` and this is also the
54
+ prefix of the processed files.
55
+
56
+ Once you have these input files (`.rec` and `.lim`), you can build the recruitment plot.
57
+ For this, you'll have two options.
58
+
59
+ ### Option 1: Using the `BlastTab.recplot2.R` stand-alone script
60
+
61
+ The stand-alone script
62
+ [BlastTab.recplot2.R](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.recplot2.R)
63
+ is the easiest option to run, and should be the preferred method if you're automating
64
+ this analysis to process several mappings, but it doesn't offer access to advanced options.
65
+
66
+ You can run it like this using two CPUs:
67
+
68
+ ```bash
69
+ BlastTab.recplot2.R --prefix my-mapping.tab --threads 2 my-recplot.rdata my-recplot.pdf
70
+ ```
71
+
72
+ > **NOTE 1**: It's NOT recommended to map reads against genes; the recommended strategy is to
73
+ > map against contigs. However, if you did map reads against genes, you may want to use the
74
+ > `--pos-breaks 0` option to use each gene as a recruitment window.
75
+ >
76
+ > **NOTE 2**: If you want to plot the population peaks at this step, simply pass the
77
+ > `--peaks-col darkred` option.
78
+
79
+ Now you should have two output files: `my-recplot.rdata`, containing your `enve.RecPlot2` R
80
+ object, and `my-recplot.pdf` with the graphical output of the recruitment plot.
81
+
82
+ ### Option 2: Using the `enve.recplot2` R function
83
+
84
+ If you require access to advanced options, or for some other reason prefer to calculate the
85
+ recruitment plot interactively, you can directly use the `enve.recplot2` R function. This is
86
+ an example session in R:
87
+
88
+ ```R
89
+ # Load the package
90
+ library(enveomics.R)
91
+ # Open the PDF
92
+ pdf('my-recplot.pdf')
93
+ # Build and plot the object using two threads and no peak detection
94
+ # (to turn on peak detection, simply remove `peaks.col=NA`)
95
+ rp <- enve.recplot2('my-mapping.tab', threads=2, peaks.col=NA)
96
+ # Close the PDF
97
+ dev.off()
98
+ # Save the object
99
+ save(rp, file='my-recplot.rdata')
100
+ ```
101
+
102
+ > **IMPORTANT**: Remember to save the `enve.RecPlot2` R object (that's the last line above)
103
+ > before closing the R session.
104
+
105
+ Naturally, you may want to see what other (advanced) options you have. You can access the
106
+ documentation of the function in R using `?enve.recplot2`.
107
+
108
+ ---
109
+
110
+ ## Summary statistics
111
+
112
+ Here we explore some frequently used summary statistics from recruitment plots. First, load the
113
+ package and the `enve.RecPlot2` object you saved previously, in R:
114
+
115
+ ```R
116
+ library(enveomics.R)
117
+ load('my-recplot.rdata')
118
+ ```
119
+
120
+ ### Centrality measures of sequencing depth
121
+
122
+ ```R
123
+ mean(enve.recplot2.seqdepth(rp)) # <- Average
124
+ median(enve.recplot2.seqdepth(rp)) # <- Median
125
+ enve.truncate(enve.recplot2.seqdepth(rp)) # <- 95% Central Truncated Mean
126
+ enve.truncate(enve.recplot2.seqdepth(rp), 0.9) # <- 90% Central Truncated Mean
127
+ ```
128
+
129
+ The functions above only use hits with identity above the cutoff for "in-group" (by default: 95%).
130
+ In order to estimate the sequencing depth with a different identity cutoff, modify the cutoff first:
131
+
132
+ ```R
133
+ rp98 <- enve.recplot2.changeCutoff(rp, 98) # <- Change to ≥98%
134
+ mean(enve.recplot2.seqdepth(rp98)) # <- Average (for the new object)
135
+ median(enve.recplot2.seqdepth(rp98)) # <- Median (for the new object)
136
+ ```
137
+
138
+ ### Average and median sequencing depth excluding zero-coverage windows
139
+
140
+ ```R
141
+ seqdepth <- enve.recplot2.seqdepth(rp)
142
+ mean(seqdepth[seqdepth>0]) # <- Average
143
+ median(seqdepth[seqdepth>0]) # <- Median
144
+ ```
145
+
146
+ ### Average Nucleotide Identity from reads (ANIr)
147
+
148
+ ```R
149
+ enve.recplot2.ANIr(rp) # <- Complete recruitment plot
150
+ enve.recplot2.ANIr(rp, c(90,100)) # <- All reads above 90% (recommended for intra-population)
151
+ enve.recplot2.ANIr(rp, c(95,100)) # <- Reads above 95%
152
+ enve.recplot2.ANIr(rp, c( 0, 90)) # <- Between populations (other species)
153
+ ```
154
+
155
+ ### Coordinates of each sequence window with their respective sequencing depth
156
+
157
+ ```R
158
+ d <- enve.recplot2.coordinates(rp)
159
+ d$seqdepth <- enve.recplot2.seqdepth(rp)
160
+ d
161
+ ```
162
+
163
+ ### Sequencing breadth (upper boundary)
164
+
165
+ This estimate depends on the window size. The smaller the window size, the better the
166
+ estimate. When the window size is 1bp, the estimate is exact; otherwise it's consistently
167
+ biased (overestimate).
168
+
169
+ ```R
170
+ mean(enve.recplot2.seqdepth(rp) > 0)
171
+ ```
172
+
173
+ ---
174
+
175
+ ## Peak-finder: `enve.recplot2.findPeaks`
176
+
177
+ In this step we will try to identify one or multiple population peaks corresponding to different
178
+ sub-populations and/or composites of sub-populations.
179
+
180
+ > **NOTE**: This step can be performed together with the step above, but we separate it here for
181
+ > two reasons: **(1)** This step is much more unstable but less computationally demanding than the
182
+ > step before, so it makes sense to re-run only this part with different parameters and/or
183
+ > package updates; and **(2)** We want to save the R objects independently, so the following steps
184
+ > are more clear.
185
+
186
+ In R:
187
+
188
+ ```R
189
+ # Load the package
190
+ library(enveomics.R)
191
+ # Load the `enve.RecPlot2` object you saved previously
192
+ load('my-recplot.rdata')
193
+ # Find the peaks
194
+ peaks <- enve.recplot2.findPeaks(rp)
195
+ # Save the peaks R object (optional)
196
+ save(peaks, file='my-recplot-peaks.rdata')
197
+ # Plot the peaks in a PDF (optional)
198
+ pdf('my-recplot-peaks.pdf')
199
+ p <- plot(rp, use.peaks=peaks, layout=4) # <- Remove `layout=4` for the full plot
200
+ dev.off()
201
+ ```
202
+
203
+ The key function here is `enve.recplot2.findPeaks`. This function has several parameters, depending on
204
+ the method used. To see all supported methods, use `?enve.recplot2.findPeaks`. To see all the options
205
+ of the default method (`'emauto'`) use `?enve.recplot2.findPeaks.emauto`.
206
+
207
+ ---
208
+
209
+ ## Gene-content diversity: `enve.recplot2.extractWindows`
210
+
211
+ In R:
212
+
213
+ ```R
214
+ # Load the package and the objects (unless you're still in the same session from the last step)
215
+ library(enveomics.R)
216
+ load('my-recplot.rdata')
217
+ load('my-recplot-peaks.rdata')
218
+ # Find the peak representing the core genome
219
+ cp <- enve.recplot2.corePeak(peaks)
220
+ #-----
221
+ # The following functions illustrate how to obtain different results. Please explore the resulting
222
+ # objects and the associated documentation
223
+ #-----
224
+ # Find the coordinates of windows significantly below the average sequencing depth
225
+ div <- enve.recplot2.extractWindows(rp, cp, seq.names=TRUE)
226
+ # Add sequencing depth
227
+ div$seqdepth <- enve.recplot2.seqdepth(rp, as.numeric(rownames(div)))
228
+ # Save the coordinates as a tab-delimited table
229
+ write.table(div, 'my-low-seqdepth.tsv', quote=FALSE, sep='\t', row.names=FALSE)
230
+ # Find all the windows with sequencing depth zero
231
+ zero <- enve.recplot2.coordinates(rp, enve.recplot2.seqdepth(rp)==0)
232
+ ```
233
+
234
+ ---
235
+
236
+ ## To do
237
+
238
+ - [x] Document structure
239
+ - [x] Package: `enveomics.R`
240
+ - [x] Recruitment plots: `enve.recplot2`
241
+ - [x] Summary statistics
242
+ - [x] Peak-finder: `enve.recplot2.findPeaks`
243
+ - [x] Gene-content diversity: `enve.recplot2.extractWindows`
244
+ - [ ] Compare identity profiles: `enve.recplot2.compareIdentities`
@@ -0,0 +1,66 @@
1
+ #!/bin/bash
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license Artistic-2.0
5
+
6
+ set -e # <- So it stops if there is an error
7
+ function exists { [[ -e "$1" ]] ; } # <- To test *any* of many files
8
+
9
+ OUT=$1 # <- Output file
10
+ [[ -n "$1" ]] && shift
11
+ SEQS=("$@") # <- list of all genomes
12
+ THR=2 # <- Number or threads
13
+ DEF_DIST=0.9 # <- Default distance when AAI cannot be reliably estimated
14
+
15
+ # This is just the help message
16
+ if [[ $# -lt 2 ]] ; then
17
+ echo "
18
+ Use case: Building AAI matrices from a collection of genomes.
19
+
20
+ IMPORTANT
21
+ This script is functional, but it's mainly intended for illustrative purposes.
22
+ Please take a look at the code first.
23
+
24
+ Usage:
25
+ $0 <output.txt> <genomes...>
26
+
27
+ <output.txt> The output AAI list, in tab-delimited form containing the
28
+ following columns: (1) Sequence A, (2) Sequence B, (3)
29
+ AAI, (4) AAI-SD, (5) Proteins used, (6) Number of proteins in
30
+ the smallest genome, (7) Percentage of the genome shared.
31
+ <genomes...> The list of files containing the genomes (at least 2).
32
+
33
+ " >&2
34
+ exit
35
+ fi
36
+
37
+ # 00. Create environment
38
+ export PATH=$(dirname "$0")/../Scripts:$PATH
39
+
40
+ # 01. Calculate AAI
41
+ echo "[01/03] Calculating AAI"
42
+ for i in "${SEQS[@]}" ; do
43
+ for j in "${SEQS[@]}" ; do
44
+ echo -n " o $i vs $j: "
45
+ AAI=$(aai.rb -1 "$i" -2 "$j" -S "$OUT.db" -t "$THR" \
46
+ --no-save-rbm --auto --quiet)
47
+ echo ${AAI:-Below detection}
48
+ [[ "$i" == "$j" ]] && break
49
+ done
50
+ done
51
+
52
+ # 02. Extract matrix
53
+ echo "[02/03] Extracting list"
54
+ echo -e "SeqA\tSeqB\tAAI\tSD\tN\tOmega\tFrx" > "$OUT"
55
+ echo "select seq1, seq2, aai, sd, n, omega, (100.0*n/omega) from aai;" \
56
+ | sqlite3 "$OUT.db" | tr '|' '\t' >> "$OUT"
57
+
58
+ # 03. Make it a distance matrix.
59
+ echo "[03/03] Generating distance matrix"
60
+ echo "
61
+ source('$(dirname $0)/../enveomics.R/R/df2dist.R');
62
+ a <- read.table('$OUT', sep = '\\t', header = TRUE, as.is = TRUE, quote = '');
63
+ aai.d <- enve.df2dist(a, default.d = $DEF_DIST, max.sim = 100);
64
+ write.table(as.matrix(aai.d), '$OUT.dist',
65
+ quote = FALSE, col.names = NA, row.names = TRUE, sep = '\\t')
66
+ " | R --vanilla >/dev/null
@@ -0,0 +1,66 @@
1
+ #!/bin/bash
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license Artistic-2.0
5
+
6
+ set -e # <- So it stops if there is an error
7
+ function exists { [[ -e "$1" ]] ; } # <- To test *any* of many files
8
+
9
+ OUT=$1 # <- Output file
10
+ [[ -n "$1" ]] && shift
11
+ SEQS=("$@") # <- list of all genomes
12
+ THR=2 # <- Number or threads
13
+ DEF_DIST=0.9 # <- Default distance when ANI cannot be reliably estimated
14
+
15
+ # This is just the help message
16
+ if [[ $# -lt 2 ]] ; then
17
+ echo "
18
+ Use case: Building ANI matrices from a collection of genomes.
19
+
20
+ IMPORTANT
21
+ This script is functional, but it's mainly intended for illustrative purposes.
22
+ Please take a look at the code first.
23
+
24
+ Usage:
25
+ $0 <output.txt> <genomes...>
26
+
27
+ <output.txt> The output ANI list, in tab-delimited form containing the
28
+ following columns: (1) Sequence A, (2) Sequence B, (3)
29
+ ANI, (4) ANI-SD, (5) Fragments used, (6) Maximum number
30
+ of fragments, (7) Percentage of the genome shared.
31
+ <genomes...> The list of files containing the genomes (at least 2).
32
+
33
+ " >&2
34
+ exit
35
+ fi
36
+
37
+ # 00. Create environment
38
+ export PATH=$(dirname "$0")/../Scripts:$PATH
39
+
40
+ # 01. Calculate ANI
41
+ echo "[01/03] Calculating ANI"
42
+ for i in "${SEQS[@]}" ; do
43
+ for j in "${SEQS[@]}" ; do
44
+ echo -n " o $i vs $j: "
45
+ ANI=$(ani.rb -1 "$i" -2 "$j" -S "$OUT.db" -t "$THR" \
46
+ --no-save-rbm --no-save-regions --auto --quiet)
47
+ echo ${ANI:-Below detection}
48
+ [[ "$i" == "$j" ]] && break
49
+ done
50
+ done
51
+
52
+ # 02. Extract matrix
53
+ echo "[02/03] Extracting list"
54
+ echo -e "SeqA\tSeqB\tANI\tSD\tN\tOmega\tFrx" > "$OUT"
55
+ echo "select seq1, seq2, ani, sd, n, omega, (100.0*n/omega) from ani;" \
56
+ | sqlite3 "$OUT.db" | tr '|' '\t' >> "$OUT"
57
+
58
+ # 03. Make it a distance matrix.
59
+ echo "[03/03] Generating distance matrix"
60
+ echo "
61
+ source('$(dirname $0)/../enveomics.R/R/df2dist.R');
62
+ a <- read.table('$OUT', sep = '\\t', header = TRUE, as.is = TRUE, quote = '');
63
+ ani.d <- enve.df2dist(a, default.d = $DEF_DIST, max.sim = 100);
64
+ write.table(as.matrix(ani.d), '$OUT.dist',
65
+ quote = FALSE, col.names = NA, row.names = TRUE, sep = '\\t')
66
+ " | R --vanilla >/dev/null
@@ -0,0 +1,105 @@
1
+ #!/bin/bash
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R
5
+ # @update Mar-23-2016
6
+ # @license artistic license 2.0
7
+ #
8
+
9
+ set -e # <- So it stops if there is an error
10
+ function exists { [[ -e "$1" ]] ; } # <- True if a glob matched at least one file (only the first word, $1, is tested)
11
+
12
+ ORG=$1 # <- Organism (see help)
13
+ THR=2 # <- Number of threads
14
+
15
+ # This is just the help message
16
+ if [[ "$ORG" == "" ]] ; then
17
+ echo "
18
+ Use case: Essential genes phylogeny of a species. The essential genes are a
19
+ collection of genes typically found in single copy in archaeal and bacterial
20
+ genomes
21
+
22
+ IMPORTANT
23
+ This script is functional, but it's mainly intended for illustrative purposes.
24
+ Please take a look at the code first.
25
+
26
+ Usage:
27
+ $0 <organism>
28
+
29
+ <organism> The organism to use (e.g., Streptococcus_pneumoniae).
30
+
31
+ " >&2
32
+ exit
33
+ fi
34
+
35
+ # 00. Create environment
36
+ export PATH=$(dirname $0)/../Scripts:$PATH
37
+ if [[ -e $ORG ]] ; then
38
+ echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
39
+ exit 1
40
+ fi
41
+ mkdir $ORG
42
+ for i in 01.proteome 02.essential 03.aln 04.cat 05.raxml 06.autoprune ; do
43
+ mkdir $ORG/$i
44
+ done
45
+
46
+ # 01. Download proteomes
47
+ echo "[01/06] Downloading and guzipping data"
48
+ RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
49
+ rm $ORG/01.proteome/assembly_summary.txt
50
+ for i in $ORG/01.proteome/* ; do
51
+ b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
52
+ if exists $i/*.faa.gz ; then
53
+ for j in $i/*.faa.gz ; do gunzip $j ; done
54
+ cat $i/*.faa > $ORG/01.proteome/$b.faa
55
+ fi
56
+ rm -R $i
57
+ done
58
+
59
+ # 02. Essential genes
60
+ echo "[02/06] Idenfifying essential genes"
61
+ N=0
62
+ for i in $ORG/01.proteome/*.faa ; do # <- This loop could be parallelized
63
+ genomeA=$(basename $i .faa)
64
+ dir=$ORG/02.essential/$genomeA
65
+ mkdir $dir
66
+ HMM.essential.rb -i $i -m $dir/ -R $dir/log.txt -r $genomeA -t $THR
67
+ let N=$N+1
68
+ done
69
+
70
+ # 03. Find core and align groups
71
+ echo "[03/06] Identifying core essentials and aligning groups"
72
+ CORE_ESS=$(basename -s .faa $ORG/02.essential/*/*.faa | sort | uniq -c \
73
+ | awk '$1=='$N'{print $2}')
74
+ for b in $CORE_ESS ; do # <- This loop could be parallelized
75
+ cat $ORG/02.essential/*/$b.faa > $ORG/03.aln/$b.faa
76
+ clustalo -i $ORG/03.aln/$b.faa -o $ORG/03.aln/$b.aln #--threads=$THR
77
+ done
78
+
79
+ # 04. Concatenate alignment
80
+ echo "[04/06] Concatenating alignments and removing invariable sites"
81
+ Aln.cat.rb -I -c $ORG/04.cat/essential.raxcoords -i '|' $ORG/03.aln/*.aln \
82
+ > $ORG/04.cat/essential.aln 2> $ORG/04.cat/essential.log
83
+
84
+ # 05. Run RAxML
85
+ echo "[05/06] Inferring phylogeny"
86
+ # You REALLY should consider running the following with more threads (-T) and,
87
+ # if possible, multi-nodes using MPI
88
+ cd $ORG/05.raxml
89
+ raxmlHPC-PTHREADS -T $THR -p 1234 \
90
+ -s ../04.cat/essential.aln -q ../04.cat/essential.raxcoords \
91
+ -m PROTCATGTR -n UNUS # IMPORTANT: Please read the documentation of RAxML
92
+ # before running this line, so you know
93
+ # that you're running what you really want. Check
94
+ # options for bootstrapping and the different
95
+ # algorithms (-f). Note that -m is required, but the
96
+ # file essential.raxcoords specifies "AUTO", so RAxML will
97
+ # attempt to find the model resulting in the highest
98
+ # likelihood.
99
+ cd ../..
100
+
101
+ # 06. Autoprune
102
+ echo "[06/06] Auto-pruning the tree"
103
+ Newick.autoprune.R --t $ORG/05.raxml/RAxML_bestTree.UNUS --min_dist 0.001 \
104
+ $ORG/06.autoprune/essential-pruned.nwk
105
+