miga-base 1.2.15.2 → 1.2.15.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/download/gtdb.rb +4 -1
  3. data/lib/miga/cli/action/gtdb_get.rb +4 -0
  4. data/lib/miga/daemon.rb +4 -1
  5. data/lib/miga/lair.rb +6 -4
  6. data/lib/miga/remote_dataset/download.rb +3 -2
  7. data/lib/miga/remote_dataset.rb +25 -7
  8. data/lib/miga/taxonomy.rb +6 -0
  9. data/lib/miga/version.rb +2 -2
  10. metadata +6 -302
  11. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +0 -41964
  12. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +0 -32439
  13. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -62056
  14. data/utils/FastAAI/FastAAI +0 -3659
  15. data/utils/FastAAI/FastAAI-legacy/FastAAI +0 -1336
  16. data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +0 -1296
  17. data/utils/FastAAI/README.md +0 -84
  18. data/utils/enveomics/Docs/recplot2.md +0 -244
  19. data/utils/enveomics/Examples/aai-matrix.bash +0 -66
  20. data/utils/enveomics/Examples/ani-matrix.bash +0 -66
  21. data/utils/enveomics/Examples/essential-phylogeny.bash +0 -105
  22. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +0 -100
  23. data/utils/enveomics/LICENSE.txt +0 -73
  24. data/utils/enveomics/Makefile +0 -52
  25. data/utils/enveomics/Manifest/Tasks/aasubs.json +0 -103
  26. data/utils/enveomics/Manifest/Tasks/blasttab.json +0 -790
  27. data/utils/enveomics/Manifest/Tasks/distances.json +0 -161
  28. data/utils/enveomics/Manifest/Tasks/fasta.json +0 -802
  29. data/utils/enveomics/Manifest/Tasks/fastq.json +0 -291
  30. data/utils/enveomics/Manifest/Tasks/graphics.json +0 -126
  31. data/utils/enveomics/Manifest/Tasks/mapping.json +0 -137
  32. data/utils/enveomics/Manifest/Tasks/ogs.json +0 -382
  33. data/utils/enveomics/Manifest/Tasks/other.json +0 -906
  34. data/utils/enveomics/Manifest/Tasks/remote.json +0 -355
  35. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +0 -650
  36. data/utils/enveomics/Manifest/Tasks/tables.json +0 -308
  37. data/utils/enveomics/Manifest/Tasks/trees.json +0 -68
  38. data/utils/enveomics/Manifest/Tasks/variants.json +0 -111
  39. data/utils/enveomics/Manifest/categories.json +0 -165
  40. data/utils/enveomics/Manifest/examples.json +0 -162
  41. data/utils/enveomics/Manifest/tasks.json +0 -4
  42. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
  43. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
  44. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
  45. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
  46. data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
  47. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
  48. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
  49. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
  50. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
  51. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
  52. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
  53. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
  54. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
  55. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
  56. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
  57. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
  58. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
  59. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
  60. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
  61. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
  62. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
  63. data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
  64. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
  65. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
  66. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
  67. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
  68. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
  69. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
  70. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
  71. data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
  72. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
  73. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
  74. data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
  75. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
  76. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
  77. data/utils/enveomics/README.md +0 -42
  78. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +0 -171
  79. data/utils/enveomics/Scripts/Aln.cat.rb +0 -221
  80. data/utils/enveomics/Scripts/Aln.convert.pl +0 -35
  81. data/utils/enveomics/Scripts/AlphaDiversity.pl +0 -152
  82. data/utils/enveomics/Scripts/BedGraph.tad.rb +0 -93
  83. data/utils/enveomics/Scripts/BedGraph.window.rb +0 -71
  84. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +0 -102
  85. data/utils/enveomics/Scripts/BlastTab.addlen.rb +0 -63
  86. data/utils/enveomics/Scripts/BlastTab.advance.bash +0 -48
  87. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +0 -55
  88. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +0 -104
  89. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +0 -76
  90. data/utils/enveomics/Scripts/BlastTab.filter.pl +0 -47
  91. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +0 -194
  92. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +0 -104
  93. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +0 -157
  94. data/utils/enveomics/Scripts/BlastTab.recplot2.R +0 -48
  95. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +0 -86
  96. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +0 -119
  97. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +0 -86
  98. data/utils/enveomics/Scripts/BlastTab.subsample.pl +0 -47
  99. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +0 -114
  100. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +0 -90
  101. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +0 -123
  102. data/utils/enveomics/Scripts/Chao1.pl +0 -97
  103. data/utils/enveomics/Scripts/CharTable.classify.rb +0 -234
  104. data/utils/enveomics/Scripts/EBIseq2tax.rb +0 -83
  105. data/utils/enveomics/Scripts/FastA.N50.pl +0 -60
  106. data/utils/enveomics/Scripts/FastA.extract.rb +0 -152
  107. data/utils/enveomics/Scripts/FastA.filter.pl +0 -52
  108. data/utils/enveomics/Scripts/FastA.filterLen.pl +0 -28
  109. data/utils/enveomics/Scripts/FastA.filterN.pl +0 -60
  110. data/utils/enveomics/Scripts/FastA.fragment.rb +0 -100
  111. data/utils/enveomics/Scripts/FastA.gc.pl +0 -42
  112. data/utils/enveomics/Scripts/FastA.interpose.pl +0 -93
  113. data/utils/enveomics/Scripts/FastA.length.pl +0 -38
  114. data/utils/enveomics/Scripts/FastA.mask.rb +0 -89
  115. data/utils/enveomics/Scripts/FastA.per_file.pl +0 -36
  116. data/utils/enveomics/Scripts/FastA.qlen.pl +0 -57
  117. data/utils/enveomics/Scripts/FastA.rename.pl +0 -65
  118. data/utils/enveomics/Scripts/FastA.revcom.pl +0 -23
  119. data/utils/enveomics/Scripts/FastA.sample.rb +0 -98
  120. data/utils/enveomics/Scripts/FastA.slider.pl +0 -85
  121. data/utils/enveomics/Scripts/FastA.split.pl +0 -55
  122. data/utils/enveomics/Scripts/FastA.split.rb +0 -79
  123. data/utils/enveomics/Scripts/FastA.subsample.pl +0 -131
  124. data/utils/enveomics/Scripts/FastA.tag.rb +0 -65
  125. data/utils/enveomics/Scripts/FastA.toFastQ.rb +0 -69
  126. data/utils/enveomics/Scripts/FastA.wrap.rb +0 -48
  127. data/utils/enveomics/Scripts/FastQ.filter.pl +0 -54
  128. data/utils/enveomics/Scripts/FastQ.interpose.pl +0 -90
  129. data/utils/enveomics/Scripts/FastQ.maskQual.rb +0 -89
  130. data/utils/enveomics/Scripts/FastQ.offset.pl +0 -90
  131. data/utils/enveomics/Scripts/FastQ.split.pl +0 -53
  132. data/utils/enveomics/Scripts/FastQ.tag.rb +0 -70
  133. data/utils/enveomics/Scripts/FastQ.test-error.rb +0 -81
  134. data/utils/enveomics/Scripts/FastQ.toFastA.awk +0 -24
  135. data/utils/enveomics/Scripts/GFF.catsbj.pl +0 -127
  136. data/utils/enveomics/Scripts/GenBank.add_fields.rb +0 -84
  137. data/utils/enveomics/Scripts/HMM.essential.rb +0 -351
  138. data/utils/enveomics/Scripts/HMM.haai.rb +0 -168
  139. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +0 -83
  140. data/utils/enveomics/Scripts/JPlace.distances.rb +0 -88
  141. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +0 -320
  142. data/utils/enveomics/Scripts/M5nr.getSequences.rb +0 -81
  143. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +0 -198
  144. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +0 -35
  145. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +0 -49
  146. data/utils/enveomics/Scripts/NCBIacc2tax.rb +0 -92
  147. data/utils/enveomics/Scripts/Newick.autoprune.R +0 -27
  148. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +0 -228
  149. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +0 -32
  150. data/utils/enveomics/Scripts/RefSeq.download.bash +0 -48
  151. data/utils/enveomics/Scripts/SRA.download.bash +0 -55
  152. data/utils/enveomics/Scripts/TRIBS.plot-test.R +0 -36
  153. data/utils/enveomics/Scripts/TRIBS.test.R +0 -39
  154. data/utils/enveomics/Scripts/Table.barplot.R +0 -31
  155. data/utils/enveomics/Scripts/Table.df2dist.R +0 -30
  156. data/utils/enveomics/Scripts/Table.filter.pl +0 -61
  157. data/utils/enveomics/Scripts/Table.merge.pl +0 -77
  158. data/utils/enveomics/Scripts/Table.prefScore.R +0 -60
  159. data/utils/enveomics/Scripts/Table.replace.rb +0 -69
  160. data/utils/enveomics/Scripts/Table.round.rb +0 -63
  161. data/utils/enveomics/Scripts/Table.split.pl +0 -57
  162. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +0 -227
  163. data/utils/enveomics/Scripts/VCF.KaKs.rb +0 -147
  164. data/utils/enveomics/Scripts/VCF.SNPs.rb +0 -88
  165. data/utils/enveomics/Scripts/aai.rb +0 -421
  166. data/utils/enveomics/Scripts/ani.rb +0 -362
  167. data/utils/enveomics/Scripts/anir.rb +0 -137
  168. data/utils/enveomics/Scripts/clust.rand.rb +0 -102
  169. data/utils/enveomics/Scripts/gi2tax.rb +0 -103
  170. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +0 -96
  171. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  172. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  173. data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
  174. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +0 -293
  175. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +0 -175
  176. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +0 -24
  177. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +0 -17
  178. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +0 -30
  179. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +0 -253
  180. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +0 -88
  181. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +0 -182
  182. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +0 -49
  183. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +0 -74
  184. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +0 -237
  185. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +0 -31
  186. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +0 -152
  187. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +0 -3
  188. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +0 -74
  189. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +0 -135
  190. data/utils/enveomics/Scripts/ogs.annotate.rb +0 -88
  191. data/utils/enveomics/Scripts/ogs.core-pan.rb +0 -160
  192. data/utils/enveomics/Scripts/ogs.extract.rb +0 -125
  193. data/utils/enveomics/Scripts/ogs.mcl.rb +0 -186
  194. data/utils/enveomics/Scripts/ogs.rb +0 -104
  195. data/utils/enveomics/Scripts/ogs.stats.rb +0 -131
  196. data/utils/enveomics/Scripts/rbm-legacy.rb +0 -172
  197. data/utils/enveomics/Scripts/rbm.rb +0 -108
  198. data/utils/enveomics/Scripts/sam.filter.rb +0 -148
  199. data/utils/enveomics/Tests/Makefile +0 -10
  200. data/utils/enveomics/Tests/Mgen_M2288.faa +0 -3189
  201. data/utils/enveomics/Tests/Mgen_M2288.fna +0 -8282
  202. data/utils/enveomics/Tests/Mgen_M2321.fna +0 -8288
  203. data/utils/enveomics/Tests/Nequ_Kin4M.faa +0 -2970
  204. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  205. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +0 -7
  206. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +0 -17
  207. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +0 -137
  208. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +0 -123
  209. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +0 -200
  210. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +0 -55
  211. data/utils/enveomics/Tests/alkB.nwk +0 -1
  212. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +0 -13
  213. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +0 -17
  214. data/utils/enveomics/Tests/hiv1.faa +0 -59
  215. data/utils/enveomics/Tests/hiv1.fna +0 -134
  216. data/utils/enveomics/Tests/hiv2.faa +0 -70
  217. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +0 -233
  218. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +0 -1
  219. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +0 -233
  220. data/utils/enveomics/Tests/phyla_counts.tsv +0 -10
  221. data/utils/enveomics/Tests/primate_lentivirus.ogs +0 -11
  222. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +0 -9
  223. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +0 -8
  224. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +0 -6
  225. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +0 -9
  226. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +0 -6
  227. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +0 -6
  228. data/utils/enveomics/build_enveomics_r.bash +0 -45
  229. data/utils/enveomics/enveomics.R/DESCRIPTION +0 -31
  230. data/utils/enveomics/enveomics.R/NAMESPACE +0 -39
  231. data/utils/enveomics/enveomics.R/R/autoprune.R +0 -155
  232. data/utils/enveomics/enveomics.R/R/barplot.R +0 -184
  233. data/utils/enveomics/enveomics.R/R/cliopts.R +0 -135
  234. data/utils/enveomics/enveomics.R/R/df2dist.R +0 -154
  235. data/utils/enveomics/enveomics.R/R/growthcurve.R +0 -331
  236. data/utils/enveomics/enveomics.R/R/prefscore.R +0 -79
  237. data/utils/enveomics/enveomics.R/R/recplot.R +0 -354
  238. data/utils/enveomics/enveomics.R/R/recplot2.R +0 -1631
  239. data/utils/enveomics/enveomics.R/R/tribs.R +0 -583
  240. data/utils/enveomics/enveomics.R/R/utils.R +0 -80
  241. data/utils/enveomics/enveomics.R/README.md +0 -81
  242. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  243. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  244. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -16
  245. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -16
  246. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -16
  247. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +0 -25
  248. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +0 -46
  249. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +0 -23
  250. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +0 -47
  251. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +0 -23
  252. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +0 -23
  253. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +0 -40
  254. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +0 -103
  255. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +0 -67
  256. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +0 -24
  257. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +0 -19
  258. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +0 -45
  259. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +0 -44
  260. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +0 -47
  261. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +0 -75
  262. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +0 -50
  263. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +0 -44
  264. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +0 -139
  265. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +0 -45
  266. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +0 -24
  267. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +0 -77
  268. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +0 -25
  269. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +0 -21
  270. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +0 -19
  271. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +0 -19
  272. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +0 -47
  273. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +0 -29
  274. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +0 -18
  275. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +0 -45
  276. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -36
  277. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +0 -19
  278. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +0 -19
  279. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +0 -27
  280. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +0 -52
  281. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +0 -17
  282. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +0 -51
  283. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +0 -43
  284. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +0 -82
  285. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +0 -59
  286. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +0 -27
  287. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +0 -36
  288. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +0 -23
  289. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +0 -68
  290. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +0 -28
  291. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +0 -27
  292. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +0 -14
  293. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +0 -13
  294. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +0 -78
  295. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +0 -46
  296. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +0 -45
  297. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +0 -125
  298. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +0 -19
  299. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +0 -19
  300. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +0 -19
  301. data/utils/enveomics/globals.mk +0 -8
  302. data/utils/enveomics/manifest.json +0 -9
  303. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  304. data/utils/multitrim/README.md +0 -67
  305. data/utils/multitrim/multitrim.py +0 -1555
  306. data/utils/multitrim/multitrim.yml +0 -13
@@ -1,84 +0,0 @@
1
- # FastAAI
2
- Fast estimation of Average Amino Acid Identities (AAI) for bacterial and viral genomes.
3
- Includes a module for the classification of viral genomes.
4
-
5
- ## Content Table
6
- * [Features](#features)
7
- * [Citation](#citation)
8
- * [Requirements](#requirements)
9
- * [Installation](#installation)
10
- * [Usage](#usage)
11
- * [FAQs](#faqs)
12
- * [License](#license)
13
-
14
- ## Features
15
- Coming soon
16
-
17
- ## Citation
18
- Coming soon
19
-
20
- ## Requirements:
21
- - Programs:
22
- - [HMMER](http://hmmer.org/) >= 3.1
23
- - Python >=3.6,<3.9
24
- - Base Python Modules:
25
- - argparse
26
- - datetime
27
- - pathlib
28
- - shutil
29
- - subprocess
30
- - gzip
31
- - multiprocessing
32
- - textwrap
33
- - pickle
34
- - tempfile
35
- - sys
36
- - functools
37
- - Additional Python Modules:
38
- - numpy
39
-
40
- ## Installation
41
- ### Conda Installation
42
- It appears we need a bunch of pre-requisites to run FastAAI. No worries, their installation using Conda is quite easy. If you don't have Conda, you can install it as follows:
43
- 1. Download Anaconda from https://www.anaconda.com/products/individual.
44
- 2. Run `bash Anaconda-latest-Linux-x86_64.sh` and follow the installation instructions.
45
- 3. Once installed you can run `conda -V`. You should get the version of conda that you installed.
46
-
47
- Now, let's add the conda channels required to install the pre-requisites:
48
-
49
- ```bash
50
- conda config --add channels conda-forge
51
- conda config --add channels bioconda
52
- conda config --add channels cruizperez
53
- ```
54
-
55
- Then, create an environment for FastAAI:
56
-
57
- ```bash
58
- conda create -n fastaai hmmer prodigal numpy python=3.7 fastaai
59
- ```
60
-
61
- And activate it:
62
-
63
- ```bash
64
- conda activate fastaai
65
- ```
66
-
67
- Both main scripts (microbeannotator and microbeannotator_db_builder) should be in your path ready for use!
68
- This should take care of most of the requirements except for Aspera Connect and KofamScan, which are a little more involved. Let's install those.
69
-
70
- ### Pip Installation
71
- #Once you have installed the pre-requisites to run MicrobeAnnotator, or if you already had them and you are not using Conda, you can install MicrobeAnnotator using pip:
72
-
73
-
74
- ## Usage
75
- ### Database creation
76
-
77
-
78
- ## FAQs
79
-
80
-
81
-
82
- ## License
83
-
84
- See LICENSE
@@ -1,244 +0,0 @@
1
- # Recruitment plots
2
-
3
- ## Aims
4
-
5
- This document aims to cover the technical aspects of the recruitment plot functions in the
6
- `enveomics.R` package, focusing on the peak finder and gene-content diversity analyses.
7
-
8
- ## Caveats
9
-
10
- This is a __*working document*__, describing unstable and/or experimental code. The material
11
- here is subject to change without warning; pay attention to the modification date and (if
12
- in doubt) the commit history. The definitions and default parameters of the functions described
13
- here may change in the near future as a result of further experimentation or more stable
14
- implementations.
15
-
16
- The current document was generated and tested with the `enveomics.R` package version 1.3. To
17
- check your current version in R, use `packageVersion('enveomics.R')`.
18
-
19
- > **IMPORTANT**: Some of the functions described here may return unexpected results with your data.
20
- > Carefully evaluate all your results.
21
-
22
- ---
23
-
24
- ## Package: `enveomics.R`
25
-
26
- The functionalities described here are provided by the `enveomics.R` package. Some features
27
- described here are updated more frequently than the official
28
- [CRAN releases](https://CRAN.R-project.org/package=enveomics.R). In order to have the latest
29
- updates (package HEAD), download (or update), and install this git repository.
30
-
31
- ### Quick installation guide
32
-
33
- :globe_with_meridians: To install the latest stable version available in CRAN, use in R:
34
-
35
- ```R
36
- install.packages(c('enveomics.R','optparse'))
37
- ```
38
-
39
- :octocat: To install the latest HEAD version (potentially unstable) available in GitHub, use in R:
40
-
41
- ```R
42
- install.packages('devtools')
43
- library('devtools')
44
- install_github('lmrodriguezr/enveomics', subdir='enveomics.R')
45
- ```
46
-
47
- ---
48
-
49
- ## Recruitment plots: `enve.recplot2`
50
-
51
- The first step in this analysis is the mapping of reads to the genome, processed with
52
- [BlastTab.catsbj.pl](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.catsbj.pl).
53
- We'll assume the mapping is saved in the file `my-mapping.tab` and this is also the
54
- prefix of the processed files.
55
-
56
- Once you have these input files (`.rec` and `.lim`), you can build the recruitment plot.
57
- For this, you'll have two options.
58
-
59
- ### Option 1: Using the `BlastTab.recplot2.R` stand-alone script
60
-
61
- The stand-alone script
62
- [BlastTab.recplot2.R](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.recplot2.R)
63
- is the easiest option to run, and should be the preferred method if you're automating
64
- this analysis to process several mappings, but it doesn't offer access to advanced options.
65
-
66
- You can run it like this using two CPUs:
67
-
68
- ```bash
69
- BlastTab.recplot2.R --prefix my-mapping.tab --threads 2 my-recplot.rdata my-recplot.pdf
70
- ```
71
-
72
- > **NOTE 1**: It's NOT recommended to map reads against genes, the recommended strategy is to
73
- > map against contigs. However, if you did map reads against genes, you may want to use the
74
- > `--pos-breaks 0` option to use each gene as a recruitment window.
75
- >
76
- > **NOTE 2**: If you want to plot the population peaks at this step, simply pass the
77
- > `--peaks-col darkred` option.
78
-
79
- Now you should have two output files: `my-recplot.rdata`, containing your `enve.RecPlot2` R
80
- object, and `my-recplot.pdf` with the graphical output of the recruitment plot.
81
-
82
- ### Option 2: Using the `enve.recplot2` R function
83
-
84
- If you require access to advanced options, or for some other reason prefer to calculate the
85
- recruitment plot interactively, you can directly use the `enve.recplot2` R function. This is
86
- an example session in R:
87
-
88
- ```R
89
- # Load the package
90
- library(enveomics.R)
91
- # Open the PDF
92
- pdf('my-recplot.pdf')
93
- # Build and plot the object using two threads and no peak detection
94
- # (to turn on peak detection, simply remove `peaks.col=NA`)
95
- rp <- enve.recplot2('my-mapping.tab', threads=2, peaks.col=NA)
96
- # Close the PDF
97
- dev.off()
98
- # Save the object
99
- save(rp, file='my-recplot.rdata')
100
- ```
101
-
102
- > **IMPORTANT**: Remember to save the `enve.RecPlot2` R object (that's the last line above)
103
- > before closing the R session.
104
-
105
- Naturally, you may want to see what other (advanced) options you have. You can access the
106
- documentation of the function in R using `?enve.recplot2`.
107
-
108
- ---
109
-
110
- ## Summary statistics
111
-
112
- Here we explore some frequently used summary statistics from recruitment plots. First, load the
113
- package and the `enve.RecPlot2` object you saved previously, in R:
114
-
115
- ```R
116
- library(enveomics.R)
117
- load('my-recplot.rdata')
118
- ```
119
-
120
- ### Centrality measures of sequencing depth
121
-
122
- ```R
123
- mean(enve.recplot2.seqdepth(rp)) # <- Average
124
- median(enve.recplot2.seqdepth(rp)) # <- Median
125
- enve.truncate(enve.recplot2.seqdepth(rp)) # <- 95% Central Truncated Mean
126
- enve.truncate(enve.recplot2.seqdepth(rp), 0.9) # <- 90% Central Truncated Mean
127
- ```
128
-
129
- The functions above only use hits with identity above the cutoff for "in-group" (by default: 95%).
130
- In order to estimate the sequencing depth with a different identity cutoff, modify the cutoff first:
131
-
132
- ```R
133
- rp98 <- enve.recplot2.changeCutoff(rp, 98) # <- Change to ≥98%
134
- mean(enve.recplot2.seqdepth(rp98)) # <- Average (for the new object)
135
- median(enve.recplot2.seqdepth(rp98)) # <- Median (for the new object)
136
- ```
137
-
138
- ### Average and median sequencing depth excluding zero-coverage windows
139
-
140
- ```R
141
- seqdepth <- enve.recplot2.seqdepth(rp)
142
- mean(seqdepth[seqdepth>0]) # <- Average
143
- median(seqdepth[seqdepth>0]) # <- Median
144
- ```
145
-
146
- ### Average Nucleotide Identity from reads (ANIr)
147
-
148
- ```R
149
- enve.recplot2.ANIr(rp) # <- Complete recruitment plot
150
- enve.recplot2.ANIr(rp, c(90,100)) # <- All reads above 90% (recommended for intra-population)
151
- enve.recplot2.ANIr(rp, c(95,100)) # <- Reads above 95%
152
- enve.recplot2.ANIr(rp, c( 0, 90)) # <- Between populations (other species)
153
- ```
154
-
155
- ### Coordinates of each sequence window with their respective sequencing depth
156
-
157
- ```R
158
- d <- enve.recplot2.coordinates(rp)
159
- d$seqdepth <- enve.recplot2.seqdepth(rp)
160
- d
161
- ```
162
-
163
- ### Sequencing breadth (upper boundary)
164
-
165
- This estimate depends on the window size. The smaller the window size, the better the
166
- estimate. When the window size is 1bp, the estimate is exact, otherwise it's consistently
167
- biased (overestimate).
168
-
169
- ```R
170
- mean(enve.recplot2.seqdepth(rp) > 0)
171
- ```
172
-
173
- ---
174
-
175
- ## Peak-finder: `enve.recplot2.findPeaks`
176
-
177
- In this step we will try to identify one or multiple population peaks corresponding to different
178
- sub-populations and/or composites of sub-populations.
179
-
180
- > **NOTE** This step can be performed together with the step above, but we separate it here for
181
- > two reasons: **(1)** This step is much more unstable but less computationally demanding than the
182
- > step before, so it makes sense to re-run only this part with different parameters and/or
183
- > package updates; and **(2)** We want to save the R objects independently, so the following steps
184
- > are more clear.
185
-
186
- In R:
187
-
188
- ```R
189
- # Load the package
190
- library(enveomics.R)
191
- # Load the `enve.RecPlot2` object you saved previously
192
- load('my-recplot.rdata')
193
- # Find the peaks
194
- peaks <- enve.recplot2.findPeaks(rp)
195
- # Save the peaks R object (optional)
196
- save(peaks, file='my-recplot-peaks.rdata')
197
- # Plot the peaks in a PDF (optional)
198
- pdf('my-recplot-peaks.pdf')
199
- p <- plot(rp, use.peaks=peaks, layout=4) # <- Remove `layout=4` for the full plot
200
- dev.off()
201
- ```
202
-
203
- The key function here is `enve.recplot2.findPeaks`. This function has several parameters, depending on
204
- the method used. To see all supported methods, use `?enve.recplot2.findPeaks`. To see all the options
205
- of the default method (`'emauto'`) use `?enve.recplot2.findPeaks.emauto`.
206
-
207
- ---
208
-
209
- ## Gene-content diversity: `enve.recplot2.extractWindows`
210
-
211
- In R:
212
-
213
- ```R
214
- # Load the package and the objects (unless you're still in the same session from the last step)
215
- library(enveomics.R)
216
- load('my-recplot.rdata')
217
- load('my-recplot-peaks.rdata')
218
- # Find the peak representing the core genome
219
- cp <- enve.recplot2.corePeak(peaks)
220
- #-----
221
- # The following functions illustrate how to obtain different results. Please explore the resulting
222
- # objects and the associated documentation
223
- #-----
224
- # Find the coordinates of windows significantly below the average sequencing depth
225
- div <- enve.recplot2.extractWindows(rp, cp, seq.names=TRUE)
226
- # Add sequencing depth
227
- div$seqdepth <- enve.recplot2.seqdepth(rp, as.numeric(rownames(div)))
228
- # Save the coordinates as a tab-delimited table
229
- write.table(div, 'my-low-seqdepth.tsv', quote=FALSE, sep='\t', row.names=FALSE)
230
- # Find all the windows with sequencing depth zero
231
- zero <- enve.recplot2.coordinates(rp, enve.recplot2.seqdepth(rp)==0)
232
- ```
233
-
234
- ---
235
-
236
- ## To do
237
-
238
- - [x] Document structure
239
- - [x] Package: `enveomics.R`
240
- - [x] Recruitment plots: `enve.recplot2`
241
- - [x] Summary statistics
242
- - [x] Peak-finder: `enve.recplot2.findPeaks`
243
- - [x] Gene-content diversity: `enve.recplot2.extractWindows`
244
- - [ ] Compare identity profiles: `enve.recplot2.compareIdentities`
@@ -1,66 +0,0 @@
1
- #!/bin/bash
2
-
3
- # @author Luis M. Rodriguez-R
4
- # @license Artistic-2.0
5
-
6
- set -e # <- So it stops if there is an error
7
- function exists { [[ -e "$1" ]] ; } # <- To test *any* of many files
8
-
9
- OUT=$1 # <- Output file
10
- [[ -n "$1" ]] && shift
11
- SEQS=("$@") # <- list of all genomes
12
- THR=2 # <- Number of threads
13
- DEF_DIST=0.9 # <- Default distance when AAI cannot be reliably estimated
14
-
15
- # This is just the help message
16
- if [[ $# -lt 2 ]] ; then
17
- echo "
18
- Use case: Building AAI matrices from a collection of genomes.
19
-
20
- IMPORTANT
21
- This script is functional, but it's mainly intended for illustrative purposes.
22
- Please take a look at the code first.
23
-
24
- Usage:
25
- $0 <output.txt> <genomes...>
26
-
27
- <output.txt> The output AAI list, in tab-delimited form containing the
28
- following columns: (1) Sequence A, (2) Sequence B, (3)
29
- AAI, (4) AAI-SD, (5) Proteins used, (6) Number of proteins in
30
- the smallest genome, (7) Percentage of the genome shared.
31
- <genomes...> The list of files containing the genomes (at least 2).
32
-
33
- " >&2
34
- exit
35
- fi
36
-
37
- # 00. Create environment
38
- export PATH=$(dirname "$0")/../Scripts:$PATH
39
-
40
- # 01. Calculate AAI
41
- echo "[01/03] Calculating AAI"
42
- for i in "${SEQS[@]}" ; do
43
- for j in "${SEQS[@]}" ; do
44
- echo -n " o $i vs $j: "
45
- AAI=$(aai.rb -1 "$i" -2 "$j" -S "$OUT.db" -t "$THR" \
46
- --no-save-rbm --auto --quiet)
47
- echo ${AAI:-Below detection}
48
- [[ "$i" == "$j" ]] && break
49
- done
50
- done
51
-
52
- # 02. Extract matrix
53
- echo "[02/03] Extracting list"
54
- echo -e "SeqA\tSeqB\tAAI\tSD\tN\tOmega\tFrx" > "$OUT"
55
- echo "select seq1, seq2, aai, sd, n, omega, (100.0*n/omega) from aai;" \
56
- | sqlite3 "$OUT.db" | tr '|' '\t' >> "$OUT"
57
-
58
- # 03. Make it a distance matrix.
59
- echo "[03/03] Generating distance matrix"
60
- echo "
61
- source('$(dirname $0)/../enveomics.R/R/df2dist.R');
62
- a <- read.table('$OUT', sep = '\\t', header = TRUE, as.is = TRUE, quote = '');
63
- aai.d <- enve.df2dist(a, default.d = $DEF_DIST, max.sim = 100);
64
- write.table(as.matrix(aai.d), '$OUT.dist',
65
- quote = FALSE, col.names = NA, row.names = TRUE, sep = '\\t')
66
- " | R --vanilla >/dev/null
@@ -1,66 +0,0 @@
1
- #!/bin/bash
2
-
3
- # @author Luis M. Rodriguez-R
4
- # @license Artistic-2.0
5
-
6
- set -e # <- So it stops if there is an error
7
- function exists { [[ -e "$1" ]] ; } # <- To test *any* of many files
8
-
9
- OUT=$1 # <- Output file
10
- [[ -n "$1" ]] && shift
11
- SEQS=("$@") # <- list of all genomes
12
- THR=2 # <- Number of threads
13
- DEF_DIST=0.9 # <- Default distance when ANI cannot be reliably estimated
14
-
15
- # This is just the help message
16
- if [[ $# -lt 2 ]] ; then
17
- echo "
18
- Use case: Building ANI matrices from a collection of genomes.
19
-
20
- IMPORTANT
21
- This script is functional, but it's mainly intended for illustrative purposes.
22
- Please take a look at the code first.
23
-
24
- Usage:
25
- $0 <output.txt> <genomes...>
26
-
27
- <output.txt> The output ANI list, in tab-delimited form containing the
28
- following columns: (1) Sequence A, (2) Sequence B, (3)
29
- ANI, (4) ANI-SD, (5) Fragments used, (6) Maximum number
30
- of fragments, (7) Percentage of the genome shared.
31
- <genomes...> The list of files containing the genomes (at least 2).
32
-
33
- " >&2
34
- exit
35
- fi
36
-
37
- # 00. Create environment
38
- export PATH=$(dirname "$0")/../Scripts:$PATH
39
-
40
- # 01. Calculate ANI
41
- echo "[01/03] Calculating ANI"
42
- for i in "${SEQS[@]}" ; do
43
- for j in "${SEQS[@]}" ; do
44
- echo -n " o $i vs $j: "
45
- ANI=$(ani.rb -1 "$i" -2 "$j" -S "$OUT.db" -t "$THR" \
46
- --no-save-rbm --no-save-regions --auto --quiet)
47
- echo ${ANI:-Below detection}
48
- [[ "$i" == "$j" ]] && break
49
- done
50
- done
51
-
52
- # 02. Extract matrix
53
- echo "[02/03] Extracting list"
54
- echo -e "SeqA\tSeqB\tANI\tSD\tN\tOmega\tFrx" > "$OUT"
55
- echo "select seq1, seq2, ani, sd, n, omega, (100.0*n/omega) from ani;" \
56
- | sqlite3 "$OUT.db" | tr '|' '\t' >> "$OUT"
57
-
58
- # 03. Make it a distance matrix.
59
- echo "[03/03] Generating distance matrix"
60
- echo "
61
- source('$(dirname $0)/../enveomics.R/R/df2dist.R');
62
- a <- read.table('$OUT', sep = '\\t', header = TRUE, as.is = TRUE, quote = '');
63
- ani.d <- enve.df2dist(a, default.d = $DEF_DIST, max.sim = 100);
64
- write.table(as.matrix(ani.d), '$OUT.dist',
65
- quote = FALSE, col.names = NA, row.names = TRUE, sep = '\\t')
66
- " | R --vanilla >/dev/null
@@ -1,105 +0,0 @@
1
- #!/bin/bash
2
-
3
- #
4
- # @author Luis M. Rodriguez-R
5
- # @update Mar-23-2016
6
- # @license artistic license 2.0
7
- #
8
-
9
- set -e # <- So it stops if there is an error
10
- function exists { [[ -e "$1" ]] ; } # <- To test *any* of many files
11
-
12
- ORG=$1 # <- Organism (see help)
13
- THR=2 # <- Number of threads
14
-
15
- # This is just the help message
16
- if [[ "$ORG" == "" ]] ; then
17
- echo "
18
- Use case: Essential genes phylogeny of a species. The essential genes are a
19
- collection of genes typically found in single copy in archaeal and bacterial
20
- genomes
21
-
22
- IMPORTANT
23
- This script is functional, but it's mainly intended for illustrative purposes.
24
- Please take a look at the code first.
25
-
26
- Usage:
27
- $0 <organism>
28
-
29
- <organism> The organism to use (e.g., Streptococcus_pneumoniae).
30
-
31
- " >&2
32
- exit
33
- fi
34
-
35
- # 00. Create environment
36
- export PATH=$(dirname $0)/../Scripts:$PATH
37
- if [[ -e $ORG ]] ; then
38
- echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
39
- exit 1
40
- fi
41
- mkdir $ORG
42
- for i in 01.proteome 02.essential 03.aln 04.cat 05.raxml 06.autoprune ; do
43
- mkdir $ORG/$i
44
- done
45
-
46
- # 01. Download proteomes
47
- echo "[01/06] Downloading and guzipping data"
48
- RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
49
- rm $ORG/01.proteome/assembly_summary.txt
50
- for i in $ORG/01.proteome/* ; do
51
- b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
52
- if exists $i/*.faa.gz ; then
53
- for j in $i/*.faa.gz ; do gunzip $j ; done
54
- cat $i/*.faa > $ORG/01.proteome/$b.faa
55
- fi
56
- rm -R $i
57
- done
58
-
59
- # 02. Essential genes
60
- echo "[02/06] Idenfifying essential genes"
61
- N=0
62
- for i in $ORG/01.proteome/*.faa ; do # <- This loop could be parallelized
63
- genomeA=$(basename $i .faa)
64
- dir=$ORG/02.essential/$genomeA
65
- mkdir $dir
66
- HMM.essential.rb -i $i -m $dir/ -R $dir/log.txt -r $genomeA -t $THR
67
- let N=$N+1
68
- done
69
-
70
- # 03. Find core and align groups
71
- echo "[03/06] Identifying core essentials and aligning groups"
72
- CORE_ESS=$(basename -s .faa $ORG/02.essential/*/*.faa | sort | uniq -c \
73
- | awk '$1=='$N'{print $2}')
74
- for b in $CORE_ESS ; do # <- This loop could be parallelized
75
- cat $ORG/02.essential/*/$b.faa > $ORG/03.aln/$b.faa
76
- clustalo -i $ORG/03.aln/$b.faa -o $ORG/03.aln/$b.aln #--threads=$THR
77
- done
78
-
79
- # 04. Concatenate alignment
80
- echo "[04/06] Concatenating alignments and removing invariable sites"
81
- Aln.cat.rb -I -c $ORG/04.cat/essential.raxcoords -i '|' $ORG/03.aln/*.aln \
82
- > $ORG/04.cat/essential.aln 2> $ORG/04.cat/essential.log
83
-
84
- # 05. Run RAxML
85
- echo "[05/06] Inferring phylogeny"
86
- # You REALLY should consider running the following with more threads (-T) and,
87
- # if possible, multi-nodes using MPI
88
- cd $ORG/05.raxml
89
- raxmlHPC-PTHREADS -T $THR -p 1234 \
90
- -s ../04.cat/essential.aln -q ../04.cat/essential.raxcoords \
91
- -m PROTCATGTR -n UNUS # IMPORTANT: Please read the documentation of RAxML
92
- # before running this line, so you know
93
- # that you're running what you really want. Check
94
- # options for bootstrapping and the different
95
- # algorithms (-f). Note that -m is required, but the
96
- # file essential.raxcoords specifies "AUTO", so RAxML will
97
- # attempt to find the model resulting in the highest
98
- # likelihood.
99
- cd ../..
100
-
101
- # 06. Autoprune
102
- echo "[06/06] Auto-pruning the tree"
103
- Newick.autoprune.R --t $ORG/05.raxml/RAxML_bestTree.UNUS --min_dist 0.001 \
104
- $ORG/06.autoprune/essential-pruned.nwk
105
-
@@ -1,100 +0,0 @@
1
- #!/bin/bash
2
-
3
- #
4
- # @author Luis M. Rodriguez-R
5
- # @update Oct-20-2015
6
- # @license artistic license 2.0
7
- #
8
-
9
- ORG=$1 # <- Organism (see help)
10
- THR=2 # <- Number of threads
11
-
12
- # This is just the help message
13
- if [[ "$ORG" == "" ]] ; then
14
- echo "
15
- Use case: Unus genome phylogeny of a species. The unus genome is the collection
16
- of orthologous groups in a set of genomes that has exactly one gene per genome,
17
- i.e., the core genome minus in-paralogs.
18
-
19
- IMPORTANT
20
- This script is functional, but it's mainly intended for illustrative purposes.
21
- Please take a look at the code first.
22
-
23
- Usage:
24
- $0 <organism>
25
-
26
- <organism> The organism to use (e.g., Streptococcus_pneumoniae).
27
-
28
- " >&2
29
- exit
30
- fi
31
-
32
- # 00. Create environment
33
- export PATH=$(dirname $0)/../Scripts:$PATH
34
- if [[ -e $ORG ]] ; then
35
- echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
36
- exit 1
37
- fi
38
- mkdir $ORG
39
- for i in 01.proteome 02.rbm 03.ogs 04.aln 05.cat 06.raxml ; do
40
- mkdir $ORG/$i
41
- done
42
-
43
- # 01. Download proteomes
44
- echo "[01/06] Downloading and guzipping data"
45
- RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
46
- rm $ORG/01.proteome/assembly_summary.txt
47
- for i in $ORG/01.proteome/* ; do
48
- b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
49
- for j in $i/*.faa.gz ; do gunzip $j ; done
50
- cat $i/*.faa > $ORG/01.proteome/$b.faa.tmp
51
- FastA.tag.rb -i $ORG/01.proteome/$b.faa.tmp -o $ORG/01.proteome/$b.faa.tmp -d
52
- rm -R $i $ORG/01.proteome/$b.faa.tmp
53
- done
54
-
55
- # 02. Reciprocal Best Matches
56
- echo "[02/06] Idenfifying Reciprocal Best Matches"
57
- for i in $ORG/01.proteome/*.faa ; do # <- This nested loop could be parallelized
58
- genomeA=$(basename $i .faa)
59
- for j in $ORG/01.proteome/*.faa ; do
60
- genomeB=$(basename $j .faa)
61
- rbm.rb -1 $i -2 $j -t $THR > $ORG/02.rbm/$genomeA-$genomeB.rbm
62
- [[ "$i" == "$j" ]] && continue # <- Ignore if it simplifies distribution
63
- done
64
- done
65
-
66
- # 03. Orthologous Groups
67
- echo "[03/06] Compiling Orthologous Groups"
68
- ogs.mcl.rb -d $ORG/02.rbm -o $ORG/03.ogs/pangenome.ogs -t $THR
69
-
70
- # 04. Extract unus genome and align groups
71
- echo "[04/06] Extracting unus genome and aligning OGs"
72
- ogs.extract.rb -i $ORG/03.ogs/pangenome.ogs -s $ORG/01.proteome/%s.faa \
73
- -o $ORG/04.aln/ -c 1 -d 1 -p
74
- for i in $ORG/04.aln/*.fa ; do # <- This loop could be parallelized
75
- b=$(basename $i .fa)
76
- clustalo -i $i -o $ORG/04.aln/$b.aln --threads=$THR
77
- done
78
-
79
- # 05. Concatenate alignment
80
- echo "[05/06] Concatenating alignments and removing invariable sites"
81
- Aln.cat.rb -I -c $ORG/05.cat/unus.raxcoords -i - $ORG/04.aln/*.aln \
82
- > $ORG/05.cat/unus.aln 2> $ORG/05.cat/unus.log
83
-
84
- # 06. Run RAxML
85
- echo "[06/06] Inferring phylogeny"
86
- # You REALLY should consider running the following with more threads (-T) and,
87
- # if possible, multi-nodes using MPI
88
- cd $ORG/06.raxml
89
- raxmlHPC-PTHREADS -T $THR -p 1234 \
90
- -s ../05.cat/unus.aln -q ../05.cat/unus.raxcoords \
91
- -m PROTCATGTR -n UNUS # IMPORTANT: Please read the documentation of RAxML
92
- # before running this line, so you know
93
- # that you're running what you really
94
- # want. Check options for bootstrapping
95
- # and the different algorithms (-f). Note
96
- # that -m is required, but the file
97
- # unus.raxcoords specifies "AUTO", so
98
- # RAxML will attempt to find the model
99
- # resulting in the highest likelihood.
100
-