miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,53 @@
#!/usr/bin/env perl
#
# @author  Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
# @license artistic license 2.0
# @update  Jul-05-2015
#
# Splits a FastQ file into <no_files> output files. Records (groups of four
# lines) are distributed in round-robin order: record number i (0-based) is
# written to output file number (i mod no_files) + 1.

use warnings;
use strict;
use Symbol;

my ($file, $base, $outN) = @ARGV;

$outN ||= 2;
($file and $base) or die "
Usage
   $0 in_file.fq out_base[ no_files]

   in_file.fq   Input file in FastQ format.
   out_base     Prefix for the name of the output files. It will
                be appended with .<i>.fastq, where <i> is a consecutive
                number starting in 1.
   no_files     Number of files to generate. By default: 2.

";

# A non-numeric or non-positive value would otherwise produce no output files
# and an invalid modulus further below
$outN =~ /^[1-9][0-9]*$/
   or die "The number of files must be a positive integer: $outN\n";

# Open one output handle per requested file
my @outSym = ();
for my $k (1 .. $outN){
   my $path = "$base.$k.fastq";
   $outSym[$k-1] = gensym;
   open $outSym[$k-1], ">", $path
      or die "I can not create the file: $path: $!\n";
}

# $i is the 0-based index of the record currently being copied; it advances on
# the first line of every four-line record
my $i = -1;
open my $in, "<", $file or die "I can not read the file: $file: $!\n";
while(my $ln = <$in>){
   $i++ if $. % 4 == 1;
   print { $outSym[$i % $outN] } $ln;
}
close $in;

# Check close() so that buffered write errors (e.g., full disk) are reported
for my $fh (@outSym){
   close $fh or die "I can not close an output file: $!\n";
}

print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";

@@ -0,0 +1,70 @@
#!/usr/bin/env ruby

# frozen_string_literal: true

# Rewrites a FastQ file replacing every record ID with a consecutive number
# (optionally wrapped in a prefix and a suffix) and reducing the separator
# line to a bare '+'. Sequence and quality lines are copied unchanged.

$:.push File.expand_path('../lib', __FILE__)
require 'enveomics_rb/enveomics'
$VERSION = 1.1

o = { q: false, p: '', s: '' }
OptionParser.new do |opts|
  opts.version = $VERSION
  Enveomics.opt_banner(
    opts, 'Generates easy-to-parse tagged reads from FastQ files',
    "#{File.basename($0)} -i in.fastq -o out.fastq [options]"
  )

  opts.separator 'Mandatory'
  opts.on(
    '-i', '--in FILE',
    'Path to the FastQ file containing the sequences',
    'Supports compression with .gz extension, use - for STDIN'
  ) { |v| o[:in] = v }
  opts.on(
    '-o', '--out FILE', 'Path to the FastQ to create',
    'Supports compression with .gz extension, use - for STDOUT'
  ) { |v| o[:out] = v }
  opts.separator ''
  opts.separator 'ID options'
  opts.on('-p', '--prefix STR', 'Prefix to use in all IDs') { |v| o[:p] = v }
  opts.on('-s', '--suffix STR', 'Suffix to use in all IDs') { |v| o[:s] = v }
  opts.separator ''
  opts.separator 'Other Options'
  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
  opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
  opts.separator ''
end.parse!

raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?

ifh = nil
ofh = nil
begin
  # NOTE(review): `reader` and `writer` are not defined in this script;
  # presumably they come from enveomics_rb -- confirm
  ifh = reader(o[:in])
  ofh = writer(o[:out])
  i = 0   # Number of records seen so far, used as the new ID
  lno = 0 # Input line number, counted here rather than relying on `$.`
  ifh.each do |ln|
    ln.chomp!
    lno += 1
    case lno % 4
    when 1
      # Record header: must start with '@'
      ln =~ /^@/ or
        raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}")
      i += 1
      ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
    when 3
      # Separator line: must start with '+'; anything after it is dropped
      ln =~ /^\+/ or
        raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}")
      ofh.puts '+'
    else
      # Sequence (2) and quality (0) lines are copied verbatim
      ofh.puts ln
    end
  end
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Report the failure to the caller instead of finishing with status 0
  exit 1
ensure
  # Close whatever was opened, also when an error interrupted the loop
  ifh&.close
  ofh&.close
end

@@ -0,0 +1,81 @@
#!/usr/bin/env ruby

# Compares, per read, the number of sequencing errors expected from the
# quality scores of a FastQ file against the differences observed in a
# tabular BLAST file, and writes one tab-delimited row per alignment.

require 'optparse'

o = { q: false }
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Compares the estimated error of sequencing reads (Q-score) with
observed mismatches (identity against a known reference sequence).

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-f", "--fastq FILE",
    "Path to the FastQ file containing the sequences."){ |v| o[:fastq] = v }
  opts.on("-b", "--blast FILE",
    "Path to the tabular BLAST file mapping reads to reference sequences."
    ){ |v| o[:blast] = v }
  opts.on("-o", "--out FILE",
    "Path to the output tab-delimited file to create."){ |v| o[:out] = v }
  opts.separator ""
  opts.separator "Other Options"
  # `true`, not `TRUE`: the upper-case constant was removed in Ruby 3.2
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-f is mandatory" if o[:fastq].nil?
abort "-b is mandatory" if o[:blast].nil?
abort "-o is mandatory" if o[:out].nil?

# Read the Q scores and estimate expected mismatches
mm = {} # <- Hash with read IDs as key, and arrays as values:
        #    [ expected mismatches, variance of mismatches, length ]
$stderr.puts "Reading FastQ file" unless o[:q]
File.open(o[:fastq], "r") do |fh|
  id = nil
  fh.each_line do |ln|
    case fh.lineno % 4
    when 1
      # Defline: the ID is everything between '@' and the first whitespace
      ln =~ /^@(\S+)/ or raise "Unexpected defline format: #{ln}"
      id = $1
      $stderr.print "  #{mm.size} reads...\r" unless o[:q]
    when 0
      # Quality line (4th of each record)
      ln.chomp!
      # I'm assuming ALWAYS Phred+33!!!
      # Per-base error probability: 10^(-Q/10)
      probs = ln.each_char.map { |c| 10.0**(-(c.ord - 33).to_f / 10.0) }
      # Seeded with 0.0 so that an empty quality line yields 0.0, not nil
      mu  = probs.inject(0.0, :+)
      var = probs.map { |pr| pr * (1.0 - pr) }.inject(0.0, :+)
      mm[id] = [mu, var, probs.size]
    end
  end
  $stderr.puts "  Found: #{mm.size} reads." unless o[:q]
end

# Block form guarantees the output is flushed and closed even on error
File.open(o[:out], "w") do |ofh|
  ofh.puts %w[id obs_subs obs_id aln_len obs_ins obs_del obs_gap mu var len].join("\t")

  # Read Identities and compare against expectation
  # NOTE(review): column indexes below assume the standard tabular BLAST
  # layout (qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend,
  # sstart, send, ...) -- confirm against the files actually used
  $stderr.puts "Reading Tabular BLAST file" unless o[:q]
  File.open(o[:blast], "r") do |fh|
    k = 0
    fh.each_line do |ln|
      r = ln.chomp.split("\t")
      id = r[0]
      # Skip alignments of reads absent from the FastQ file
      next if mm[id].nil?

      k += 1
      $stderr.print "  #{k} alignments...\r" unless o[:q]
      # Column 5 value plus the read positions before column 7 and after
      # column 8 (i.e., outside the aligned region of the read)
      obs_m   = r[4].to_i + (r[6].to_i - 1) + (mm[id][2] - r[7].to_i)
      obs_del = r[3].to_i - (r[7].to_i - r[6].to_i).abs
      obs_ins = r[3].to_i - (r[9].to_i - r[8].to_i).abs
      row = [id, obs_m, r[2], r[7].to_i - r[6].to_i + 1,
             obs_ins, obs_del, r[5]] + mm[id]
      ofh.puts row.join("\t")
    end
    $stderr.puts "  Found #{k} alignments." unless o[:q]
  end
end
@@ -0,0 +1,24 @@
#!/usr/bin/env awk -f
#
# @author  Luis M. Rodriguez-R
# @update  Dec-26-2015
# @license artistic license 2.0
#
# Converts FastQ (read from the input) into FastA: for every four-line
# record only the first two lines are printed, and a leading "@" in the
# first one is replaced by ">".

# Show the help message and stop if any argument is exactly --help
BEGIN {
  i = 0
  while (i < ARGC) {
    if (ARGV[i] == "--help") {
      print "Description:\n"
      print " Translates FastQ files into FastA.\n"
      print "Usage:\n"
      print " FastQ.toFastA.awk < in.fq > out.fa\n"
      exit
    }
    i++
  }
}

# First line of each record: turn the "@" marker into ">"
NR % 4 == 1 {
  sub(/^@/, ">")
  print
  next
}

# Second line of each record: printed unchanged
NR % 4 == 2 {
  print
}

@@ -0,0 +1,127 @@
#!/usr/bin/env perl

# @author  Luis M. Rodriguez-R
# @license Artistic-2.0
#
# Reads a FastA file of subject sequences and a GFF file of features, and
# prints to STDOUT a tab-delimited list of coordinates (position, name, type)
# in the coordinate system obtained by concatenating the subject sequences in
# FastA order.

use warnings;
use strict;
use List::Util qw/min max/;
use Getopt::Std;

sub HELP_MESSAGE { die "

Description:
   Generates a list of coordinates from a GFF table concatenating the subject
   sequences.

   See also: BlastTab.recplot2.R and BlastTab.catsbj.pl

Usage:
   $0 [options] seq.fa map.gff > abs-coords.tsv

   seq.fa       Subject sequences (contigs) in FastA format.
   map.gff      Features to map in GFF.

Options:
   -L path      Generate a file with the absolute coordinates of the
                concatenated contigs. This is identical to the .lim file
                generated by BlastTab.catsbj.pl.
   -i           Preserve exact coordinates and include inter-feature windows as
                separate bins. By default, the coordinates are set in the
                midpoint between features when non-contiguous.
   -s           The FastA provided is to be treated as a subset of the subject.
                By default, it expects all the contigs in the GFF to be present
                in the FastA.
   -q           Run quietly.
   -h           Display this message and exit.

"; }

my %o;
getopts('L:isqh', \%o);
my($fa, $map) = @ARGV;
($fa and $map) or &HELP_MESSAGE;
$o{h} and &HELP_MESSAGE;

my %seq = (); # Per-sequence coordinate (see the two loops below)
my @seq = (); # Sequence IDs in the order they appear in the FastA
my $tot = 0;

SEQ:{
   print STDERR "== Reading reference sequences\n" unless $o{q};
   open FA, "<", $fa or die "Cannot read the file: $fa: $!\n";
   my $cur_seq = '';
   while(<FA>){
      chomp;
      if(m/^>(\S+)/){
         my $c = $1;
         # Start counting one past the running total of the previous sequence
         $seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
         push @seq, $c;
         $cur_seq = $c;
      }else{
         # Only letters are counted as sequence
         s/[^A-Za-z]//g;
         $seq{$cur_seq} += length $_;
      }
   }
   close FA;
   print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
}

# Write the limits file (or discard it), and turn %seq from "last coordinate of
# the sequence" into "first coordinate of the sequence", which is later used as
# the offset of its features. $l ends as the last coordinate overall.
$o{L} ||= '/dev/null';
open LIM, ">", $o{L} or die "Cannot create the file: $o{L}: $!\n";
my $l = 0;
for my $s (@seq){
   print LIM "$s\t".(++$l)."\t$seq{$s}\n";
   ($l, $seq{$s}) = ($seq{$s}, $l);
}
close LIM;

MAP: {
   print STDERR "== Reading mapping\n" unless $o{q};
   open GFF, "<", $map or die "Cannot read the file: $map: $!\n";
   my $last_end = 1;
   my $last_name = "NA";
   print "1\tNA\tNA\n";
   my $i = 0;
   FEATURE: while(<GFF>){
      next if /^\s*(#.*)?$/; # Blank or comment lines
      chomp;
      my @ln = split /\t/;
      $ln[4] or die "Cannot parse line $map:$.: $_\n";
      unless(exists $seq{$ln[0]}){
         die "Cannot find the subject sequence: $ln[0]\n" unless $o{s};
         next FEATURE;
      }
      $i++;
      my $start = $seq{$ln[0]}+$ln[3];
      my $end   = $seq{$ln[0]}+$ln[4];
      # Feature name taken from the attributes column when recognized;
      # a missing column is treated as empty to avoid undefined-value warnings
      my $attr = defined $ln[8] ? $ln[8] : '';
      my $name = "feat_$i";
      if($attr =~ /^gene_id=(\d+)/){ # <- GeneMark style
         $name = "gene_id_$1";
      }elsif($attr =~ /^ID=\d+_(\d+)/){ # <- Prodigal style
         $name = $ln[0]."_".$1;
      }elsif($attr =~ /^ID=([^;]+)/){
         $name = $1;
      }
      if($o{i}){
         # Exact coordinates: emit the gap before the feature (if any), then
         # the feature itself
         $start = $last_end if $start < $last_end;
         print "$start\t$last_name~$name\tGAP\n" unless $start==$last_end;
         print "$end\t$name\tFEAT\n";
      }else{
         # Close the previous feature at the midpoint between its end and the
         # start of the current one, as documented in the help message
         my $midpoint = int(($last_end + $start)/2);
         print "$midpoint\t$last_name\tFEAT\n" unless $last_end==1;
      }
      $last_name = $name;
      $last_end  = $end;
   }
   # Close the last bin at the end of the concatenated sequences
   if($last_end > 1){
      if($o{i}){
         print "$l\t$last_name~NA\tGAP\n" unless $last_end==$l;
      }else{
         print "$l\t$last_name\tFEAT\n";
      }
   }
   close GFF;
   print STDERR " done.\n" unless $o{q};
}

@@ -0,0 +1,84 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author: Luis M. Rodriguez-R
5
+ # @update: Feb-06-2015
6
+ # @license: artistic license 2.0
7
+ #
8
+
9
+ require 'optparse'
10
+
11
# Command-line options. Keys:
#   :gb    - input GenBank file (mandatory)
#   :table - tab-delimited annotation table with a header row (mandatory)
#   :out   - output GenBank file (mandatory)
#   :k     - 1-based column of the table used as feature identifier
#   :split - string separating multiple entries within one annotation field
#   :q     - quiet mode
# `true`/`false` are used instead of the TRUE/FALSE constants, which were
# removed in Ruby 3.2.
o = {:q=>false, :k=>1, :split=>"#"}
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Adds annotations to GenBank files.

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-g", "--genbank FILE", "Input GenBank file."){ |v| o[:gb]=v }
  opts.on("-t", "--table FILE", "Input file containing the annotations. It must be a ",
    "tab-delimited raw table including a header row with ",
    "the names of the fields."){ |v| o[:table]=v }
  opts.on("-o", "--out FILE", "Output file containing the annotated GenBank."){ |v| o[:out]=v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-k", "--key NUMBER", "Key of the column to use as identifier. By default: #{o[:k]}"){ |v| o[:k] = v.to_i }
  # The separator is stored as given. Previously this handler overwrote the
  # key column (o[:k]) with v.to_i, so --split never took effect.
  opts.on("-s", "--split STRING", "String that separates multiple entries in the annotation features. By default: \"#{o[:split]}\""){ |v| o[:split] = v }
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-g is mandatory" if o[:gb].nil?
abort "-t is mandatory" if o[:table].nil?
abort "-o is mandatory" if o[:out].nil?
39
+
40
+ ##### MAIN:
41
# Main routine. Loads the annotation table into memory (indexed by the
# identifier column o[:k]) and then copies the GenBank file o[:gb] to o[:out]
# line by line. After every qualifier line of the form
#   /<identifier field>="<id>"
# whose <id> is present in the table, one qualifier per remaining non-empty
# field is appended, splitting multi-entry fields on o[:split].
begin
  key_idx = o[:k] - 1

  puts "Reading annotation table: #{o[:table]}." unless o[:q]
  header = nil
  annot = {}
  # Block form guarantees the handle is closed even if parsing raises.
  File.open(o[:table], "r") do |ifh|
    first = ifh.gets
    raise "Empty annotation table: #{o[:table]}" if first.nil?
    header = first.chomp.split(/\t/)
    unless key_idx >= 0 && key_idx < header.size
      raise "Key column #{o[:k]} is out of range: " \
            "the table has #{header.size} fields"
    end
    puts " * using #{header[key_idx]} column as feature identifier." unless o[:q]
    ifh.each_line do |ln|
      row = ln.chomp.split(/\t/)
      id = row[key_idx]
      # Later rows silently replace earlier ones; only warn about it.
      warn "WARNING: #{header[key_idx]} #{id} found more than once." if annot.key?(id)
      annot[id] = row
    end
  end
  puts " * found #{annot.size} annotation entries with #{header.size} fields." unless o[:q]

  puts "Annotating GenBank." unless o[:q]
  # Both patterns are loop-invariant, so they are built once. The field name
  # and the separator are escaped to be matched literally.
  feature_rx = /^(?<sp>\s+)\/#{Regexp.escape(header[key_idx])}="(?<id>.+)"/
  split_rx = /#{Regexp.escape(o[:split])}/
  found = 0
  notfound = 0
  File.open(o[:gb], "r") do |ifh|
    File.open(o[:out], "w") do |ofh|
      ifh.each_line do |ln|
        ofh.print ln
        m = feature_rx.match(ln)
        next if m.nil?
        row = annot[m[:id]]
        if row.nil?
          notfound += 1
          next
        end
        found += 1
        row.each_with_index do |value, i|
          next if i == key_idx || value == ""
          # Reuse the leading whitespace of the matched qualifier line.
          value.split(split_rx).each do |v|
            ofh.puts "#{m[:sp]}/#{header[i]}=\"#{v}\""
          end
        end
      end
    end
  end
  puts " * annotated #{found} features." unless o[:q]
  puts " * couldn't find #{notfound} features in the annotation table." unless o[:q] || notfound.zero?
  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  # NOTE(review): errors are reported but the script still exits with status
  # 0; kept as-is for backward compatibility -- consider `exit 1` here.
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end
83
+
84
+
@@ -0,0 +1,351 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license artistic license 2.0
5
+
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
8
+ use 'tmpdir'
9
+ use 'zlib'
10
+
11
# Command-line options and their defaults. :in, :out, :permodel, :report,
# :hmmout, :alignments, :rename and :model_file are only set when the
# corresponding flag is passed.
o = {
  bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
  archaea: false, genomeeq: false, metagenome: false, list: false,
  collection: 'dupont_2012'
}
OptionParser.new do |opts|
  opts.banner = "
Finds and extracts a collection of essential proteins suitable for genome
completeness evaluation and phylogenetic analyses. Important note: most complete
bacterial genomes contain only 106/111 genes in this collection, therefore
producing a completeness of 95.5%, and most archaeal genomes only contain 26/111
genes, producing a completeness of 23.4%. Use the options --bacteria and/or
--archaea to ignore models often missing in one or both domains. Note that even
with these options, some complete archaeal genomes result in very low values of
completeness (e.g., Nanoarchaeum equitans returns 88.5%).

Requires HMMer 3.0+ (http://hmmer.janelia.org/software).

Usage: #{$0} [options]"
  opts.separator ''
  opts.separator 'Mandatory'
  opts.on(
    '-i', '--in FILE',
    'Path to the FastA file (.gz allowed) with all the proteins in a genome'
  ) { |v| o[:in] = v }
  opts.separator ''
  opts.separator 'Options'
  opts.on(
    '-c', '--collection STR',
    'Reference collection of essential proteins to use. One of:',
    '> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
    ' modified by https://doi.org/10.1038/ismej.2015.5',
    '> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
    ' modified by https://doi.org/10.7717/peerj.1319'
  ) { |v| o[:collection] = v }
  opts.on(
    '-o', '--out FILE',
    'Path to the output FastA file with the translated essential genes',
    'By default the file is not produced'
  ) { |v| o[:out] = v }
  opts.on(
    '-m', '--per-model STR',
    'Prefix of translated genes in independent files with the name of the',
    'model appended. By default files are not produced'
  ) { |v| o[:permodel] = v }
  opts.on(
    '-R', '--report FILE',
    'Path to the report file. By default, the report is sent to the STDOUT'
  ) { |v| o[:report] = v }
  opts.on(
    '--hmm-out FILE',
    'Save HMMsearch output in this file. By default, not saved'
  ) { |v| o[:hmmout] = v }
  opts.on(
    '--alignments FILE',
    'Save the aligned proteins in this file. By default, not saved'
  ) { |v| o[:alignments] = v }
  opts.on(
    '-B', '--bacteria',
    'If set, ignores models typically missing in Bacteria'
  ) { |v| o[:bacteria] = v }
  opts.on(
    '-A', '--archaea',
    'If set, ignores models typically missing in Archaea'
  ) { |v| o[:archaea] = v }
  opts.on(
    '-G', '--genome-eq',
    'If set, ignores models not suitable for genome-equivalents estimations',
    'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
  ) { |v| o[:genomeeq] = v }
  opts.on(
    '-r', '--rename STR',
    'If set, renames the sequences with the string provided and appends it',
    'with pipe and the gene name (except in --per-model files)'
  ) { |v| o[:rename] = v }
  # For the two --no-* switches OptionParser yields `false`, which turns the
  # corresponding default (true) off.
  opts.on(
    '-n', '--no-stats',
    'If set, no statistics are reported on genome evaluation'
  ) { |v| o[:stats] = v }
  opts.on(
    '-s', '--no-genes',
    'If set, statistics won\'t include the lists of missing/multi-copy genes'
  ) { |v| o[:genes] = v }
  opts.on(
    '-M', '--metagenome',
    'If set, it allows for multiple copies of each gene and turns on',
    'metagenomic report mode'
  ) { |v| o[:metagenome] = v }
  opts.separator ''
  opts.separator 'Other Options'
  opts.on(
    '-L', '--list-models',
    'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
    'and -q; ignores all other parameters'
  ) { |v| o[:list] = v }
  opts.on(
    '-b', '--bin DIR',
    'Path to the directory containing the binaries of HMMer 3.0+'
  ) { |v| o[:bin] = v }
  # The FILE placeholder makes this switch take an argument; without it
  # OptionParser yields `true` rather than the path.
  opts.on(
    '--model-file FILE',
    'External file containing models to search'
  ) { |v| o[:model_file] = v }
  opts.on(
    '-t', '--threads INT', Integer,
    "Number of parallel threads to be used. By default: #{o[:thr]}"
  ) { |v| o[:thr] = v }
  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
  opts.separator ''
end.parse!
abort '-i is mandatory' if o[:in].nil? && !o[:list]
# Allows the binary name to be appended directly to the directory
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
# Renaming is disabled in metagenome mode
o[:rename] = nil if o[:metagenome]
128
+
129
# Model-name lists for the selected collection. They are used further down to
# drop models from the search when --archaea, --bacteria or --genome-eq are
# set, respectively. An unknown collection name raises.
# NOTE(review): `tRNA-synth_1d` (dupont_2012, genome-equivalents list) is
# spelled differently from the `tRNA-synt_*` names in the lee_2019 lists --
# verify it against the NAME entries of the model file.
not_in_archaea, not_in_bacteria, not_as_genomeeq =
  case o[:collection]
  when 'dupont_2012'
    [
      %w[GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
         TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
         TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
         TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
         TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
         TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
         TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
         TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
         TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
         TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
         TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
         TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
         TIGR02729 TIGR03263 TIGR03594],
      %w[TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387],
      %w[TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
         TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d]
    ]
  when 'lee_2019'
    archaea_gaps = %w[ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
      EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
      Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
      Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
      Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
      Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
      RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC]
    bacteria_gaps = %w[AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
      CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
      DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
      Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
      Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
      Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
      Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
      Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I]
    # For this collection the genome-equivalents list is the union of both
    [archaea_gaps, bacteria_gaps, archaea_gaps + bacteria_gaps]
  else
    raise "Unsupported collection: '#{o[:collection]}'"
  end
167
+
168
##
# Median of +values+, which must already be sorted in ascending order.
# For an even number of elements it is the mean of the two central ones;
# returns 0.0 for an empty list.
def median_of_sorted(values)
  return 0.0 if values.empty?

  mid = values.size / 2
  values.size.even? ? (values[mid - 1] + values[mid]) / 2.0 : values[mid]
end

# Main routine. Inside a temporary directory:
# 1. Decompresses the input proteins if the path ends in .gz.
# 2. Copies the HMM collection to a plain file and registers every model that
#    has both a NAME and a DESC line, then drops the models excluded by
#    --archaea / --bacteria / --genome-eq (or lists them and exits with -L).
# 3. Runs hmmsearch and parses its --tblout table.
# 4. Writes the report, the extracted sequences and the alignments, as
#    requested.
begin
  Dir.mktmpdir do |dir|
    $stderr.puts "Temporal directory: #{dir}." unless o[:q]
    # Only a trailing .gz counts; o[:in] may be nil with --list-models.
    if o[:in].to_s.end_with?('.gz')
      tmp_in = File.expand_path('sequences.fa', dir)
      Zlib::GzipReader.open(o[:in]) do |ifh|
        File.open(tmp_in, 'w') { |ofh| ofh.print ifh.read }
      end
      o[:in] = tmp_in
    end

    # Create database.
    $stderr.puts 'Searching models.' unless o[:q]
    models = {}
    model_id = nil
    o[:model_file] ||= File.expand_path(
      "../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
    mfh = (File.extname(o[:model_file]) == '.gz') ?
      Zlib::GzipReader.open(o[:model_file]) :
      File.open(o[:model_file], 'r')
    begin
      File.open("#{dir}/essential.hmm", 'w') do |dbh|
        while ln = mfh.gets
          dbh.print ln
          ln.chomp!
          model_id = $1 if ln =~ /^NAME\s+(.+)/
          # Models are registered on their DESC line, under the last NAME seen
          models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
        end
      end
    ensure
      mfh.close
    end
    models.delete_if { |id, _| not_in_archaea.include? id } if o[:archaea]
    models.delete_if { |id, _| not_in_bacteria.include? id } if o[:bacteria]
    models.delete_if { |id, _| not_as_genomeeq.include? id } if o[:genomeeq]
    if o[:list]
      models.each_pair { |id, desc| puts [id, desc].join("\t") }
      exit
    end

    # Check HMMer version and run HMMsearch.
    if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
      raise 'You have provided an unsupported version of HMMER. ' +
        'This script requires HMMER 3.0+.'
    end
    o[:hmmout] ||= "#{dir}/hmmsearch"
    # Argument-list form: no shell is involved, so paths containing quotes or
    # spaces are passed through untouched. STDOUT goes to the log file.
    hmmsearch_ok = system(
      "#{o[:bin]}hmmsearch", '--cpu', o[:thr].to_s, '--tblout', o[:hmmout],
      '-A', "#{dir}/a.sto", '--cut_tc', '--notextw',
      "#{dir}/essential.hmm", o[:in],
      out: "#{dir}/hmmsearch.log"
    )
    raise 'hmmsearch failed, unable to evaluate the models.' unless hmmsearch_ok

    # Parse output
    $stderr.puts 'Parsing results.' unless o[:q]
    trash = [] # One entry per extra copy of a model (single-genome mode only)
    genes = {} # Model => sequence ID, or array of sequence IDs (metagenome)
    File.foreach(o[:hmmout]) do |ln|
      next if ln =~ /^#/
      r = ln.split(/\s+/)
      # Column 1 is read as the sequence ID and column 3 as the model name
      next unless models.key? r[2]
      if o[:metagenome]
        (genes[r[2]] ||= []) << r[0]
      elsif genes[r[2]].nil?
        genes[r[2]] = r[0]
      else
        trash << r[2]
      end
    end

    # Report statistics
    if o[:stats]
      reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
      begin
        modifiers = [:bacteria, :archaea, :genomeeq]
          .map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
        missing = models.keys.reject { |m| genes.key? m }
        reph.puts "! Collection: #{o[:collection]} #{modifiers}"
        reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
        if o[:metagenome]
          # Sorted copy counts, with one zero for every model without hits
          gc = [0] * (models.size - genes.size) +
            genes.values.map { |g| g.length }.sort
          reph.printf "! Mean number of copies per model: %.3f.\n",
            gc.inject(:+).to_f / models.size
          reph.printf "! Median number of copies per model: %.1f.\n",
            median_of_sorted(gc)
          if o[:genes] && !missing.empty?
            reph.printf "! Missing genes: %s\n",
              ([''] + missing.map { |m| "#{m}: #{models[m]}." }).join("\n! ")
          end
        else
          reph.printf "! Completeness: %.1f%%.\n",
            100.0 * genes.size / models.size
          reph.printf "! Contamination: %.1f%%.\n",
            100.0 * trash.size / models.size
          if o[:genes]
            unless trash.empty?
              # trash holds the extra copies only, hence the +1
              reph.printf "! Multiple copies: %s\n",
                ([''] + trash.uniq.
                  map { |m| "#{trash.count(m) + 1} #{m}: #{models[m]}." }).
                  join("\n! ")
            end
            unless missing.empty?
              reph.printf "! Missing genes: %s\n",
                ([''] + missing.map { |m| "#{m}: #{models[m]}." }).join("\n! ")
            end
          end
        end
      ensure
        reph.close unless o[:report].nil?
      end
    end

    # Extract sequences
    unless o[:out].nil? && o[:permodel].nil?
      $stderr.puts 'Extracting sequences.' unless o[:q]
      # Sequence ID => model. When a sequence hits several models the first
      # one (in the order models were found) wins.
      model_of = {}
      genes.each do |model, hits|
        Array(hits).each { |g| model_of[g] ||= model }
      end
      # Truncate the per-model files, since they are opened in append mode
      unless o[:permodel].nil?
        genes.each_key { |m| File.open("#{o[:permodel]}#{m}.faa", 'w').close }
      end
      outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
      geneh = nil
      hit_model = nil
      begin
        File.foreach(o[:in]) do |ln|
          if ln =~ /^>(\S+)/
            hit_model = model_of[$1]
            next if hit_model.nil?
            unless o[:permodel].nil?
              geneh.close unless geneh.nil?
              geneh = File.open("#{o[:permodel]}#{hit_model}.faa", 'a+')
            end
            unless outh.nil?
              outh.print(
                o[:rename].nil? ? ln : ">#{o[:rename]}|#{hit_model}\n")
            end
            unless geneh.nil?
              geneh.print(o[:rename].nil? ? ln : ">#{o[:rename]}\n")
            end
          else
            # Sequence lines are copied only while inside a selected entry
            next if hit_model.nil?
            outh.print ln unless outh.nil?
            geneh.print ln unless geneh.nil?
          end
        end
      ensure
        geneh.close unless geneh.nil?
        outh.close unless outh.nil?
      end
    end

    # Save, per model, the reference line and the first aligned sequence of
    # the Stockholm alignments produced by hmmsearch.
    unless o[:alignments].nil?
      aln = {}
      File.open("#{dir}/a.sto", 'r') do |fh|
        cur_model = nil
        mask = []
        fh.each_line do |ln|
          case ln.chomp
          when /^# STOCKHOLM/
            # A new alignment starts
            cur_model = nil
            mask = []
          when /^#=GS (\S+)\/([\d\-]+)\s+DE/
            cur_model ||= (genes.rassoc($1) || []).first
            aln[cur_model] ||= ["# #{cur_model} : #{$1} : #{$2}"]
          when /^#=GC RF\s+(\S+)/
            # Drop from the reference the columns removed from the sequence
            aln[cur_model][1] ||= $1.upcase.tap do |i|
              mask.each { |d| i[d] = '' }
            end
          when /^[^#]\S*\s+(\S+)/
            next if aln[cur_model][2]
            aln[cur_model][2] = $1.upcase
            # Positions holding '.', in descending order so that deleting
            # one does not shift the ones still pending
            mask = aln[cur_model][2].split('').each_with_index.
              map { |v, k| v == '.' ? k : nil }.compact.reverse
            aln[cur_model][2].delete!('.') unless mask.empty?
          end
        end
      end
      File.open(o[:alignments], 'w') do |fh|
        aln.each { |_k, v| v.each { |i| fh.puts i } }
      end
    end

    $stderr.puts 'Done.' unless o[:q]
  end # |dir|
rescue => err
  # NOTE(review): errors are reported but the script still exits with status
  # 0; kept as-is for backward compatibility -- consider `exit 1` here.
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end