miga-base 1.2.17.0 → 1.2.17.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (299) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +1 -1
  3. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  4. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  5. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  6. data/utils/FastAAI/FastAAI +3659 -0
  7. data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
  8. data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
  9. data/utils/FastAAI/README.md +84 -0
  10. data/utils/enveomics/Docs/recplot2.md +244 -0
  11. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  12. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  13. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  14. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  15. data/utils/enveomics/LICENSE.txt +73 -0
  16. data/utils/enveomics/Makefile +52 -0
  17. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  18. data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
  19. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  20. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  21. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  22. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  23. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  24. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  25. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  26. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  27. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
  28. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  29. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  30. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  31. data/utils/enveomics/Manifest/categories.json +165 -0
  32. data/utils/enveomics/Manifest/examples.json +162 -0
  33. data/utils/enveomics/Manifest/tasks.json +4 -0
  34. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  35. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  36. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  37. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  38. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  39. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  40. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  41. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  42. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  43. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  44. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  45. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  46. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  47. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  48. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  49. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  50. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  51. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  52. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  53. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  54. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  55. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  56. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  57. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  58. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  59. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  60. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  61. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  62. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  63. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  64. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  65. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  66. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  67. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  68. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  69. data/utils/enveomics/README.md +42 -0
  70. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  71. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  72. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  73. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  74. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  75. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  76. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  77. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  78. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  79. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  80. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  81. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  82. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  83. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  84. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  85. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  86. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  87. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  88. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  89. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  90. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  91. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  92. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  93. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
  94. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  95. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  96. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  97. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  98. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  99. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  100. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  101. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  102. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  103. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  104. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  105. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  106. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  107. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  108. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  109. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  110. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  111. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  112. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  113. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  114. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  115. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  116. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  117. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  118. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  119. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  120. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  121. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  122. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  123. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  124. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  125. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  126. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  127. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  128. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  129. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  130. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  131. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  132. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  133. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  134. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  135. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  136. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  137. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  138. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  139. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  140. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  141. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  142. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  143. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  144. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  145. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  146. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  147. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  148. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  149. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  150. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  151. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  152. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  153. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  154. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  155. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  156. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  157. data/utils/enveomics/Scripts/aai.rb +421 -0
  158. data/utils/enveomics/Scripts/ani.rb +362 -0
  159. data/utils/enveomics/Scripts/anir.rb +137 -0
  160. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  161. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  162. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  163. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  164. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  165. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  166. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  167. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  168. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  169. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  170. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  171. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  172. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
  173. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  174. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  175. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  176. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  177. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  178. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  179. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  180. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
  181. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  182. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  183. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  184. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  185. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  186. data/utils/enveomics/Scripts/ogs.rb +104 -0
  187. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  188. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  189. data/utils/enveomics/Scripts/rbm.rb +108 -0
  190. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  191. data/utils/enveomics/Tests/Makefile +10 -0
  192. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  193. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  194. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  195. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  196. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  197. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  198. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  199. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  200. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  201. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  202. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  203. data/utils/enveomics/Tests/alkB.nwk +1 -0
  204. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  205. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  206. data/utils/enveomics/Tests/hiv1.faa +59 -0
  207. data/utils/enveomics/Tests/hiv1.fna +134 -0
  208. data/utils/enveomics/Tests/hiv2.faa +70 -0
  209. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  210. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  211. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  212. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  213. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  214. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  215. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  216. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  217. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  218. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  219. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  220. data/utils/enveomics/build_enveomics_r.bash +45 -0
  221. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  222. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  223. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  224. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  225. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  226. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  227. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  228. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  229. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  230. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  231. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  232. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  233. data/utils/enveomics/enveomics.R/README.md +81 -0
  234. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  235. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  236. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  237. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  238. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  239. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  240. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  241. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  242. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  243. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  244. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  245. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  246. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  247. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  248. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  249. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  250. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  251. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  252. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  253. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  254. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  255. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  256. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  257. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  258. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  259. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  260. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  261. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  262. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  263. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  264. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  265. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  266. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  267. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  268. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  269. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  270. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  271. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  272. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  273. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  274. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  275. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  276. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  277. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  278. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  279. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  280. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  282. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  283. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  284. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  285. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  286. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  287. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  288. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  289. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  290. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  291. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  292. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  293. data/utils/enveomics/globals.mk +8 -0
  294. data/utils/enveomics/manifest.json +9 -0
  295. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  296. data/utils/multitrim/README.md +67 -0
  297. data/utils/multitrim/multitrim.py +1555 -0
  298. data/utils/multitrim/multitrim.yml +13 -0
  299. metadata +301 -5
@@ -0,0 +1,127 @@
#!/usr/bin/env perl

# @author  Luis M. Rodriguez-R
# @license Artistic-2.0
#
# Maps the features of a GFF table onto the coordinate system obtained by
# concatenating the subject sequences of a FastA file (in file order), and
# prints one "coordinate<TAB>name<TAB>type" row per bin boundary to STDOUT.

use warnings;
use strict;
use List::Util qw/min max/;
use Getopt::Std;

# Prints the usage message to STDERR and terminates the program (via die).
sub HELP_MESSAGE { die "

Description:
Generates a list of coordinates from a GFF table concatenating the subject
sequences.

See also: BlastTab.recplot2.R and BlastTab.catsbj.pl

Usage:
$0 [options] seq.fa map.gff > abs-coords.tsv

seq.fa Subject sequences (contigs) in FastA format.
map.gff Features to map in GFF.

Options:
-L path Generate a file with the absolute coordinates of the
concatenated contigs. This is identical to the .lim file
generated by BlastTab.catsbj.pl.
-i Preserve exact coordinates and include inter-feature windows as
separate bins. By default, the coordinates are set in the
midpoint between features when non-contiguous.
-s The FastA provided is to be treated as a subset of the subject.
By default, it expects all the contigs to be present in the
BLAST.
-q Run quietly.
-h Display this message and exit.

"; }

# -L takes a value; -i, -s, -q and -h are boolean flags.
my %o;
getopts('L:isqh', \%o);
my($fa, $map) = @ARGV;
($fa and $map) or &HELP_MESSAGE;
$o{h} and &HELP_MESSAGE;

# %seq maps each contig ID to a coordinate in the concatenation: while the
# FastA is being read it holds the cumulative end of the contig, and once the
# limits have been written it holds the start of the contig, which is then
# used as the offset of its features.
# @seq keeps the contig IDs in the order they appear in the FastA.
my %seq = ();
my @seq = ();

SEQ:{
   print STDERR "== Reading reference sequences\n" unless $o{q};
   open my $fa_fh, "<", $fa or die "Cannot read the file: $fa: $!\n";
   my $cur_seq = '';
   while(<$fa_fh>){
      chomp;
      if(m/^>(\S+)/){
         my $c = $1;
         # A contig begins one position after the end of the previous one.
         $seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
         push @seq, $c;
         $cur_seq = $c;
      }else{
         # Only letters count towards the sequence length.
         s/[^A-Za-z]//g;
         $seq{$cur_seq} += length $_;
      }
   }
   close $fa_fh;
   print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
}

# Write the limits of each contig (ID, start, end). When -L is not given the
# table is sent to /dev/null, but the loop must still run because it turns
# %seq from "end of contig" into "start of contig" and leaves the total
# length of the concatenation in $l.
$o{L} ||= '/dev/null';
open my $lim_fh, ">", $o{L} or die "Cannot create the file: $o{L}: $!\n";
my $l = 0;
for my $s (@seq){
   print {$lim_fh} "$s\t".(++$l)."\t$seq{$s}\n";
   ($l, $seq{$s}) = ($seq{$s}, $l);
}
close $lim_fh;

MAP: {
   print STDERR "== Reading mapping\n" unless $o{q};
   open my $gff_fh, "<", $map or die "Cannot read the file: $map: $!\n";
   my $last_end = 1;
   my $last_name = "NA";
   print "1\tNA\tNA\n";
   my $i = 0;
   FEATURE: while(<$gff_fh>){
      next if /^\s*(#.*)?$/; # Blank or comment lines
      chomp;
      my @ln = split /\t/;
      $ln[4] or die "Cannot parse line $map:$.: $_\n";
      unless(exists $seq{$ln[0]}){
         # With -s, features on contigs absent from the FastA are ignored.
         die "Cannot find the subject sequence: $ln[0]\n" unless $o{s};
         next FEATURE;
      }
      $i++;
      my $start = $seq{$ln[0]}+$ln[3];
      my $end = $seq{$ln[0]}+$ln[4];
      my $name = "feat_$i";
      # Rows without an attributes column keep the default feat_N name.
      my $attr = defined $ln[8] ? $ln[8] : '';
      if($attr =~ /^gene_id=(\d+)/){ # <- GeneMark style
         $name = "gene_id_$1";
      }elsif($attr =~ /^ID=\d+_(\d+)/){ # <- Prodigal style
         $name = $ln[0]."_".$1;
      }elsif($attr =~ /^ID=([^;]+)/){
         $name = $1;
      }
      if($o{i}){
         # Exact coordinates: overlapping features are clipped to the end of
         # the previous one, and any gap is reported as its own bin.
         $start = $last_end if $start < $last_end;
         print "$start\t$last_name~$name\tGAP\n" unless $start==$last_end;
         print "$end\t$name\tFEAT\n";
      }else{
         # Default: the bin of the previous feature is closed halfway between
         # its end and the start of the current feature.
         my $midpoint = int(($last_end + $start)/2);
         print "$midpoint\t$last_name\tFEAT\n" unless $last_end==1;
      }
      $last_name = $name;
      $last_end = $end;
   }
   # Close the last bin at the end of the concatenated sequence, but only if
   # at least one feature was mapped.
   if($last_end > 1){
      if($o{i}){
         print "$l\t$last_name~NA\tGAP\n" unless $last_end==$l;
      }else{
         print "$l\t$last_name\tFEAT\n";
      }
   }
   close $gff_fh;
   print STDERR " done.\n" unless $o{q};
}
127
+
@@ -0,0 +1,84 @@
#!/usr/bin/env ruby

#
# @author: Luis M. Rodriguez-R
# @update: Feb-06-2015
# @license: artistic license 2.0
#
# Copies a GenBank file to a new file, appending extra qualifier lines taken
# from a tab-delimited annotation table after every qualifier line whose name
# matches the key column of the table and whose value is a key in the table.
#

require 'optparse'

# Defaults: not quiet, first column as key, "#" as multi-value separator.
o = { q: false, k: 1, split: '#' }
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Adds annotations to GenBank files.

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-g", "--genbank FILE", "Input GenBank file.") { |v| o[:gb] = v }
  opts.on("-t", "--table FILE", "Input file containing the annotations. It must be a ",
          "tab-delimited raw table including a header row with ",
          "the names of the fields.") { |v| o[:table] = v }
  opts.on("-o", "--out FILE", "Output file containing the annotated GenBank.") { |v| o[:out] = v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-k", "--key NUMBER", "Key of the column to use as identifier. By default: #{o[:k]}") { |v| o[:k] = v.to_i }
  # The separator is stored as given (it used to overwrite the key column).
  opts.on("-s", "--split STRING", "String that separates multiple entries in the annotation features. By default: \"#{o[:split]}\"") { |v| o[:split] = v }
  opts.on("-q", "--quiet", "Run quietly (no STDERR output).") { o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-g is mandatory" if o[:gb].nil?
abort "-t is mandatory" if o[:table].nil?
abort "-o is mandatory" if o[:out].nil?

##### MAIN:
begin
  # Zero-based index of the key column (-k is one-based).
  key_idx = o[:k] - 1

  # Load the annotation table: header names plus one row per key value.
  puts "Reading annotation table: #{o[:table]}." unless o[:q]
  header = nil
  annot = {}
  File.open(o[:table], "r") do |ifh|
    header = ifh.gets.chomp.split(/\t/)
    puts " * using #{header[key_idx]} column as feature identifier." unless o[:q]
    ifh.each_line do |ln|
      row = ln.chomp.split(/\t/)
      id = row[key_idx]
      # Later rows replace earlier ones with the same key.
      warn "WARNING: #{header[key_idx]} #{id} found more than once." unless annot[id].nil?
      annot[id] = row
    end
  end
  puts " * found #{annot.size} annotation entries with #{header.size} fields." unless o[:q]

  # Qualifier lines look like `<spaces>/<name>="<value>"`; the name of the key
  # column is matched literally.
  key_re = /^(?<sp>\s+)\/#{Regexp.escape(header[key_idx].to_s)}="(?<id>.+)"/

  puts "Annotating GenBank." unless o[:q]
  found = 0
  notfound = 0
  File.open(o[:gb], "r") do |ifh|
    File.open(o[:out], "w") do |ofh|
      ifh.each_line do |ln|
        # Every input line is copied verbatim; annotations are appended after it.
        ofh.print ln
        m = key_re.match(ln)
        next if m.nil?
        row = annot[m[:id]]
        if row.nil?
          notfound += 1
          next
        end
        found += 1
        row.each_with_index do |value, i|
          next if i == key_idx || value == ""
          # NOTE: the separator is interpolated into a regular expression, so
          # regex metacharacters passed to -s are interpreted as such.
          value.split(/#{o[:split]}/).each { |v| ofh.puts "#{m[:sp]}/#{header[i]}=\"#{v}\"" }
        end
      end
    end
  end
  puts " * annotated #{found} features." unless o[:q]
  puts " * couldn't find #{notfound} features in the annotation table." unless o[:q] || notfound == 0
  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Signal the failure to the caller instead of exiting with status 0.
  exit 1
end
83
+
84
+
@@ -0,0 +1,351 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license artistic license 2.0
5
+
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
8
+ use 'tmpdir'
9
+ use 'zlib'
10
+
# Default values of the command-line options.
o = {
  bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
  archaea: false, genomeeq: false, metagenome: false, list: false,
  collection: 'dupont_2012'
}
OptionParser.new do |opts|
  opts.banner = "
Finds and extracts a collection of essential proteins suitable for genome
completeness evaluation and phylogenetic analyses. Important note: most complete
bacterial genomes contain only 106/111 genes in this collection, therefore
producing a completeness of 95.5%, and most archaeal genomes only contain 26/111
genes, producing a completeness of 23.4%. Use the options --bacteria and/or
--archaea to ignore models often missing in one or both domains. Note that even
with these options, some complete archaeal genomes result in very low values of
completeness (e.g., Nanoarchaeum equitans returns 88.5%).

Requires HMMer 3.0+ (http://hmmer.janelia.org/software).

Usage: #{$0} [options]"
  opts.separator ''
  opts.separator 'Mandatory'
  opts.on(
    '-i', '--in FILE',
    'Path to the FastA file (.gz allowed) with all the proteins in a genome'
  ) { |v| o[:in] = v }
  opts.separator ''
  opts.separator 'Options'
  opts.on(
    '-c', '--collection STR',
    'Reference collection of essential proteins to use. One of:',
    '> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
    ' modified by https://doi.org/10.1038/ismej.2015.5',
    '> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
    ' modified by https://doi.org/10.7717/peerj.1319'
  ) { |v| o[:collection] = v }
  opts.on(
    '-o', '--out FILE',
    'Path to the output FastA file with the translated essential genes',
    'By default the file is not produced'
  ) { |v| o[:out] = v }
  opts.on(
    '-m', '--per-model STR',
    'Prefix of translated genes in independent files with the name of the',
    'model appended. By default files are not produced'
  ) { |v| o[:permodel] = v }
  opts.on(
    '-R', '--report FILE',
    'Path to the report file. By default, the report is sent to the STDOUT'
  ) { |v| o[:report] = v }
  opts.on(
    '--hmm-out FILE',
    'Save HMMsearch output in this file. By default, not saved'
  ) { |v| o[:hmmout] = v }
  opts.on(
    '--alignments FILE',
    'Save the aligned proteins in this file. By default, not saved'
  ) { |v| o[:alignments] = v }
  opts.on(
    '-B', '--bacteria',
    'If set, ignores models typically missing in Bacteria'
  ) { |v| o[:bacteria] = v }
  opts.on(
    '-A', '--archaea',
    'If set, ignores models typically missing in Archaea'
  ) { |v| o[:archaea] = v }
  opts.on(
    '-G', '--genome-eq',
    'If set, ignores models not suitable for genome-equivalents estimations',
    'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
  ) { |v| o[:genomeeq] = v }
  opts.on(
    '-r', '--rename STR',
    'If set, renames the sequences with the string provided and appends it',
    'with pipe and the gene name (except in --per-model files)'
  ) { |v| o[:rename] = v }
  opts.on(
    '-n', '--no-stats',
    'If set, no statistics are reported on genome evaluation'
  ) { |v| o[:stats] = v }
  opts.on(
    '-s', '--no-genes',
    'If set, statistics won\'t include the lists of missing/multi-copy genes'
  ) { |v| o[:genes] = v }
  opts.on(
    '-M', '--metagenome',
    'If set, it allows for multiple copies of each gene and turns on',
    'metagenomic report mode'
  ) { |v| o[:metagenome] = v }
  opts.separator ''
  opts.separator 'Other Options'
  opts.on(
    '-L', '--list-models',
    'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
    'and -q; ignores all other parameters'
  ) { |v| o[:list] = v }
  opts.on(
    '-b', '--bin DIR',
    'Path to the directory containing the binaries of HMMer 3.0+'
  ) { |v| o[:bin] = v }
  # The FILE placeholder makes this option take a value; without it the
  # switch is a plain flag and the block receives `true` instead of a path.
  opts.on(
    '--model-file FILE',
    'External file containing models to search'
  ) { |v| o[:model_file] = v }
  opts.on(
    '-t', '--threads INT', Integer,
    "Number of parallel threads to be used. By default: #{o[:thr]}"
  ) { |v| o[:thr] = v }
  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
  opts.separator ''
end.parse!

# The input file is only optional when just listing the models.
abort '-i is mandatory' if o[:in].nil? && !o[:list]
# Turn a non-empty binaries directory into a prefix ending in a slash.
o[:bin] = o[:bin] + '/' unless o[:bin].empty?
# Renaming is disabled in metagenome mode.
o[:rename] = nil if o[:metagenome]
128
+
# Per-collection lists of model names: those typically missing in Archaea,
# those typically missing in Bacteria, and those not suitable for
# genome-equivalents estimations. An unknown collection name is an error.
not_in_archaea, not_in_bacteria, not_as_genomeeq =
  case o[:collection]
  when 'dupont_2012'
    [
      %w[GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
         TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
         TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
         TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
         TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
         TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
         TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
         TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
         TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
         TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
         TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
         TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
         TIGR02729 TIGR03263 TIGR03594],
      %w[TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387],
      %w[TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
         TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d]
    ]
  when 'lee_2019'
    missing_in_archaea = %w[
      ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
      EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
      Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
      Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
      Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
      Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
      RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC
    ]
    missing_in_bacteria = %w[
      AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
      CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
      DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
      Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
      Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
      Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
      Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
      Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I
    ]
    # For this collection the genome-equivalents exclusion list is the union
    # (concatenation) of both domain-specific lists.
    [missing_in_archaea, missing_in_bacteria,
     missing_in_archaea + missing_in_bacteria]
  else
    raise "Unsupported collection: '#{o[:collection]}'"
  end
167
+
168
+ begin
169
+ Dir.mktmpdir do |dir|
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
# Gzipped input is expanded into the temporary directory and o[:in] is
# repointed to the plain-text copy, which is what the later steps read.
# Only a trailing `.gz` triggers decompression (a `.gz` elsewhere in the path
# must not), and o[:in] may be nil when only listing models.
if o[:in].to_s.end_with?('.gz')
  tmp_in = File.expand_path('sequences.fa', dir)
  Zlib::GzipReader.open(o[:in]) do |ifh|
    # Stream in chunks instead of loading the whole decompressed file in memory.
    File.open(tmp_in, 'w') { |ofh| IO.copy_stream(ifh, ofh) }
  end
  o[:in] = tmp_in
end
178
+
# Create database.
# Copies the model file (bundled or user-provided, optionally gzipped) into
# the temporary directory as essential.hmm while collecting a
# model ID => description map from its NAME and DESC lines.
$stderr.puts 'Searching models.' unless o[:q]
models = {}
model_id = nil
o[:model_file] ||= File.expand_path(
  "../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
model_reader =
  File.extname(o[:model_file]) == '.gz' ? Zlib::GzipReader : File
# Block forms guarantee both handles are closed even if parsing raises.
File.open("#{dir}/essential.hmm", 'w') do |dbh|
  model_reader.open(o[:model_file]) do |mfh|
    mfh.each_line do |ln|
      dbh.print ln
      ln.chomp!
      model_id = $1 if ln =~ /^NAME\s+(.+)/
      # A model is only registered once its DESC line is seen.
      models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
    end
  end
end

# Drop the models excluded by the selected modifiers.
models.delete_if { |id, _desc| not_in_archaea.include? id } if o[:archaea]
models.delete_if { |id, _desc| not_in_bacteria.include? id } if o[:bacteria]
models.delete_if { |id, _desc| not_as_genomeeq.include? id } if o[:genomeeq]

# Listing mode: print the surviving models as ID<TAB>description and stop.
if o[:list]
  models.each_pair { |id, desc| puts [id, desc].join("\t") }
  exit
end
204
+
# Check HMMer version and run HMMsearch.
# Both invocations pass arguments as an array (no shell), so paths containing
# quotes or other shell metacharacters are handled safely.
hmmsearch = "#{o[:bin]}hmmsearch"
banner =
  begin
    IO.popen([hmmsearch, '-h'], &:read)
  rescue SystemCallError
    # Missing or non-executable binary: fall through to the version error.
    ''
  end
if banner.lines[1] !~ /HMMER 3/
  raise 'You have provided an unsupported version of HMMER. ' +
    'This script requires HMMER 3.0+.'
end
o[:hmmout] ||= "#{dir}/hmmsearch"
# Standard output goes to a log file in the temporary directory; the tabular
# hits (--tblout) and the alignments (-A) are the files parsed afterwards.
succeeded = system(
  hmmsearch, '--cpu', o[:thr].to_s, '--tblout', o[:hmmout],
  '-A', "#{dir}/a.sto", '--cut_tc', '--notextw',
  "#{dir}/essential.hmm", o[:in],
  out: "#{dir}/hmmsearch.log"
)
# Fail loudly instead of parsing a missing or truncated results table.
raise "hmmsearch failed (#{$?})" unless succeeded
214
+
# Parse output
# Reads the tabular hmmsearch results: column 1 is the matched sequence and
# column 3 is the model name. In metagenome mode every hit is kept per model;
# otherwise the first hit is kept and later ones are recorded in `trash`.
$stderr.puts 'Parsing results.' unless o[:q]
trash = []
genes = {}
File.foreach(o[:hmmout]) do |row|
  next if row.start_with?('#')
  fields = row.split(/\s+/)
  target = fields[0]
  model = fields[2]
  next unless models.key?(model)
  if o[:metagenome]
    (genes[model] ||= []) << target
  elsif genes[model].nil?
    genes[model] = target
  else
    trash << model
  end
end
234
+
# Report statistics
# Writes a summary to o[:report] (or standard output when no report path is
# set). Metagenome mode reports copy-number statistics per model; otherwise
# completeness and contamination percentages are reported.
if o[:stats]
  reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
  begin
    modifiers = [:bacteria, :archaea, :genomeeq]
      .map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
    reph.puts "! Collection: #{o[:collection]} #{modifiers}"
    reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size

    # Models without any hit, formatted as a "! "-prefixed multi-line list.
    missing = models.keys.reject { |m| genes.key?(m) }
    missing_list =
      ([''] + missing.map { |m| "#{m}: #{models[m]}." }).join("\n! ")

    if o[:metagenome]
      # Sorted copy numbers, one entry per model (zero for models never hit).
      copies = [0] * (models.size - genes.size) +
        genes.values.map(&:length).sort
      reph.printf "! Mean number of copies per model: %.3f.\n",
        copies.inject(0, :+).to_f / models.size
      # For an even count the median is the mean of the two central values,
      # i.e. the elements at mid - 1 and mid.
      mid = copies.size / 2
      median =
        if copies.empty?
          0.0
        elsif copies.size.even?
          (copies[mid - 1] + copies[mid]) / 2.0
        else
          copies[mid]
        end
      reph.printf "! Median number of copies per model: %.1f.\n", median
      if o[:genes] && genes.size != models.size
        reph.printf "! Missing genes: %s\n", missing_list
      end
    else
      reph.printf "! Completeness: %.1f%%.\n",
        100.0 * genes.size / models.size
      reph.printf "! Contamination: %.1f%%.\n",
        100.0 * trash.size / models.size
      if o[:genes]
        unless trash.empty?
          # `trash` holds one entry per extra copy, so the total number of
          # copies of a model is its count in `trash` plus the copy kept.
          extra = trash.each_with_object(Hash.new(0)) { |m, h| h[m] += 1 }
          reph.printf "! Multiple copies: %s\n",
            ([''] + extra.map { |m, n| "#{n + 1} #{m}: #{models[m]}." })
              .join("\n! ")
        end
        unless genes.size == models.size
          reph.printf "! Missing genes: %s\n", missing_list
        end
      end
    end
  ensure
    # Never close $stdout; only close a report file opened above.
    reph.close unless o[:report].nil?
  end
end
273
+
# Extract sequences
# Copies the FASTA records of the detected genes to o[:out] (all models in one
# file) and/or to one file per model named "#{o[:permodel]}<model>.faa".
unless o[:out].nil? && o[:permodel].nil?
  $stderr.puts 'Extracting sequences.' unless o[:q]

  # Reverse index (sequence ID => model) built once, so each FASTA header is
  # resolved with a hash lookup instead of a scan over all models and hits.
  # `genes` maps a model to one ID, or to an array of IDs in metagenome mode;
  # when an ID appears under several models the first model wins.
  model_of = {}
  genes.each do |model, hits|
    Array(hits).each { |id| model_of[id] ||= model }
  end

  File.open(o[:in], 'r') do |faah|
    outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
    geneh = nil
    begin
      # Truncate the per-model files up front because they are appended to.
      unless o[:permodel].nil?
        genes.each_key do |m|
          File.open("#{o[:permodel]}#{m}.faa", 'w').close
        end
      end

      model = nil # model of the record being read; nil means "skip it"
      faah.each_line do |ln|
        if ln =~ /^>(\S+)/
          model = model_of[$1]
          next if model.nil?
          unless o[:permodel].nil?
            geneh.close unless geneh.nil?
            geneh = File.open("#{o[:permodel]}#{model}.faa", 'a+')
          end
          # With a rename label, headers are rewritten: "label|model" in the
          # combined file and just "label" in the per-model file.
          outh.print(o[:rename].nil? ?
            ln : ">#{o[:rename]}|#{model}\n") unless outh.nil?
          geneh.print(o[:rename].nil? ?
            ln : ">#{o[:rename]}\n") unless geneh.nil?
        elsif !model.nil?
          outh.print ln unless outh.nil?
          geneh.print ln unless geneh.nil?
        end
      end
    ensure
      # Release output handles even if reading or writing fails midway.
      geneh.close unless geneh.nil? || geneh.closed?
      outh.close unless outh.nil?
    end
  end
end
313
+
# Export alignments
# Condenses the Stockholm output of hmmsearch into up to three lines per
# model: a header comment, the reference (RF) annotation, and the first
# aligned sequence, all uppercased. Columns where that sequence has '.'
# are removed from both the sequence and the RF line.
unless o[:alignments].nil?
  aln = {}
  cur_model = nil
  gap_cols = []
  File.foreach("#{dir}/a.sto") do |raw|
    case raw.chomp
    when /^# STOCKHOLM/
      # A new alignment starts: forget the previous model and its gap columns.
      cur_model = nil
      gap_cols = []
    when /^#=GS (\S+)\/([\d\-]+)\s+DE/
      cur_model ||= (genes.rassoc($1) || []).first
      aln[cur_model] ||= ["# #{cur_model} : #{$1} : #{$2}"]
    when /^#=GC RF\s+(\S+)/
      entry = aln[cur_model]
      entry[1] ||= begin
        reference = $1.upcase
        # gap_cols is in descending order, so earlier deletions do not shift
        # the positions still to be removed.
        gap_cols.each { |col| reference[col] = '' }
        reference
      end
    when /^[^#]\S*\s+(\S+)/
      entry = aln[cur_model]
      # Only the first sequence of each model is kept.
      next if entry[2]
      seq = $1.upcase
      entry[2] = seq
      gap_cols = (0...seq.size).select { |col| seq[col] == '.' }.reverse
      seq.delete!('.') unless gap_cols.empty?
    end
  end
  File.open(o[:alignments], 'w') do |out|
    aln.each_value { |rows| rows.each { |row| out.puts row } }
  end
end
344
+
345
+ $stderr.puts 'Done.' unless o[:q]
346
+ end # |dir|
347
+ rescue => err
348
+ $stderr.puts "Exception: #{err}\n\n"
349
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
350
+ err
351
+ end