miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,293 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'enveomics_rb/stats'
4
+ require 'fileutils'
5
+ require 'shellwords'
6
+ require 'tmpdir'
7
+ require 'zlib'
8
+
9
module Enveomics
  # Wrapper class for ANIr estimation
  #
  # Use as: +ANIr.new(opts).go!+
  class ANIr
    # Options hash
    attr_reader :opts

    # Identities list (unsorted)
    attr_reader :identities

    # Stores the options Hash +opts+ (with Symbol keys) and starts with an
    # empty list of identities
    def initialize(opts)
      @opts = opts
      @identities = []
    end

    # --------------------------------------------------[ High-level pipelines ]

    # Perform all the analyses: read the input, detect the identity threshold,
    # and estimate ANIr
    def go!
      read_input
      detect_identity
      estimate_ani_r
    end

    # Identify input/output mode and read mapping
    #
    # Unless +opts[:m_format]+ is +:list+, a temporary directory is created,
    # the mapping is executed if the file +opts[:m]+ doesn't exist yet, and the
    # contigs to filter are loaded if +opts[:g]+ was given. The mapping is then
    # parsed by the +read_mapping_from_*+ method matching +opts[:m_format]+.
    #
    # Raises Enveomics::OptionError if the mapping format is not supported.
    # The temporary directory is always removed before returning
    def read_input
      if opts[:m_format] != :list
        @tmpdir = Dir.mktmpdir
        @filter_contigs = !opts[:g].nil?
        opts[:m] = File.join(@tmpdir, 'map.sam') if opts[:m].nil?
        run_mapping unless File.exist? opts[:m]
        load_contigs_to_filter if @filter_contigs
      end
      read_mapping = :"read_mapping_from_#{opts[:m_format]}"
      raise Enveomics::OptionError.new(
        "Unsupported mapping format: #{opts[:m_format]}"
      ) unless respond_to? read_mapping
      @identities = []
      send(read_mapping)
      say "- Unfiltered average identity: #{sample.mean}"
      say "- Reads mapped: #{sample.n}"
      save_identities
      save_histogram
    ensure
      # Define the variable if the :list branch skipped it, then clean up
      @tmpdir ||= nil
      FileUtils.rm_rf @tmpdir if @tmpdir
    end

    # Identify the identity threshold
    #
    # With +opts[:algorithm]+ set to +:auto+, the algorithm becomes +:gmm+ if
    # the bimodality coefficient reaches +opts[:bimodality]+, or +:fix+
    # otherwise. Only +:gmm+ triggers any additional detection
    def detect_identity
      say 'Detecting identity threshold'
      if opts[:algorithm] == :auto
        say "- Bimodality: #{bimodality}"
        opts[:algorithm] = bimodality >= opts[:bimodality] ? :gmm : :fix
      end
      say "- Algorithm: #{opts[:algorithm]}"
      detect_identity_by_gmm if opts[:algorithm] == :gmm
    end

    # Estimate ANIr: discards identities below +opts[:identity]+ (in place) and
    # reports the mean of the remaining ones
    def estimate_ani_r
      say 'Estimating ANIr'
      @sample = nil # Empty cached sample
      @identities.delete_if { |i| i < opts[:identity] }
      say "- ANIr: #{sample.mean}"
    end

    # -----------------------------------------------------------------[ Utils ]

    # Show progress unless +opts[:q]+, and append it to the file +opts[:log]+
    # if set
    def say(*msg)
      o = '[%s] %s' % [Time.now, msg.join('')]
      $stderr.puts(o) unless opts[:q]
      File.open(opts[:log], 'a') { |fh| fh.puts o } if opts[:log]
    end

    # Execute the command +cmd+ (Array of arguments), appending its combined
    # STDOUT and STDERR to the file +opts[:log]+ if set
    #
    # The command is executed directly (without a shell), so the exit status
    # evaluated is that of the command itself and not that of a trailing
    # pipeline member, and no path needs to be shell-escaped.
    #
    # Raises Enveomics::CommandError if the command cannot be launched or if
    # it exits with non-zero status
    def run(cmd)
      require 'open3' # Lazily loaded, only required by this method

      cmd = cmd.map(&:to_s)
      say " - Running: #{cmd.join(' ')}"
      begin
        # The [path, argv0] form guarantees that no shell is ever involved,
        # even for commands without arguments
        output, status = Open3.capture2e([cmd.first, cmd.first], *cmd.drop(1))
      rescue SystemCallError => e
        raise Enveomics::CommandError.new("#{cmd.first} failed: #{e.message}")
      end
      File.open(opts[:log], 'a') { |fh| fh.write(output) } if opts[:log]
      unless status.success?
        raise Enveomics::CommandError.new("#{cmd.first} failed: #{status}")
      end
    end

    # Returns an open file handler for the file, supporting .gz
    #
    # If a block is passed, the handler is yielded to the block, closed once
    # the block terminates (even on error), and the value of the block is
    # returned instead
    def reader(file, &blk)
      if file =~ /\.gz$/
        Zlib::GzipReader.open(file, &blk)
      else
        File.open(file, 'r', &blk)
      end
    end

    # Is the mapping in SAM format?
    def sam?
      opts[:m_format] == :sam
    end

    # ------------------------------------------------------------[ Map it out ]

    # Execute Bowtie2 and generate SAM file
    #
    # Indexes the genome +opts[:g]+ in the temporary directory and maps the
    # reads +opts[:r]+ against it, saving the mapping in +opts[:m]+. Contig
    # filtering is turned off, since the mapping is generated against the
    # genome itself.
    #
    # Raises Enveomics::OptionError if the mapping format is not SAM, the
    # genome format is not FastA, or the reads type is not supported
    def run_mapping
      say 'Running mapping using Bowtie2'
      raise Enveomics::OptionError.new(
        'Only SAM output is supported for mapping'
      ) unless sam?

      @filter_contigs = false
      say '- Indexing input sequences'
      raise Enveomics::OptionError.new(
        'Only FastA genome input is supported for mapping'
      ) unless opts[:g_format] == :fasta

      idx = File.join(@tmpdir, 'genome.idx')
      run(['bowtie2-build', opts[:g], idx])

      say '- Mapping metagenomic reads to genome assembly'
      cmd = [
        'bowtie2', '-x', idx, '-p', opts[:threads].to_s, '-S', opts[:m],
        '--no-mixed'
      ]
      cmd << '-f' if opts[:r_format] == :fasta
      cmd +=
        case opts[:r_type]
        when :single
          ['-U', opts[:r]]
        when :coupled
          pairs = opts[:r].split(',', 2)
          ['-1', pairs[0], '-2', pairs[1], '--no-discordant']
        when :interleaved
          ['--interleaved', opts[:r], '--no-discordant']
        else
          raise Enveomics::OptionError.new(
            "Unsupported reads type: #{opts[:r_type]}"
          )
        end
      run(cmd)
    end

    # If +@filter_contigs+ is true, reads the genome assembly and saves contig
    # names to filter the mapping
    #
    # For FastA the names are the first word of each defline, for lists they
    # are the complete lines. Raises Enveomics::OptionError for any other
    # value of +opts[:g_format]+
    def load_contigs_to_filter
      return unless @filter_contigs

      say 'Loading contigs to filter'
      @contigs_to_filter =
        reader(opts[:g]) do |fh|
          case opts[:g_format]
          when :fasta
            fh.map { |ln| $1 if ln =~ /^>(\S+)/ }.compact
          when :list
            fh.map(&:chomp)
          else
            raise Enveomics::OptionError.new(
              "Unsupported genome assembly format: #{opts[:g_format]}"
            )
          end
        end
      say "- Got #{@contigs_to_filter.size} contigs"
    end

    # Reads the mapping file assuming SAM format
    def read_mapping_from_sam
      say 'Reading mapping from SAM file'
      reader(opts[:m]) do |fh|
        fh.each { |ln| parse_sam_line(ln) }
      end
    end

    # Reads the mapping file assuming BAM format, using +samtools view+
    #
    # Raises Enveomics::CommandError if samtools cannot be launched or exits
    # with non-zero status
    def read_mapping_from_bam
      say 'Reading mapping from BAM file'
      begin
        IO.popen(['samtools', 'view', opts[:m]]) do |fh|
          fh.each { |ln| parse_sam_line(ln) }
        end
      rescue SystemCallError => e
        raise Enveomics::CommandError.new("samtools failed: #{e.message}")
      end
      # IO.popen with a block sets $? once the child process terminates
      unless $?.success?
        raise Enveomics::CommandError.new("samtools failed: #{$?}")
      end
    end

    # Reads the mapping file assuming a Tabular BLAST format
    #
    # The subject is taken from the second column and the identity from the
    # third column
    def read_mapping_from_tab
      say 'Reading mapping from Tabular BLAST file'
      reader(opts[:m]) do |fh|
        fh.each do |ln|
          next if ln =~ /^\s*(#.*)?$/ # Comment or empty line

          row = ln.chomp.split("\t")
          next if @filter_contigs && !@contigs_to_filter.include?(row[1])

          @identities << row[2].to_f
        end
      end
    end

    # Reads the identities from a raw-text list (one identity per line)
    def read_mapping_from_list
      say 'Reading identities from raw text list'
      @identities = reader(opts[:m]) { |fh| fh.map(&:to_f) }
    end

    # Parses one line in SAM format and registers its identity
    #
    # Headers, empty lines, unmapped reads, reads mapped to contigs excluded
    # by the filter, and alignments with a YT flag other than CP or UU are
    # ignored. The identity is estimated from the length of the read sequence
    # and the number of non-digit characters in the MD flag.
    #
    # Raises Enveomics::ParseError if the line is truncated or has no MD flag
    def parse_sam_line(ln)
      return if ln =~ /^@/ || ln =~ /^\s*$/

      row = ln.chomp.split("\t")
      return if row[2] == '*'
      return if @filter_contigs && !@contigs_to_filter.include?(row[2])

      if row.size < 11
        raise Enveomics::ParseError.new(
          "SAM line with fewer than 11 columns:\n#{ln}"
        )
      end
      length = row[9].size
      row.shift(11) # Discard non-flag columns
      # Flags come as TAG:TYPE:VALUE, the type is dropped
      flags = Hash[row.map { |i| i.sub(/:.:/, ':').split(':', 2) }]
      return if flags['YT'] && !%w[CP UU].include?(flags['YT'])

      unless flags['MD']
        raise Enveomics::ParseError.new(
          "SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
        )
      end
      # NOTE(review): every non-digit character counts as a mismatch,
      # including the '^' marker and the bases of deletions - confirm that
      # this is the intended definition
      mismatches = flags['MD'].scan(/[^\d]/).count
      @identities << 100.0 * (length - mismatches) / length
    end

    # Save identities as raw text in the file +opts[:L]+ (if set)
    def save_identities
      return unless opts[:L]

      say '- Saving identities'
      File.open(opts[:L], 'w') do |fh|
        identities.each { |i| fh.puts i }
      end
    end

    # Save identity histogram as tab-delimited raw text in the file
    # +opts[:H]+ (if set)
    def save_histogram
      return unless opts[:H]

      say '- Saving histogram'
      File.open(opts[:H], 'w') do |fh|
        fh.puts "from\tto\tcount"
        sample.histo_ranges.each_with_index do |r, k|
          fh.puts((r + [sample.histo_counts[k]]).join("\t"))
        end
      end
    end

    # -----------------------------------------------------------[ Peak finder ]

    # Detect identity threshold by gaussian mixture model EM
    def detect_identity_by_gmm
      model_identities_by_gmm_em
      detect_valley_by_gmm
    end

    # Model identities as a 2-gaussian mix by EM
    #
    # Currently always raises Enveomics::UnimplementedError
    def model_identities_by_gmm_em
      say 'Modeling identities by gaussian mixture model using EM'
      # TODO: Implement
      raise Enveomics::UnimplementedError.new('Unimplemented operation')
    end

    # Detect valley by gaussian mix
    #
    # Currently always raises Enveomics::UnimplementedError
    def detect_valley_by_gmm
      say 'Detecting valley by gaussian mixture model'
      # TODO: Implement
      raise Enveomics::UnimplementedError.new('Unimplemented operation')
    end

    # -----------------------------------------------------------[ Do the math ]

    # Identities as a Enveomics::Stats::Sample object (cached)
    def sample
      @sample ||= Enveomics::Stats::Sample.new(
        identities,
        effective_range: [nil, 100.0],
        histo_bin_size: opts[:bin_size]
      )
    end

    # Returns the bimodality coefficient indicated by +opts[:coefficient]+
    # (cached), or raises Enveomics::OptionError if it's not +:sarle+ or +:dma+
    def bimodality
      @bimodality ||=
        case opts[:coefficient]
        when :sarle
          sample.sarle_bimodality
        when :dma
          sample.dma_bimodality
        else
          raise Enveomics::OptionError.new(
            "Unsupported coefficient of bimodality: #{opts[:coefficient]}"
          )
        end
    end
  end
end
@@ -0,0 +1,175 @@
1
+
2
+ require 'enveomics_rb/enveomics'
3
+ require 'enveomics_rb/match'
4
+ use 'tmpdir'
5
+ use 'shellwords'
6
+
7
module Enveomics
  ##
  # Set of best matches between the sequences in a query and a subject file
  class BMset
    # Paths to the query (+qry+) and subject (+sbj+) sequences
    attr_reader :qry, :sbj

    ##
    # Initialize Enveomics::BMset object with sequence paths +qry+ and +sbj+,
    # and options Hash +opts+ (see #opt for supported options) with Symbol keys
    def initialize(qry, sbj, opts = {})
      @qry = qry
      @sbj = sbj
      @set = nil
      @opt = opts
    end

    ##
    # Returns option with key +k+ as defined by #initialize or by default
    # Supported options include [defaults in brackets]:
    # - len [0]: Minimum alignment length in residues
    # - id [0.0]: Minimum alignment identity in percent
    # - fract [0.0]: Minimum alignment length as fraction of the query
    # - score [0.0]: Minimum alignment score in bits
    # - nucl [false]: The sequences are in nucleotides
    # - thr [1]: Number of threads to use
    # - bin ['']: Path to the directory containing binaries
    # - program [:blast+]: Search engine to use
    #
    # Note that unset options are filled with their defaults in the options
    # Hash passed to #initialize
    def opt(k)
      @defaults ||= {
        len: 0, id: 0.0, fract: 0.0, score: 0.0,
        nucl: false, thr: 1, bin: '', program: :'blast+'
      }
      k = k.to_sym
      @opt[k] = @defaults[k] if @opt[k].nil?
      @opt[k]
    end

    ##
    # Hash of Enveomics::Match objects keyed by query identifier. The search
    # is executed the first time this method is called
    def set
      match_and_filter! if @set.nil?
      @set
    end

    ##
    # Returns the best match of query +qry+ as Enveomics::Match or nil if
    # no qualifying match was found
    def [](qry)
      set[qry]
    end

    ##
    # Number of matches found
    def count
      set.count
    end

    ##
    # Execute search and filter matches, keeping for each query only the
    # highest-scoring match that satisfies the minimum +len+, +id+, +score+,
    # and +fract+ options
    def match_and_filter!
      @set = {}
      match!.each do |match|
        # Already a better match?
        best = @set[match.qry]
        next if best && best.score >= match.score

        # Is this a good enough match?
        next unless %i[len id score fract].all? do |metric|
          match.send(metric) >= opt(metric)
        end

        # Save match
        @set[match.qry] = match
      end
    end

    ##
    # Find all matches and return as an array of Enveomics::Match objects
    #
    # The search is executed in a temporary directory with the engine
    # indicated by the +program+ option (+:blast+, +:blast++, +:diamond+, or
    # +:blat+). Raises Enveomics::OptionError for any other engine, or if
    # diamond is requested for nucleotides. If any command fails, the error
    # log is reported in STDERR and the process exits with non-zero status.
    #
    # NOTE(review): +say+ and +run_cmd+ are not defined in this class,
    # presumably they are provided by the enveomics_rb utilities
    def match!
      y = []
      Dir.mktmpdir do |dir|
        # Determine commands
        say('Temporal directory: ', dir)
        db_path = File.join(dir, 'sbj.db')
        out_path = File.join(dir, 'out.tsv')
        cmds = []
        case opt(:program)
        when :blast
          cmds << [
            'formatdb', '-i', sbj, '-n', db_path, '-l', File.join(dir, 'log'),
            '-p', opt(:nucl) ? 'F' : 'T'
          ]
          cmds << [
            'blastall', '-p', opt(:nucl) ? 'blastn' : 'blastp', '-d', db_path,
            '-i', qry, '-v', '1', '-b', '1', '-a', opt(:thr).to_s, '-m', '8',
            '-o', out_path
          ]
        when :'blast+'
          cmds << [
            'makeblastdb', '-in', sbj, '-out', db_path,
            '-dbtype', opt(:nucl) ? 'nucl' : 'prot'
          ]
          cmds << [
            opt(:nucl) ? 'blastn' : 'blastp', '-db', db_path, '-query', qry,
            '-num_threads', opt(:thr).to_s, '-out', out_path, '-outfmt',
            '6 qseqid sseqid pident length mismatch gapopen qstart qend ' \
            'sstart send evalue bitscore qlen slen'
          ]
        when :diamond
          raise Enveomics::OptionError.new(
            'Unsupported search engine diamond for nucleotides'
          ) if opt(:nucl)
          cmds << [
            'diamond', 'makedb', '--in', sbj, '--db', db_path,
            '--threads', opt(:thr).to_s
          ]
          cmds << [
            'diamond', 'blastp', '--threads', opt(:thr).to_s,
            '--db', db_path, '--query', qry, '--daa', "#{out_path}.daa",
            '--quiet', '--sensitive'
          ]
          # Each output field is passed as an independent argument
          cmds << (
            [
              'diamond', 'view', '--daa', "#{out_path}.daa", '--out', out_path,
              '--quiet', '--outfmt'
            ] + %w[6 qseqid sseqid pident length mismatch gapopen qstart] +
              %w[qend sstart send evalue bitscore qlen slen]
          )
        when :blat
          cmds << ['blat', sbj, qry, '-out=blast8', out_path]
          cmds[0] << '-prot' unless opt(:nucl)
        else
          raise Enveomics::OptionError.new(
            "Unsupported search engine: #{opt(:program)}"
          )
        end

        # Run commands
        say('Running comparison')
        say('Query: ', qry)
        say('Subject: ', sbj)
        cmd_err = File.join(dir, 'err')
        begin
          cmds.each do |cmd|
            cmd[0] = File.join(opt(:bin), cmd[0]) unless opt(:bin) == ''
            run_cmd(cmd, stderr: cmd_err)
          end
        rescue Enveomics::CommandError => e
          $stderr.puts e
          $stderr.puts ''
          $stderr.puts '[ Error log ]'
          # The log may not exist if the command could not even be launched
          $stderr.puts File.read(cmd_err) if File.exist?(cmd_err)
          # Non-zero status, so callers can detect the failure
          exit 1
        end

        # Parse output
        File.open(out_path, 'r') do |fh|
          fh.each { |ln| y << Enveomics::Match.new(ln) }
        end
      end
      y
    end

    ##
    # Enumerate best matches and yield each Enveomics::Match to +blk+, or
    # return an Enumerator if no block is given
    def each(&blk)
      if block_given?
        set.each { |_, bm| blk.call(bm) }
      else
        to_enum(:each)
      end
    end
  end
end