miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,583 @@
1
+
2
+ # Use as:
3
+ # > # Estimate reference (null) model:
4
+ # > tab <- read.table('Ecoli-ML-dmatrix.txt', sep='\t', h=T, row.names=1)
5
+ # > dist <- as.dist(tab);
6
+ # > all.dist <- enve.tribs(dist);
7
+ # >
8
+ # > # Estimate subset (test) model:
9
+ # > lee <- read.table('LEE-strains.txt', as.is=T)$V1
10
+ # > lee.dist <- enve.tribs(dist, lee, subsamples=seq(0,1,by=0.05), threads=12,
11
+ # + verbosity=2, pre.tribs=all.dist.merge);
12
+ # ...
13
+ # >
14
+ # > # Plot reference and selection at different subsampling levels:
15
+ # > plot(all.dist, t='boxplot');
16
+ # > plot(lee.dist, new=FALSE, col='darkred');
17
+ # ...
18
+ # >
19
+ # > # Test significance of overclustering (or overdispersion):
20
+ # > lee.test <- enve.tribs.test(dist, lee, pre.tribs=all.dist.merge,
21
+ # + verbosity=2, threads=12);
22
+ # > summary(lee.test);
23
+ # > plot(lee.test);
24
+ # ...
25
+
26
+
27
+
28
+ #==============> Define S4 classes
29
+
30
#' Enveomics: TRIBS S4 Class
#'
#' Enve-omics representation of "Transformed-space Resampling In Biased Sets
#' (TRIBS)". This object represents sets of distances between objects,
#' sampled nearly-uniformly at random in "distance space". Subsampling
#' without selection is trivial, since both the distances space and the
#' selection occur in the same transformed space. However, it's useful to
#' compare randomly subsampled sets against a selected set of objects. This
#' is intended to identify overdispersion or overclustering (see
#' \code{\link{enve.TRIBStest}}) of a subset against the entire collection of
#' objects with minimum impact of sampling biases. This object can be produced
#' by \code{\link{enve.tribs}} and supports S4 methods \code{plot} and
#' \code{summary}.
#'
#' @slot distance \code{(numeric)} Centrality measurement of the distances
#' between the selected objects (without subsampling).
#' @slot points \code{(matrix)} Position of the different objects in distance
#' space.
#' @slot distances \code{(matrix)} Subsampled distances, where the rows are
#' replicates and the columns are subsampling levels.
#' @slot spaceSize \code{(numeric)} Number of objects.
#' @slot selSize \code{(numeric)} Number of selected objects.
#' @slot dimensions \code{(numeric)} Number of dimensions in the distance
#' space.
#' @slot subsamples \code{(numeric)} Subsampling levels (as fractions, from
#' 0 to 1).
#' @slot call \code{(call)} Call producing this object.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @exportClass enve.TRIBS

# NOTE: `@exportClass` needs the class name; a bare tag is discarded by
# roxygen2 and the class would not be listed in NAMESPACE.
enve.TRIBS <- setClass("enve.TRIBS",
  # `slots=` is the current spelling of the deprecated `representation()`;
  # slot names, order and classes are unchanged.
  slots = c(
    distance   = 'numeric',
    points     = 'matrix',
    distances  = 'matrix',
    spaceSize  = 'numeric',
    selSize    = 'numeric',
    dimensions = 'numeric',
    subsamples = 'numeric',
    call       = 'call'
  ),
  package = 'enveomics.R'
);
72
+
73
#' Enveomics: TRIBS Test S4 Class
#'
#' Test of significance of overclustering or overdispersion in a selected
#' set of objects with respect to the entire set (see
#' \code{\link{enve.TRIBS}}). This object can be produced by
#' \code{\link{enve.tribs.test}} and supports S4 methods \code{plot} and
#' \code{summary}.
#'
#' @slot pval.gt \code{(numeric)}
#' P-value for the overdispersion test.
#' @slot pval.lt \code{(numeric)}
#' P-value for the overclustering test.
#' @slot all.dist \code{(numeric)}
#' Empiric PDF of distances for the entire dataset (subsampled at selection
#' size).
#' @slot sel.dist \code{(numeric)}
#' Empiric PDF of distances for the selected objects (without subsampling).
#' @slot diff.dist \code{(numeric)}
#' Empiric PDF of the difference between \code{all.dist} and \code{sel.dist}.
#' The p-values are estimated by comparing areas in this PDF greater than and
#' lesser than zero.
#' @slot dist.mids \code{(numeric)}
#' Midpoints of the empiric PDFs of distances.
#' @slot diff.mids \code{(numeric)}
#' Midpoints of the empiric PDF of difference of distances.
#' @slot call \code{(call)}
#' Call producing this object.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @exportClass enve.TRIBStest

# NOTE: `@exportClass` needs the class name; a bare tag is discarded by
# roxygen2 and the class would not be listed in NAMESPACE.
enve.TRIBStest <- setClass("enve.TRIBStest",
  # `slots=` is the current spelling of the deprecated `representation()`;
  # slot names, order and classes are unchanged.
  slots = c(
    pval.gt   = 'numeric',
    pval.lt   = 'numeric',
    all.dist  = 'numeric',
    sel.dist  = 'numeric',
    diff.dist = 'numeric',
    dist.mids = 'numeric',
    diff.mids = 'numeric',
    call      = 'call'
  ),
  package = 'enveomics.R'
);
116
+
117
+ #==============> Define S4 methods
118
+
119
#' Enveomics: TRIBS Summary
#'
#' Prints a short text overview of an \code{\link{enve.TRIBS}} object: how many
#' objects were selected out of the total and in how many dimensions, how many
#' subsampling levels and replicates were collected, and the originating call.
#'
#' @param object
#' \code{\link{enve.TRIBS}} object.
#' @param ...
#' No additional parameters are currently supported.
#'
#' @return Called for its printed output; returns \code{NULL} invisibly.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @method summary enve.TRIBS
#' @export

summary.enve.TRIBS <- function(object, ...) {
  rule <- '------------------------------------------\n';

  # Slots are read through attr(), as S4 slots are stored as attributes.
  n.selected <- attr(object, 'selSize');
  n.objects <- attr(object, 'spaceSize');
  n.dims <- attr(object, 'dimensions');
  n.levels <- length(attr(object, 'subsamples'));
  n.replicates <- nrow(attr(object, 'distances'));
  # A call coerced to character yields the function name followed by each
  # argument, which cat() then joins with spaces.
  call.words <- as.character(attr(object, 'call'));

  cat('===[ enve.TRIBS ]-------------------------\n');
  cat('Selected', n.selected, 'of', n.objects, 'objects in', n.dims,
    'dimensions.\n');
  cat('Collected', n.levels, 'subsamples with', n.replicates,
    'replicates each.\n');
  cat(rule);
  cat('call:', call.words, '\n');
  cat(rule);
}
147
+
148
#' Enveomics: TRIBS Plot
#'
#' Plot an \code{\link{enve.TRIBS}} object. The X-axis is the subsample size
#' (subsampling fraction times the number of selected objects) and the Y-axis
#' is the subsampled distance. A dotted horizontal line marks the distance of
#' the full selection.
#'
#' @param x
#' \code{\link{enve.TRIBS}} object to plot.
#' @param new
#' Should a new canvas be drawn?
#' @param type
#' Type of plot. The \strong{points} plot shows all the replicates, the
#' \strong{boxplot} plot represents the values found by
#' \code{\link[grDevices]{boxplot.stats}} as areas, and plots the outliers as
#' points.
#' @param col
#' Color of the areas and/or the points.
#' @param pt.cex
#' Size of the points.
#' @param pt.pch
#' Points character.
#' @param pt.col
#' Color of the points.
#' @param ln.col
#' Color of the lines.
#' @param ...
#' Any additional parameters supported by \code{plot}. Only used when
#' \code{new} is \code{TRUE}.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @method plot enve.TRIBS
#' @export

plot.enve.TRIBS <- function(
    x,
    new = TRUE,
    type = c('boxplot', 'points'),
    col = '#00000044',
    pt.cex = 1/2,
    pt.pch = 19,
    pt.col = col,
    ln.col = col,
    ...) {
  type <- match.arg(type);
  subsamples <- attr(x, 'subsamples');
  sel.size <- attr(x, 'selSize');
  distances <- attr(x, 'distances');
  # Iterate with seq_len() so that zero subsampling levels means zero
  # iterations (1:0 would visit the non-existent columns 1 and 0).
  levels.idx <- seq_len(ncol(distances));
  # X position of each subsampling level, expressed as number of objects.
  level.x <- round(subsamples * sel.size);

  if(new){
    # Empty canvas spanning all subsample sizes and observed distances.
    # `type` is spelled out instead of relying on partial matching of `t`.
    plot.opts <- list(xlim=range(subsamples) * sel.size,
      ylim=range(distances), ..., type='n', x=1);
    do.call(plot, plot.opts);
  }
  # Reference: distance of the complete selection (no subsampling).
  abline(h=attr(x, 'distance'), lty=3, col=ln.col);

  if(type == 'points'){
    replicates <- nrow(distances);
    for(i in levels.idx)
      points(rep(level.x[i], replicates), distances[, i], cex=pt.cex,
        pch=pt.pch, col=pt.col);
  }else{
    # Rows: 1-2 notch (confidence) limits, 3-4 whisker ends, 5-6 hinges,
    # 7 median; one column per subsampling level.
    stats <- matrix(NA, nrow=7, ncol=ncol(distances));
    for(i in levels.idx){
      b <- boxplot.stats(distances[, i]);
      points(rep(level.x[i], length(b$out)), b$out, cex=pt.cex, pch=pt.pch,
        col=pt.col);
      stats[, i] <- c(b$conf, b$stats[c(1, 5, 2, 4, 3)]);
    }
    # Each consecutive pair of rows (1-2, 3-4, 5-6) bounds one shaded band.
    for(i in c(1, 3, 5))
      polygon(c(level.x, rev(level.x)), c(stats[i, ], rev(stats[i + 1, ])),
        border=NA, col=col);
    lines(level.x, stats[7, ], col=ln.col, lwd=2);
  }
}
216
+
217
#' Enveomics: TRIBS Summary Test
#'
#' Prints a text summary of an \code{\link{enve.TRIBStest}} object: the
#' alternative hypothesis favoured by the data (overclustering or
#' overdispersion), its p-value with significance marks (\code{**} at 0.01,
#' \code{*} at 0.05), and the originating call.
#'
#' @param object
#' \code{\link{enve.TRIBStest}} object.
#' @param ...
#' No additional parameters are currently supported.
#'
#' @return Called for its printed output; returns \code{NULL} invisibly.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @method summary enve.TRIBStest
#' @export

summary.enve.TRIBStest <- function(object, ...) {
  rule <- '------------------------------------------\n';
  p.gt <- attr(object, 'pval.gt');
  p.lt <- attr(object, 'pval.lt');

  cat('===[ enve.TRIBStest ]---------------------\n');
  cat('Alternative hypothesis:\n');
  cat(' The distances in the selection are\n');
  # The reported hypothesis is the one with the smaller p-value.
  direction <- if(p.gt > p.lt){
    ' smaller than in the entire dataset\n (overclustering)\n'
  }else{
    ' larger than in the entire dataset\n (overdispersion)\n'
  };
  cat(direction);

  p.val <- min(p.gt, p.lt);
  if(p.val != 0){
    p.val.lim <- p.val;
    cat('\n P-value: ', signif(p.val, 4), sep='');
  }else{
    # An exact zero is reported as an upper bound instead: the smallest
    # positive value observed in the difference PDF.
    densities <- attr(object, 'diff.dist');
    p.val.lim <- min(densities[densities > 0]);
    cat('\n P-value <= ', signif(p.val.lim, 4), sep='');
  }

  # Index 1 = no mark, 2 = '*' (<= 0.05), 3 = '**' (<= 0.01).
  marks <- c("", "*", "**");
  stars <- marks[1 + (p.val.lim <= 0.05) + (p.val.lim <= 0.01)];
  cat(' ', stars, '\n', sep='');

  cat(rule);
  cat('call:', as.character(attr(object, 'call')), '\n');
  cat(rule);
}
258
+
259
#' Enveomics: TRIBS Plot Test
#'
#' Plots an \code{\link{enve.TRIBStest}} object.
#'
#' @param x
#' \code{\link{enve.TRIBStest}} object to plot.
#' @param type
#' What to plot. \code{overlap} generates a plot of the two contrasting
#' empirical PDFs (to compare against each other), \code{difference} produces
#' a plot of the differences between the empirical PDFs (to compare against
#' zero).
#' @param col
#' Main color of the plot if type=\code{difference}.
#' @param col1
#' First color of the plot if type=\code{overlap}.
#' @param col2
#' Second color of the plot if type=\code{overlap}.
#' @param ylab
#' Y-axis label.
#' @param xlim
#' X-axis limits. If not provided, the range of the midpoints being plotted
#' (distances for \code{overlap}, differences for \code{difference}).
#' @param ylim
#' Y-axis limits. If not provided, from zero to the maximum of the PDF(s)
#' being plotted.
#' @param ...
#' Any other graphical arguments.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @method plot enve.TRIBStest
#' @export

plot.enve.TRIBStest <- function(
    x,
    type = c('overlap', 'difference'),
    col = '#00000044',
    col1 = col,
    col2 = '#44001144',
    ylab = 'Probability',
    xlim = range(attr(x, 'dist.mids')),
    ylim = c(0, max(c(attr(x, 'all.dist'), attr(x, 'sel.dist')))),
    ...) {
  type <- match.arg(type);

  # Border colour: same RGB as the fill at a fixed 50% opacity. col2rgb()
  # returns channels in 0..255, so 255 (not 256) maps them onto rgb()'s 0..1.
  half.alpha <- function(fill){
    channels <- unname(col2rgb(fill)[, 1]) / 255;
    rgb(channels[1], channels[2], channels[3], 0.5)
  };
  # Draws an empirical PDF as a closed area, dropping to zero at the first
  # and last midpoints.
  draw.pdf <- function(mids, density, fill){
    bins <- length(mids);
    polygon(mids[c(1, seq_len(bins), bins)], c(0, density, 0), col=fill,
      border=half.alpha(fill));
  };

  if(type == 'overlap'){
    plot.opts <- list(xlim=xlim, ylim=ylim, ylab=ylab, ..., type='n', x=1);
    do.call(plot, plot.opts);
    draw.pdf(attr(x, 'dist.mids'), attr(x, 'all.dist'), col1);
    draw.pdf(attr(x, 'dist.mids'), attr(x, 'sel.dist'), col2);
  }else{
    # The signature defaults describe the distance PDFs; for the difference
    # PDF derive the limits from the difference data, but only when the
    # caller did not set them explicitly.
    if(missing(xlim)) xlim <- range(attr(x, 'diff.mids'));
    if(missing(ylim)) ylim <- c(0, max(attr(x, 'diff.dist')));
    plot.opts <- list(xlim=xlim, ylim=ylim, ylab=ylab, ..., type='n', x=1);
    do.call(plot, plot.opts);
    draw.pdf(attr(x, 'diff.mids'), attr(x, 'diff.dist'), col);
  }
}
321
+
322
#' Enveomics: TRIBS Merge
#'
#' Merges two \code{\link{enve.TRIBS}} objects generated from the same objects
#' at different subsampling levels.
#'
#' @param x
#' First \code{\link{enve.TRIBS}} object.
#' @param y
#' Second \code{\link{enve.TRIBS}} object.
#'
#' @return Returns an \code{\link{enve.TRIBS}} object with the union of the
#' subsampling levels of \code{x} and \code{y}, sorted increasingly. For levels
#' present in both objects, the values from \code{x} are kept.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @export

enve.TRIBS.merge <- function(x, y) {
  # Check consistency
  if (attr(x, 'distance') != attr(y, 'distance'))
    stop('Total distances in objects are different.')
  x.points <- attr(x, 'points')
  y.points <- attr(y, 'points')
  # Compare shapes first: `!=` on matrices of different dimensions fails
  # with an unrelated "non-conformable arrays" error.
  if (!identical(dim(x.points), dim(y.points)) || any(x.points != y.points))
    stop('Points in objects are different.')
  if (attr(x, 'spaceSize') != attr(y, 'spaceSize'))
    stop('Space size in objects are different.')
  if (attr(x, 'selSize') != attr(y, 'selSize'))
    stop('Selection size in objects are different.')
  if (attr(x, 'dimensions') != attr(y, 'dimensions'))
    stop('Dimensions in objects are different.')
  if (nrow(attr(x, 'distances')) != nrow(attr(y, 'distances')))
    stop('Replicates in objects are different.')

  # Merge: sort all subsampling levels and keep the first occurrence of each
  # (`order` is stable, so ties resolve in favor of `x`).
  all.subsamples <- c(attr(x, 'subsamples'), attr(y, 'subsamples'))
  keep <- order(all.subsamples)
  keep <- keep[!duplicated(all.subsamples[keep])]
  # drop = FALSE keeps a matrix even if a single level survives
  merged <- cbind(attr(x, 'distances'), attr(y, 'distances'))[
    , keep, drop = FALSE]

  new('enve.TRIBS',
      distance = attr(x, 'distance'), points = x.points,
      distances = merged, spaceSize = attr(x, 'spaceSize'),
      selSize = attr(x, 'selSize'), dimensions = attr(x, 'dimensions'),
      subsamples = all.subsamples[keep], call = match.call())
}
368
+
369
+ #==============> Define core functions
370
+
371
#' Enveomics: TRIBS Test
#'
#' Estimates the empirical difference between all the distances in a set of
#' objects and a subset, together with its statistical significance.
#'
#' @param dist
#' Distances as \code{dist} object.
#' @param selection
#' Selection defining the subset.
#' @param bins
#' Number of break points used to bin the range of distances.
#' @param ...
#' Any other parameters supported by \code{\link{enve.tribs}},
#' except \code{subsamples}.
#'
#' @return Returns an \code{\link{enve.TRIBStest}} object.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @export

enve.tribs.test <- function(dist, selection, bins = 50, ...) {
  # TRIBS on the selection (complete sample), then on the entire set
  # subsampled down to the same relative size as the selection.
  sel.tribs <- enve.tribs(dist, selection, subsamples = c(0, 1), ...)
  sel.fraction <- attr(sel.tribs, 'selSize') / attr(sel.tribs, 'spaceSize')
  all.tribs <- enve.tribs(dist, subsamples = c(0, sel.fraction), ...)

  # Second column = the non-zero subsampling level in both cases
  sel.values <- attr(sel.tribs, 'distances')[, 2]
  all.values <- attr(all.tribs, 'distances')[, 2]

  # Shared binning so both empirical PDFs are directly comparable
  limits <- range(c(sel.values, all.values))
  breaks <- seq(limits[1], limits[2], length.out = bins)
  all.hist <- hist(all.values, breaks = breaks, plot = FALSE)
  sel.hist <- hist(sel.values, breaks = breaks, plot = FALSE)
  p.all <- all.hist$counts / sum(all.hist$counts)
  p.sel <- sel.hist$counts / sum(sel.hist$counts)
  n.bins <- length(all.hist$mids)

  # Probability that both values fall in the same bin
  p.same <- 0
  for (k in seq_len(n.bins)) {
    p.same <- p.same + p.all[k] * p.sel[k]
  }

  # Probability that the bins differ by exactly `lag` positions:
  # p.above: entire set `lag` bins above the selection,
  # p.below: entire set `lag` bins below the selection.
  p.above <- numeric(n.bins)
  p.below <- numeric(n.bins)
  for (lag in seq_len(n.bins)) {
    k <- lag + 1
    while (k <= n.bins) {
      p.above[lag] <- p.above[lag] + p.all[k] * p.sel[k - lag]
      p.below[lag] <- p.below[lag] + p.all[k - lag] * p.sel[k]
      k <- k + 1
    }
  }

  mids.span <- diff(range(all.hist$mids))
  new('enve.TRIBStest',
      pval.gt = sum(c(p.same, p.above)), pval.lt = sum(c(p.same, p.below)),
      all.dist = p.all, sel.dist = p.sel,
      diff.dist = c(rev(p.below), p.same, p.above),
      dist.mids = all.hist$mids,
      diff.mids = seq(mids.span, -mids.span, length.out = 1 + 2 * n.bins),
      call = match.call())
}
431
+
432
#' Enveomics: TRIBS
#'
#' Subsample any objects in "distance space" to reduce the effect of
#' sample-clustering. This function was originally designed to subsample
#' genomes in "phylogenetic distance space", a clear case of strong
#' clustering bias in sampling, by Luis M. Rodriguez-R and Michael R
#' Weigand.
#'
#' @param dist
#' Distances as a \code{dist} object.
#' @param selection
#' Objects to include in the subsample. By default, all objects are
#' selected.
#' @param replicates
#' Number of replications per point.
#' @param summary.fx
#' Function to summarize the distance distributions in a given replicate. By
#' default, the median distance is estimated.
#' @param dist.method
#' Distance method between random points and samples in the transformed
#' space. See \code{dist}.
#' @param subsamples
#' Subsampling fractions.
#' @param dimensions
#' Dimensions to use in the NMDS. By default, 5\% of the selection length.
#' @param metaMDS.opts
#' Any additional options to pass to metaMDS, as \code{list}.
#' @param threads
#' Number of threads to use.
#' @param verbosity
#' Verbosity. Use 0 to run quietly, increase for additional information.
#' @param points
#' Optional. If passed, the MDS step is skipped and this object is used
#' instead. It can be the \code{$points} slot of class \code{metaMDS}
#' (from \code{vegan}).
#' It must be a matrix or matrix-coercible object, with samples as rows and
#' dimensions as columns.
#' @param pre.tribs
#' Optional. If passed, the points are recovered from this object (except if
#' \code{points} is also passed). This should be an \code{\link{enve.TRIBS}}
#' object estimated on the same objects (the selection is unimportant).
#'
#' @return Returns an \code{\link{enve.TRIBS}} object.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @export

enve.tribs <- function(
  dist,
  selection = labels(dist),
  replicates = 1000,
  summary.fx = median,
  dist.method = 'euclidean',
  subsamples = seq(0, 1, by = 0.01),
  dimensions = ceiling(length(selection) * 0.05),
  metaMDS.opts = list(),
  threads = 2,
  verbosity = 1,
  points,
  pre.tribs
) {
  if (!inherits(dist, 'dist'))
    stop('`dist` parameter must be a `dist` object.')

  # 1. NMDS (skipped if the coordinates are provided by the caller)
  if (missing(points)) {
    if (missing(pre.tribs)) {
      if (verbosity > 0) cat('===[ Estimating NMDS ]\n')
      if (!suppressPackageStartupMessages(
            requireNamespace("vegan", quietly = TRUE)))
        stop('Unavailable required package: `vegan`.')
      mds.args <- c(metaMDS.opts,
                    list(comm = dist, k = dimensions, trace = verbosity))
      points <- do.call(vegan::metaMDS, mds.args)$points
    } else {
      points <- attr(pre.tribs, 'points')
      dimensions <- ncol(points)
    }
  } else {
    points <- as.matrix(points)
    dimensions <- ncol(points)
  }

  # 2. Pad ranges: rescale every dimension so all points fall strictly
  #    inside the unit hypercube sampled by the worker function.
  if (verbosity > 0) cat('===[ Padding ranges ]\n')
  dots <- matrix(NA, nrow = nrow(points), ncol = dimensions,
                 dimnames = list(rownames(points), seq_len(dimensions)))
  # Only objects actually present in the coordinates can be sampled
  selection <- selection[!is.na(match(selection, rownames(dots)))]
  if (length(selection) == 0)
    stop('None of the selected objects were found in the points.')
  for (dim in seq_len(dimensions)) {
    # NOTE(review): the padding width is derived from the first dimension
    # for every axis -- confirm this is intended.
    dimRange <- range(points[, dim]) +
      c(-1, 1) * diff(range(points[, 1])) / length(selection)
    dots[, dim] <- (points[, dim] - dimRange[1]) / diff(dimRange)
  }

  # 3. Select points and summarize distances
  if (verbosity > 0) cat('===[ Sub-sampling ]\n')
  distances <- matrix(NA, nrow = replicates, ncol = length(subsamples),
                      dimnames = list(seq_len(replicates),
                                      as.character(subsamples)))
  sel.rows <- match(selection, rownames(dots))
  cl <- parallel::makeCluster(threads)
  # Release the workers even if a replicate fails
  on.exit(parallel::stopCluster(cl), add = TRUE)
  for (i in seq_along(subsamples)) {
    frx <- subsamples[i]
    if (verbosity > 1) cat('Sub-sampling at ', (frx * 100), '%\n', sep = '')
    # Columns are addressed by position: matching by name would write twice
    # to the same column if a fraction is repeated.
    distances[, i] <- parallel::parSapply(
      cl, seq_len(replicates), enve.__tribs, frx, sel.rows, dimensions, dots,
      dist.method, summary.fx, dist)
  }

  # 4. Build object and return
  new('enve.TRIBS',
      distance = do.call(summary.fx,
                         list(as.matrix(dist)[selection, selection])),
      points = points, distances = distances, spaceSize = nrow(points),
      selSize = length(selection), dimensions = dimensions,
      subsamples = subsamples, call = match.call())
}
544
+
545
#' Enveomics: TRIBS - Internal Ancillary Function
#'
#' Internal ancillary function (see \code{\link{enve.tribs}}). Draws uniform
#' random points in the unit hypercube, picks the selected object closest to
#' each one (with replacement), and summarizes the original distances among
#' the picked objects.
#'
#' @param rep Replicate index. Unused, it only allows mapping over replicates.
#' @param frx Fraction of the selection to draw.
#' @param selection Selection, as row indices (or names) of \code{dots}.
#' @param dimensions Number of dimensions of the sampling space.
#' @param dots Sampling points, one object per row.
#' @param dist.method Distance method between random points and \code{dots}.
#' @param summary.fx Summary function.
#' @param dist Distances as a \code{dist} object.
#'
#' @return The value of \code{summary.fx} on the sub-matrix of distances among
#' the picked objects, or \code{0} if the fraction results in no draws.
#'
#' @author Luis M. Rodriguez-R [aut, cre]
#'
#' @export

enve.__tribs <- function(
  rep, frx, selection, dimensions, dots, dist.method, summary.fx, dist
) {
  n.draws <- round(frx * length(selection))
  # Small fractions can round down to zero draws; `1:0` would have iterated
  # twice in that case, so it's treated just like `frx == 0`.
  if (frx == 0 || n.draws < 1) return(0)

  picked <- rep_len(selection[1], n.draws)
  for (i in seq_len(n.draws)) {
    rand.point <- runif(dimensions)
    # Distance from the random point to every selected object. The `dist`
    # argument is the distance object, hence the explicit `stats::`.
    dot.dist <- vapply(selection, function(dot) {
      as.numeric(stats::dist(
        matrix(c(rand.point, dots[dot, ]), nrow = 2, byrow = TRUE),
        method = dist.method))
    }, numeric(1), USE.NAMES = FALSE)
    # First minimum wins, as with a strict `<` scan
    picked[i] <- selection[which.min(dot.dist)]
  }
  do.call(summary.fx, list(as.matrix(dist)[picked, picked]))
}
582
+
583
+