miga-base 0.3.0.0 → 0.3.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (260) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +21 -4
  3. data/actions/init.rb +258 -0
  4. data/actions/run_local.rb +1 -2
  5. data/actions/test_taxonomy.rb +4 -1
  6. data/bin/miga +8 -1
  7. data/lib/miga/dataset.rb +4 -4
  8. data/lib/miga/dataset_result.rb +7 -4
  9. data/lib/miga/version.rb +2 -2
  10. data/scripts/_distances_noref_nomulti.bash +3 -1
  11. data/scripts/clade_finding.bash +1 -1
  12. data/scripts/init.bash +1 -1
  13. data/scripts/miga.bash +1 -1
  14. data/scripts/mytaxa.bash +78 -72
  15. data/scripts/mytaxa_scan.bash +67 -62
  16. data/scripts/ogs.bash +1 -1
  17. data/scripts/trimmed_fasta.bash +4 -3
  18. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  19. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  20. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  21. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  22. data/utils/enveomics/LICENSE.txt +73 -0
  23. data/utils/enveomics/Makefile +52 -0
  24. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  25. data/utils/enveomics/Manifest/Tasks/blasttab.json +703 -0
  26. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  27. data/utils/enveomics/Manifest/Tasks/fasta.json +571 -0
  28. data/utils/enveomics/Manifest/Tasks/fastq.json +208 -0
  29. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  30. data/utils/enveomics/Manifest/Tasks/ogs.json +339 -0
  31. data/utils/enveomics/Manifest/Tasks/other.json +746 -0
  32. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  33. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +454 -0
  34. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  35. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  36. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  37. data/utils/enveomics/Manifest/categories.json +132 -0
  38. data/utils/enveomics/Manifest/examples.json +154 -0
  39. data/utils/enveomics/Manifest/tasks.json +4 -0
  40. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  41. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
  42. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
  43. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
  44. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  45. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  46. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  47. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  48. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  49. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  50. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  51. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  52. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  53. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  54. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  55. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  56. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  57. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  58. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  59. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  60. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
  61. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  62. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  63. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  64. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  65. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  66. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  67. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  68. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  69. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  70. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  71. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  72. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  73. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  74. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  75. data/utils/enveomics/README.md +40 -0
  76. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  77. data/utils/enveomics/Scripts/Aln.cat.rb +162 -0
  78. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  79. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  80. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  81. data/utils/enveomics/Scripts/BlastTab.addlen.rb +61 -0
  82. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  83. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  84. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +106 -0
  85. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  86. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  87. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  88. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  89. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  90. data/utils/enveomics/Scripts/BlastTab.recplot2.R +40 -0
  91. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  92. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  93. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  94. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  95. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  96. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  97. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  98. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  99. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  100. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  101. data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
  102. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  103. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  104. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  105. data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
  106. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  107. data/utils/enveomics/Scripts/FastA.interpose.pl +87 -0
  108. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  109. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  110. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  111. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  112. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  113. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  114. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  115. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  116. data/utils/enveomics/Scripts/FastA.tag.rb +64 -0
  117. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  118. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  119. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  120. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  121. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  122. data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
  123. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  124. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  125. data/utils/enveomics/Scripts/HMM.essential.rb +254 -0
  126. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  127. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  128. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +306 -0
  129. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  130. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  131. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  132. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  133. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  134. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  135. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  136. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  137. data/utils/enveomics/Scripts/SRA.download.bash +50 -0
  138. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  139. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  140. data/utils/enveomics/Scripts/Table.barplot.R +30 -0
  141. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  142. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  143. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  144. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  145. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  146. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  147. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  148. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  149. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  150. data/utils/enveomics/Scripts/aai.rb +373 -0
  151. data/utils/enveomics/Scripts/ani.rb +362 -0
  152. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  153. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  154. data/utils/enveomics/Scripts/lib/data/essential.hmm.gz +0 -0
  155. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +26 -0
  156. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  157. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  158. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  159. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  160. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
  161. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  162. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  163. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  164. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  165. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  166. data/utils/enveomics/Scripts/ogs.rb +104 -0
  167. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  168. data/utils/enveomics/Scripts/rbm.rb +137 -0
  169. data/utils/enveomics/Tests/Makefile +10 -0
  170. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  171. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  172. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  173. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  174. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  175. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  176. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  177. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  178. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  179. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  180. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  181. data/utils/enveomics/Tests/alkB.nwk +1 -0
  182. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  183. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  184. data/utils/enveomics/Tests/hiv1.faa +59 -0
  185. data/utils/enveomics/Tests/hiv1.fna +134 -0
  186. data/utils/enveomics/Tests/hiv2.faa +70 -0
  187. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  188. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  189. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  190. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  191. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  192. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  193. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  194. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  195. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  196. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  197. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  198. data/utils/enveomics/build_enveomics_r.bash +44 -0
  199. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  200. data/utils/enveomics/enveomics.R/NAMESPACE +35 -0
  201. data/utils/enveomics/enveomics.R/R/autoprune.R +121 -0
  202. data/utils/enveomics/enveomics.R/R/barplot.R +165 -0
  203. data/utils/enveomics/enveomics.R/R/cliopts.R +119 -0
  204. data/utils/enveomics/enveomics.R/R/df2dist.R +117 -0
  205. data/utils/enveomics/enveomics.R/R/growthcurve.R +263 -0
  206. data/utils/enveomics/enveomics.R/R/recplot.R +320 -0
  207. data/utils/enveomics/enveomics.R/R/recplot2.R +745 -0
  208. data/utils/enveomics/enveomics.R/R/tribs.R +423 -0
  209. data/utils/enveomics/enveomics.R/R/utils.R +16 -0
  210. data/utils/enveomics/enveomics.R/README.md +52 -0
  211. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  212. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  213. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +30 -0
  214. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +43 -0
  215. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +19 -0
  216. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +37 -0
  217. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +24 -0
  218. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +24 -0
  219. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +33 -0
  220. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +64 -0
  221. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +37 -0
  222. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +19 -0
  223. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +18 -0
  224. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +26 -0
  225. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +25 -0
  226. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +26 -0
  227. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +49 -0
  228. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +28 -0
  229. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +97 -0
  230. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +40 -0
  231. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +40 -0
  232. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +24 -0
  233. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +40 -0
  234. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +18 -0
  235. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +22 -0
  236. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +20 -0
  237. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +18 -0
  238. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  239. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +27 -0
  240. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +53 -0
  241. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -0
  242. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +44 -0
  243. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +21 -0
  244. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +15 -0
  245. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  246. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  247. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +43 -0
  248. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +29 -0
  249. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +30 -0
  250. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +71 -0
  251. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +18 -0
  252. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +18 -0
  253. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +18 -0
  254. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +27 -0
  255. data/utils/enveomics/globals.mk +8 -0
  256. data/utils/enveomics/manifest.json +9 -0
  257. data/utils/index_metadata.rb +0 -0
  258. data/utils/plot-taxdist.R +0 -0
  259. data/utils/requirements.txt +19 -19
  260. metadata +242 -2
@@ -0,0 +1,105 @@
1
+ #!/bin/bash
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R
5
+ # @update Mar-23-2016
6
+ # @license artistic license 2.0
7
+ #
8
+
9
+ set -e # <- So it stops if there is an error
10
+ function exists { [[ -e "$1" ]] ; } # <- To test *any* of many files
11
+
12
+ ORG=$1 # <- Organism (see help)
13
+ THR=2 # <- Number of threads
14
+
15
+ # This is just the help message
16
+ if [[ "$ORG" == "" ]] ; then
17
+ echo "
18
+ Use case: Essential genes phylogeny of a species. The essential genes are a
19
+ collection of genes typically found in single copy in archaeal and bacterial
20
+ genomes
21
+
22
+ IMPORTANT
23
+ This script is functional, but it's mainly intended for illustrative purposes.
24
+ Please take a look at the code first.
25
+
26
+ Usage:
27
+ $0 <organism>
28
+
29
+ <organism> The organism to use (e.g., Streptococcus_pneumoniae).
30
+
31
+ " >&2
32
+ exit
33
+ fi
34
+
35
+ # 00. Create environment
36
+ export PATH=$(dirname $0)/../Scripts:$PATH
37
+ if [[ -e $ORG ]] ; then
38
+ echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
39
+ exit 1
40
+ fi
41
+ mkdir $ORG
42
+ for i in 01.proteome 02.essential 03.aln 04.cat 05.raxml 06.autoprune ; do
43
+ mkdir $ORG/$i
44
+ done
45
+
46
+ # 01. Download proteomes
47
+ echo "[01/06] Downloading and gunzipping data"
48
+ RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
49
+ rm $ORG/01.proteome/assembly_summary.txt
50
+ for i in $ORG/01.proteome/* ; do
51
+ b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
52
+ if exists $i/*.faa.gz ; then
53
+ for j in $i/*.faa.gz ; do gunzip $j ; done
54
+ cat $i/*.faa > $ORG/01.proteome/$b.faa
55
+ fi
56
+ rm -R $i
57
+ done
58
+
59
+ # 02. Essential genes
60
+ echo "[02/06] Identifying essential genes"
61
+ N=0
62
+ for i in $ORG/01.proteome/*.faa ; do # <- This loop could be parallelized
63
+ genomeA=$(basename $i .faa)
64
+ dir=$ORG/02.essential/$genomeA
65
+ mkdir $dir
66
+ HMM.essential.rb -i $i -m $dir/ -R $dir/log.txt -r $genomeA -t $THR
67
+ let N=$N+1
68
+ done
69
+
70
+ # 03. Find core and align groups
71
+ echo "[03/06] Identifying core essentials and aligning groups"
72
+ CORE_ESS=$(basename -s .faa $ORG/02.essential/*/*.faa | sort | uniq -c \
73
+ | awk '$1=='$N'{print $2}')
74
+ for b in $CORE_ESS ; do # <- This loop could be parallelized
75
+ cat $ORG/02.essential/*/$b.faa > $ORG/03.aln/$b.faa
76
+ clustalo -i $ORG/03.aln/$b.faa -o $ORG/03.aln/$b.aln #--threads=$THR
77
+ done
78
+
79
+ # 04. Concatenate alignment
80
+ echo "[04/06] Concatenating alignments and removing invariable sites"
81
+ Aln.cat.rb -I -c $ORG/04.cat/essential.raxcoords -i '|' $ORG/03.aln/*.aln \
82
+ > $ORG/04.cat/essential.aln 2> $ORG/04.cat/essential.log
83
+
84
+ # 05. Run RAxML
85
+ echo "[05/06] Inferring phylogeny"
86
+ # You REALLY should consider running the following with more threads (-T) and,
87
+ # if possible, multi-nodes using MPI
88
+ cd $ORG/05.raxml
89
+ raxmlHPC-PTHREADS -T $THR -p 1234 \
90
+ -s ../04.cat/essential.aln -q ../04.cat/essential.raxcoords \
91
+ -m PROTCATGTR -n UNUS # IMPORTANT: Please read the documentation of RAxML
92
+ # before running this line, so you know
93
+ # that you're running what you really want. Check
94
+ # options for bootstrapping and the different
95
+ # algorithms (-f). Note that -m is required, but the
96
+ # file essential.raxcoords specifies "AUTO", so RAxML will
97
+ # attempt to find the model resulting in the highest
98
+ # likelihood.
99
+ cd ../..
100
+
101
+ # 06. Autoprune
102
+ echo "[06/06] Auto-pruning the tree"
103
+ Newick.autoprune.R --t $ORG/05.raxml/RAxML_bestTree.UNUS --min_dist 0.001 \
104
+ $ORG/06.autoprune/essential-pruned.nwk
105
+
@@ -0,0 +1,100 @@
1
+ #!/bin/bash
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R
5
+ # @update Oct-20-2015
6
+ # @license artistic license 2.0
7
+ #
8
+
9
+ ORG=$1 # <- Organism (see help)
10
+ THR=2 # <- Number of threads
11
+
12
+ # This is just the help message
13
+ if [[ "$ORG" == "" ]] ; then
14
+ echo "
15
+ Use case: Unus genome phylogeny of a species. The unus genome is the collection
16
+ of orthologous groups in a set of genomes that has exactly one gene per genome,
17
+ i.e., the core genome minus in-paralogs.
18
+
19
+ IMPORTANT
20
+ This script is functional, but it's mainly intended for illustrative purposes.
21
+ Please take a look at the code first.
22
+
23
+ Usage:
24
+ $0 <organism>
25
+
26
+ <organism> The organism to use (e.g., Streptococcus_pneumoniae).
27
+
28
+ " >&2
29
+ exit
30
+ fi
31
+
32
+ # 00. Create environment
33
+ export PATH=$(dirname $0)/../Scripts:$PATH
34
+ if [[ -e $ORG ]] ; then
35
+ echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
36
+ exit 1
37
+ fi
38
+ mkdir $ORG
39
+ for i in 01.proteome 02.rbm 03.ogs 04.aln 05.cat 06.raxml ; do
40
+ mkdir $ORG/$i
41
+ done
42
+
43
+ # 01. Download proteomes
44
+ echo "[01/06] Downloading and gunzipping data"
45
+ RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
46
+ rm $ORG/01.proteome/assembly_summary.txt
47
+ for i in $ORG/01.proteome/* ; do
48
+ b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
49
+ for j in $i/*.faa.gz ; do gunzip $j ; done
50
+ cat $i/*.faa > $ORG/01.proteome/$b.faa.tmp
51
+ FastA.tag.rb -i $ORG/01.proteome/$b.faa.tmp -o $ORG/01.proteome/$b.faa.tmp -d
52
+ rm -R $i $ORG/01.proteome/$b.faa.tmp
53
+ done
54
+
55
+ # 02. Reciprocal Best Matches
56
+ echo "[02/06] Identifying Reciprocal Best Matches"
57
+ for i in $ORG/01.proteome/*.faa ; do # <- This nested loop could be parallelized
58
+ genomeA=$(basename $i .faa)
59
+ for j in $ORG/01.proteome/*.faa ; do
60
+ genomeB=$(basename $j .faa)
61
+ rbm.rb -1 $i -2 $j -t $THR > $ORG/02.rbm/$genomeA-$genomeB.rbm
62
+ [[ "$i" == "$j" ]] && continue # <- Ignore if it simplifies distribution
63
+ done
64
+ done
65
+
66
+ # 03. Orthologous Groups
67
+ echo "[03/06] Compiling Orthologous Groups"
68
+ ogs.mcl.rb -d $ORG/02.rbm -o $ORG/03.ogs/pangenome.ogs -t $THR
69
+
70
+ # 04. Extract unus genome and align groups
71
+ echo "[04/06] Extracting unus genome and aligning OGs"
72
+ ogs.extract.rb -i $ORG/03.ogs/pangenome.ogs -s $ORG/01.proteome/%s.faa \
73
+ -o $ORG/04.aln/ -c 1 -d 1 -p
74
+ for i in $ORG/04.aln/*.fa ; do # <- This loop could be parallelized
75
+ b=$(basename $i .fa)
76
+ clustalo -i $i -o $ORG/04.aln/$b.aln --threads=$THR
77
+ done
78
+
79
+ # 05. Concatenate alignment
80
+ echo "[05/06] Concatenating alignments and removing invariable sites"
81
+ Aln.cat.rb -I -c $ORG/05.cat/unus.raxcoords -i - $ORG/04.aln/*.aln \
82
+ > $ORG/05.cat/unus.aln 2> $ORG/05.cat/unus.log
83
+
84
+ # 06. Run RAxML
85
+ echo "[06/06] Inferring phylogeny"
86
+ # You REALLY should consider running the following with more threads (-T) and,
87
+ # if possible, multi-nodes using MPI
88
+ cd $ORG/06.raxml
89
+ raxmlHPC-PTHREADS -T $THR -p 1234 \
90
+ -s ../05.cat/unus.aln -q ../05.cat/unus.raxcoords \
91
+ -m PROTCATGTR -n UNUS # IMPORTANT: Please read the documentation of RAxML
92
+ # before running this line, so you know
93
+ # that you're running what you really
94
+ # want. Check options for bootstrapping
95
+ # and the different algorithms (-f). Note
96
+ # that -m is required, but the file
97
+ # unus.raxcoords specifies "AUTO", so
98
+ # RAxML will attempt to find the model
99
+ # resulting in the highest likelihood.
100
+
@@ -0,0 +1,73 @@
1
+ Artistic License 2.0
2
+ Copyright (c) 2000-2006, The Perl Foundation.
3
+
4
+ Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
5
+
6
+ Preamble
7
+ This license establishes the terms under which a given free software Package may be copied, modified, distributed, and/or redistributed. The intent is that the Copyright Holder maintains some artistic control over the development of that Package while still keeping the Package available as open source and free software.
8
+
9
+ You are always permitted to make arrangements wholly outside of this license directly with the Copyright Holder of a given Package. If the terms of this license do not permit the full use that you propose to make of the Package, you should contact the Copyright Holder and seek a different licensing arrangement.
10
+
11
+ Definitions
12
+ "Copyright Holder" means the individual(s) or organization(s) named in the copyright notice for the entire Package.
13
+
14
+ "Contributor" means any party that has contributed code or other material to the Package, in accordance with the Copyright Holder's procedures.
15
+
16
+ "You" and "your" means any person who would like to copy, distribute, or modify the Package.
17
+
18
+ "Package" means the collection of files distributed by the Copyright Holder, and derivatives of that collection and/or of those files. A given Package may consist of either the Standard Version, or a Modified Version.
19
+
20
+ "Distribute" means providing a copy of the Package or making it accessible to anyone else, or in the case of a company or organization, to others outside of your company or organization.
21
+
22
+ "Distributor Fee" means any fee that you charge for Distributing this Package or providing support for this Package to another party. It does not mean licensing fees.
23
+
24
+ "Standard Version" refers to the Package if it has not been modified, or has been modified only in ways explicitly requested by the Copyright Holder.
25
+
26
+ "Modified Version" means the Package, if it has been changed, and such changes were not explicitly requested by the Copyright Holder.
27
+
28
+ "Original License" means this Artistic License as Distributed with the Standard Version of the Package, in its current version or as it may be modified by The Perl Foundation in the future.
29
+
30
+ "Source" form means the source code, documentation source, and configuration files for the Package.
31
+
32
+ "Compiled" form means the compiled bytecode, object code, binary, or any other form resulting from mechanical transformation or translation of the Source form.
33
+
34
+ Permission for Use and Modification Without Distribution
35
+ (1) You are permitted to use the Standard Version and create and use Modified Versions for any purpose without restriction, provided that you do not Distribute the Modified Version.
36
+
37
+ Permissions for Redistribution of the Standard Version
38
+ (2) You may Distribute verbatim copies of the Source form of the Standard Version of this Package in any medium without restriction, either gratis or for a Distributor Fee, provided that you duplicate all of the original copyright notices and associated disclaimers. At your discretion, such verbatim copies may or may not include a Compiled form of the Package.
39
+
40
+ (3) You may apply any bug fixes, portability changes, and other modifications made available from the Copyright Holder. The resulting Package will still be considered the Standard Version, and as such will be subject to the Original License.
41
+
42
+ Distribution of Modified Versions of the Package as Source
43
+ (4) You may Distribute your Modified Version as Source (either gratis or for a Distributor Fee, and with or without a Compiled form of the Modified Version) provided that you clearly document how it differs from the Standard Version, including, but not limited to, documenting any non-standard features, executables, or modules, and provided that you do at least ONE of the following:
44
+
45
+ (a) make the Modified Version available to the Copyright Holder of the Standard Version, under the Original License, so that the Copyright Holder may include your modifications in the Standard Version.
46
+ (b) ensure that installation of your Modified Version does not prevent the user installing or running the Standard Version. In addition, the Modified Version must bear a name that is different from the name of the Standard Version.
47
+ (c) allow anyone who receives a copy of the Modified Version to make the Source form of the Modified Version available to others under
48
+ (i) the Original License or
49
+ (ii) a license that permits the licensee to freely copy, modify and redistribute the Modified Version using the same licensing terms that apply to the copy that the licensee received, and requires that the Source form of the Modified Version, and of any works derived from it, be made freely available in that license fees are prohibited but Distributor Fees are allowed.
50
+
51
+ Distribution of Compiled Forms of the Standard Version or Modified Versions without the Source
52
+ (5) You may Distribute Compiled forms of the Standard Version without the Source, provided that you include complete instructions on how to get the Source of the Standard Version. Such instructions must be valid at the time of your distribution. If these instructions, at any time while you are carrying out such distribution, become invalid, you must provide new instructions on demand or cease further distribution. If you provide valid instructions or cease distribution within thirty days after you become aware that the instructions are invalid, then you do not forfeit any of your rights under this license.
53
+
54
+ (6) You may Distribute a Modified Version in Compiled form without the Source, provided that you comply with Section 4 with respect to the Source of the Modified Version.
55
+
56
+ Aggregating or Linking the Package
57
+ (7) You may aggregate the Package (either the Standard Version or Modified Version) with other packages and Distribute the resulting aggregation provided that you do not charge a licensing fee for the Package. Distributor Fees are permitted, and licensing fees for other components in the aggregation are permitted. The terms of this license apply to the use and Distribution of the Standard or Modified Versions as included in the aggregation.
58
+
59
+ (8) You are permitted to link Modified and Standard Versions with other works, to embed the Package in a larger work of your own, or to build stand-alone binary or bytecode versions of applications that include the Package, and Distribute the result without restriction, provided the result does not expose a direct interface to the Package.
60
+
61
+ Items That are Not Considered Part of a Modified Version
62
+ (9) Works (including, but not limited to, modules and scripts) that merely extend or make use of the Package, do not, by themselves, cause the Package to be a Modified Version. In addition, such works are not considered parts of the Package itself, and are not subject to the terms of this license.
63
+
64
+ General Provisions
65
+ (10) Any use, modification, and distribution of the Standard or Modified Versions is governed by this Artistic License. By using, modifying or distributing the Package, you accept this license. Do not use, modify, or distribute the Package, if you do not accept this license.
66
+
67
+ (11) If your Modified Version has been derived from a Modified Version made by someone other than you, you are nevertheless required to ensure that your Modified Version complies with the requirements of this license.
68
+
69
+ (12) This license does not grant you the right to use any trademark, service mark, tradename, or logo of the Copyright Holder.
70
+
71
+ (13) This license includes the non-exclusive, worldwide, free-of-charge patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Package with respect to any patent claims licensable by the Copyright Holder that are necessarily infringed by the Package. If you institute patent litigation (including a cross-claim or counterclaim) against any party alleging that the Package constitutes direct or contributory patent infringement, then this Artistic License to you shall terminate on the date that such litigation is filed.
72
+
73
+ (14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,52 @@
1
+
2
+ # Makefile for the Enve-omics collection
3
+ # @update Oct 13 2013
4
+ # @author Luis M. Rodriguez-R <lmrodriguez at gmail dot com>
5
+
6
+ include globals.mk
7
+
8
+ TEST=Tests
9
+ enveomics_r=enveomics.R
10
+ enveomics_r_v=enveomics.R_1.1.5
11
+ .PHONY: test install install-scripts install-r uninstall install-deps
12
+
13
+ test: $(enveomics_r_v).tar.gz
14
+ @echo
15
+ @echo Testing
16
+ cd $(TEST) && $(MAKE)
17
+ @echo
18
+ @echo Testing $(enveomics_r)
19
+ $(R) CMD check --as-cran $(enveomics_r_v).tar.gz
20
+
21
+ install: install-r install-scripts
22
+
23
+ install-scripts:
24
+ [[ -d $(bindir)/lib ]] || mkdir $(bindir)/lib
25
+ ln -s $(foreach file,$(SCRIPTS),$(shell pwd)/$(file)) $(bindir)
26
+ ln -s $(shell pwd)/Scripts/lib/enveomics_rb $(bindir)/lib/
27
+ @echo
28
+ @echo Important note:
29
+ @echo This installation has simply created symbolic links to Scripts.
30
+ @echo If you need to move this folder, use uninstall/install afterwards.
31
+ @echo
32
+
33
+ install-r:
34
+ $(R) CMD INSTALL $(enveomics_r)/
35
+
36
+ uninstall:
37
+ -for file in $(foreach f,$(SCRIPTS),$(bindir)/$(notdir $f)) ; do \
38
+ [[ -h $$file ]] && rm -r $$file ; \
39
+ done
40
+ -[[ -h $(bindir)/lib/enveomics_rb ]] && rm -r $(bindir)/lib/enveomics_rb
41
+ -$(R) CMD REMOVE $(enveomics_r)
42
+
43
+ $(enveomics_r_v).tar.gz: install-deps
44
+ -rm -r $(enveomics_r).tar.gz
45
+ ./build_enveomics_r.bash
46
+ $(R) CMD build $(enveomics_r)/
47
+ $(MAKE) install-r
48
+
49
+ install-deps: /usr/local/bin/brew /Library/TeX/texbin/pdflatex
50
+ pandoc -v %%>/dev/null || brew install pandoc
51
+ #qpdf -v %%>/dev/null || brew install qpdf
52
+ [[ -d /usr/local/opt/texinfo/bin ]] || brew install texinfo
@@ -0,0 +1,103 @@
1
+ {
2
+ "tasks": [
3
+ {
4
+ "task": "AAsubs.log2ratio.rb",
5
+ "description": ["Estimates the log2-ratio of different amino acids in",
6
+ "homologous sites using an AAsubs file (see BlastPairwise.AAsubs.pl).",
7
+ "It provides the point estimation (.obs file), the bootstrap of the",
8
+ "estimation (.boot file) and the null model based on label-permutation",
9
+ "(.null file)."],
10
+ "see_also": ["BlastPairwise.AAsubs.pl"],
11
+ "cite": [["Konstantinidis et al, 2009, AEM",
12
+ "http://dx.doi.org/10.1128%2FAEM.00473-09"]],
13
+ "help_arg": "--help",
14
+ "options": [
15
+ {
16
+ "opt": "--input",
17
+ "arg": "in_file",
18
+ "mandatory": true,
19
+ "description": ["Input file in AAsubs format. It's a tab-delimited",
20
+ "table where each line corresponds to a substitution, the first",
21
+ "column corresponds to the compared protein IDs, the second",
22
+ "and third columns correspond to the AA on each protein, and the",
23
+ "fourth column indicates the length of the protein (not used by",
24
+ "this script)."]
25
+ },
26
+ {
27
+ "opt": "--obs-file",
28
+ "arg": "out_file",
29
+ "description": ["Output file with the log2-ratios per amino acid.",
30
+ "By default, 'Input value'.obs."]
31
+ },
32
+ {
33
+ "opt": "--bootstrap-file",
34
+ "arg": "out_file",
35
+ "description": ["Output file with the bootstrap results of",
36
+ "log2-ratios per amino acid. By default, 'Input value'.boot."]
37
+ },
38
+ {
39
+ "opt": "--null-file",
40
+ "arg": "out_file",
41
+ "description": ["Output file with the permutation results of",
42
+ "log2-ratios per amino acid. By default, 'Input value'.null."]
43
+ },
44
+ {
45
+ "opt": "--overwrite",
46
+ "description": ["Overwrite existing files. By default, skip steps if",
47
+ "the files already exist."]
48
+ },
49
+ {
50
+ "opt": "--bootstraps",
51
+ "arg": "integer",
52
+ "default": 1000,
53
+ "description": "Number of bootstraps to run."
54
+ },
55
+ {
56
+ "opt": "--permutations",
57
+ "arg": "integer",
58
+ "default": 1000,
59
+ "description": "Number of permutations to run."
60
+ },
61
+ {
62
+ "opt": "--quiet",
63
+ "description": "Run quietly (no STDERR output)."
64
+ }
65
+ ]
66
+ },
67
+ {
68
+ "task": "BlastPairwise.AAsubs.pl",
69
+ "description": ["Counts the different AA substitutions in the best hit",
70
+ "blast alignments, from a BLASTP pairwise format output (-outfmt 0 in",
71
+ "BLAST+, -m 0 in legacy BLAST)."],
72
+ "see_also": ["AAsubs.log2ratio.rb"],
73
+ "cite": [["Konstantinidis et al, 2009, AEM",
74
+ "http://dx.doi.org/10.1128%2FAEM.00473-09"]],
75
+ "help_arg": "",
76
+ "options": [
77
+ {
78
+ "name": "Cigar char",
79
+ "arg": "select",
80
+ "values": ["+","_"],
81
+ "mandatory": true,
82
+ "description": ["Use '+' for similar substitutions, use '_' for non",
83
+ "similar substitutions."]
84
+ },
85
+ {
86
+ "name": "Blast M0",
87
+ "arg": "in_file",
88
+ "mandatory": true,
89
+ "description": "Blast in 'pairwise text' format (-outfmt/-m 0)."
90
+ },
91
+ ">",
92
+ {
93
+ "name": "AA subs",
94
+ "arg": "out_file",
95
+ "mandatory": true,
96
+ "description": ["A tab-delimited raw file with one substitution per",
97
+ "row and columns: (1) Name-of-query_Name-of-subject, (2)",
98
+ "AA-in-subject, (3) AA-in-query, (4) Total-Align-Length."]
99
+ }
100
+ ]
101
+ }
102
+ ]
103
+ }
@@ -0,0 +1,703 @@
1
+ {
2
+ "tasks": [
3
+ {
4
+ "task": "BlastTab.addlen.rb",
5
+ "description": ["Appends an extra column to a tabular BLAST with the",
6
+ "length of the query or the subject sequence."],
7
+ "help_arg": "--help",
8
+ "options": [
9
+ {
10
+ "opt": "--fasta",
11
+ "arg": "in_file",
12
+ "description": "FastA file of the query or the subject.",
13
+ "mandatory": true
14
+ },
15
+ {
16
+ "opt": "--subject",
17
+ "description": ["Use the subject column of the BLAST, by default the",
18
+ "query column is used."],
19
+ "note": "If used, the input FastA must contain subject sequences."
20
+ },
21
+ {
22
+ "opt": "--quiet",
23
+ "description": "Run quietly (no STDERR output)."
24
+ },
25
+ "<",
26
+ {
27
+ "arg": "in_file",
28
+ "description": "Input tabular BLAST file.",
29
+ "mandatory": true
30
+ },
31
+ ">",
32
+ {
33
+ "arg": "out_file",
34
+ "description": "Output tabular BLAST file with additional column.",
35
+ "mandatory": true
36
+ }
37
+ ]
38
+ },
39
+ {
40
+ "task": "BlastTab.advance.bash",
41
+ "description": ["Calculates the percentage of a partial BLAST result.",
42
+ "The value produced slightly underestimates the actual advance, due to",
43
+ "un-flushed output and trailing queries that could be processed but",
44
+ "generate no results."],
45
+ "help_arg": "",
46
+ "requires": [ { "interpreter": "awk" } ],
47
+ "options": [
48
+ {
49
+ "name": "Blast",
50
+ "arg": "in_file",
51
+ "description": "Incomplete Tabular BLAST output.",
52
+ "mandatory": true
53
+ },
54
+ {
55
+ "name": "Query FastA",
56
+ "arg": "in_file",
57
+ "description": "FastA file with query sequences.",
58
+ "mandatory": true
59
+ }
60
+ ]
61
+ },
62
+ {
63
+ "task": "BlastTab.best_hit_sorted.pl",
64
+ "description": "Filters a tabular BLAST to retain only the best matches.",
65
+ "help_arg": "--help",
66
+ "see_also": ["BlastTab.topHits_sorted.rb"],
67
+ "options": [
68
+ {
69
+ "name": "Sort",
70
+ "arg": "select",
71
+ "values": ["sort","cat"],
72
+ "mandatory": true,
73
+ "description": ["Use 'sort' if your BLAST is not pre-sorted by the",
74
+ "first column (or if you're not sure). Use 'cat' otherwise."]
75
+ },
76
+ {
77
+ "name": "Input BLAST",
78
+ "arg": "in_file",
79
+ "multiple_sep": " ",
80
+ "mandatory": true,
81
+ "description": "Tabular BLAST file to filter."
82
+ },
83
+ "|",
84
+ { "arg": "task" },
85
+ ">",
86
+ {
87
+ "name": "Output BLAST",
88
+ "arg": "out_file",
89
+ "mandatory": true,
90
+ "description": "Filtered tabular BLAST output."
91
+ }
92
+ ]
93
+ },
94
+ {
95
+ "task": "BlastTab.catsbj.pl",
96
+ "description": ["Generates a list of hits from a BLAST result",
97
+ "concatenating the subject sequences. This can be used, e.g., to",
98
+ "analyze BLAST results against draft genomes. This script creates two",
99
+ "files using <map.bls> as prefix with extensions .rec (for the",
100
+ "recruitment plot) and .lim (for the limits of the different sequences",
101
+ "in <seq.fa>)."],
102
+ "help_arg": "-h",
103
+ "options": [
104
+ {
105
+ "opt": "-i",
106
+ "name": "Identity",
107
+ "description": "Minimum identity (in %) to report a result.",
108
+ "arg": "float",
109
+ "default": 70.0
110
+ },
111
+ {
112
+ "opt": "-l",
113
+ "name": "Length",
114
+ "description": "Minimum alignment length to report a result.",
115
+ "default": 60.0,
116
+ "arg": "float"
117
+ },
118
+ {
119
+ "opt": "-s",
120
+ "name": "Subset",
121
+ "description": ["The FastA provided is to be treated as a subset of",
122
+ "the subject. By default, it expects all the subjects to be",
123
+ "present in the BLAST."]
124
+ },
125
+ {
126
+ "opt": "-q",
127
+ "name": "Quiet",
128
+ "description": "Run quietly."
129
+ },
130
+ {
131
+ "name": "seq.fa",
132
+ "description": "Subject sequences (ref) in FastA format.",
133
+ "mandatory": true,
134
+ "arg": "in_file"
135
+ },
136
+ {
137
+ "name": "map.bls",
138
+ "description": ["Mapping of the reads to the reference in Tabular",
139
+ "BLAST format."],
140
+ "mandatory": true,
141
+ "arg": "in_file"
142
+ }
143
+ ]
144
+ },
145
+ {
146
+ "task": "BlastTab.cogCat.rb",
147
+ "description": ["Replaces the COG gene IDs in a BLAST with the COG",
148
+ "category."],
149
+ "help_arg": "--help",
150
+ "options": [
151
+ {
152
+ "opt": "--whog",
153
+ "arg": "in_file",
154
+ "mandatory": true,
155
+ "description": "COG's 'whog' file."
156
+ },
157
+ {
158
+ "opt": "--blast",
159
+ "arg": "in_file",
160
+ "mandatory": true,
161
+ "description": "Tabular BLAST file with COG IDs as subject."
162
+ },
163
+ {
164
+ "opt": "--cog",
165
+ "description": "If set, returns the COG ID, not the COG category."
166
+ },
167
+ {
168
+ "opt": "--desc",
169
+ "description": "Includes COG description (requires --cog)."
170
+ },
171
+ {
172
+ "opt": "--noverbose",
173
+ "description": "Run quietly, but show warnings."
174
+ },
175
+ {
176
+ "opt": "--quiet",
177
+ "description": "Run quietly."
178
+ },
179
+ ">",
180
+ {
181
+ "arg": "out_file",
182
+ "name": "COG Blast",
183
+ "mandatory": true,
184
+ "description": "Tabular BLAST with COG IDs or categories as subject."
185
+ }
186
+ ]
187
+ },
188
+ {
189
+ "task": "BlastTab.filter.pl",
190
+ "description": ["Extracts a subset of hits (queries or subjects) from a",
191
+ "tabular BLAST."],
192
+ "help_arg": "",
193
+ "see_also": "BlastTab.subsample.pl",
194
+ "options": [
195
+ {
196
+ "name": "Subject",
197
+ "opt": "-s",
198
+ "description": ["If set, assumes that list.txt contains subject IDs.",
199
+ "By default: assumes query IDs."]
200
+ },
201
+ {
202
+ "name": "Inverse",
203
+ "opt": "-i",
204
+ "description": ["If set, reports the inverse of the list (i.e.,",
205
+ "reports only hits absent in the list)."]
206
+ },
207
+ {
208
+ "name": "list.txt",
209
+ "arg": "in_file",
210
+ "mandatory": true,
211
+ "description": "List of IDs to extract."
212
+ },
213
+ {
214
+ "name": "blast.txt",
215
+ "arg": "in_file",
216
+ "mandatory": true,
217
+ "description": "Tabular BLAST file containing the superset of hits."
218
+ },
219
+ ">",
220
+ {
221
+ "name": "subset.txt",
222
+ "arg": "out_file",
223
+ "mandatory": true,
224
+ "description": "Tabular BLAST file to be created."
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "task": "BlastTab.pairedHits.rb",
230
+ "description": "Identifies the best hits of paired-reads.",
231
+ "help_arg": "--help",
232
+ "options": [
233
+ {
234
+ "opt": "--blast",
235
+ "arg": "in_file",
236
+ "mandatory": true,
237
+ "description": "Input Tabular BLAST file.",
238
+ "note": ["This script assumes that paired hits are next to each",
239
+ "other. If this is not the case (e.g., because the blast was",
240
+ "concatenated), you must sort the input before running this",
241
+ "script."]
242
+ },
243
+ {
244
+ "name": "Min score",
245
+ "opt": "--minscore",
246
+ "arg": "float",
247
+ "default": 0.0,
248
+ "description": "Minimum (summed) Bit-Score to consider a pair-match."
249
+ },
250
+ {
251
+ "name": "Best hits",
252
+ "opt": "--besthits",
253
+ "arg": "integer",
254
+ "default": 0,
255
+ "description": ["Outputs top best-hits only (use 0 to output all the",
256
+ "paired hits)."]
257
+ },
258
+ {
259
+ "name": "Orientation",
260
+ "opt": "--orient",
261
+ "arg": "select",
262
+ "values": [0,1,2,3,4],
263
+ "default": 0,
264
+ "description": ["Checks the orientation of the hit. Values are: 0,",
265
+ "no checking; 1, same direction; 2, inwards; 3, outwards; 4,",
266
+ "different direction (i.e., 2 or 3)."]
267
+ },
268
+ {
269
+ "name": "Sister prefix",
270
+ "opt": "--sisprefix",
271
+ "arg": "string",
272
+ "default": "_",
273
+ "description": ["Sister read number prefix in the name of the reads.",
274
+ "Escape characters as dots (\\.), parenthesis (\\(, \\), \\[,",
275
+ "\\]), other characters with special meaning in regular",
276
+ "expressions (\\*, \\+, \\^, \\$, \\|). This prefix allows regular",
277
+ "expressions (for example, use ':|\\.' to use any of colon or",
278
+ "dot). Note that the prefix will not be included in the base name",
279
+ "reported in the output."]
280
+ },
281
+ ">",
282
+ {
283
+ "arg": "out_file",
284
+ "mandatory": true,
285
+ "description": ["Tab-delimited flat file, with the following",
286
+ "columns: (1) Query ID (without the \"sister\" identifier). (2)",
287
+ "Subject ID. (3) Bit score (summed from both sister reads). (4/5)",
288
+ "From/To (subject) coordinates for read 1. (6/7) From/To (subject)",
289
+ "coordinates for read 2. (8) Reads orientation (1: same direction,",
290
+ "2: inwards, 3: outwards). (9) Estimated insert size."]
291
+ }
292
+ ]
293
+ },
294
+ {
295
+ "task": "BlastTab.seqdepth.pl",
296
+ "description": "Estimates the sequencing depth of subject sequences.",
297
+ "help_arg": "",
298
+ "see_also": ["BlastTab.seqdepth_ZIP.pl", "BlastTab.seqdepth_nomedian.pl"],
299
+ "options": [
300
+ "cat",
301
+ {
302
+ "arg": "in_file",
303
+ "multiple_sep": " ",
304
+ "mandatory": true,
305
+ "description": ["One or more Tabular BLAST files of reads vs genes",
306
+ "(or contigs)."]
307
+ },
308
+ "|",
309
+ { "arg": "task" },
310
+ {
311
+ "name": "genes_or_ctgs.fna",
312
+ "arg": "in_file",
313
+ "mandatory": true,
314
+ "description": ["A FastA file containing the genes or the contigs",
315
+ "(db)."]
316
+ },
317
+ ">",
318
+ {
319
+ "name": "genes_or_ctgs.cov",
320
+ "arg": "out_file",
321
+ "mandatory": true,
322
+ "description": ["A tab-delimited file with the following columns:",
323
+ "(1) Subject ID. (2) Average sequencing depth. (3) Median",
324
+ "sequencing depth. (4) Number of mapped reads. (5) Length of the",
325
+ "subject sequence."]
326
+ }
327
+ ]
328
+ },
329
+ {
330
+ "task": "BlastTab.seqdepth_ZIP.pl",
331
+ "description": ["Estimates the average sequencing depth of subject",
332
+ "sequences (genes or contigs) assuming a Zero-Inflated Poisson",
333
+ "distribution (ZIP) to correct for non-covered positions. It uses the",
334
+ "corrected method of moments estimators (CMMEs) as described by",
335
+ "Beckett et al [1]. Note that [1] has a mistake in eq. (2.4), that",
336
+ "should be: pi-hat-MM = 1 - (X-bar / lambda-hat-MM). Also note that a",
337
+ "more elaborated mixture distribution can arise from coverage",
338
+ "histograms (e.g., see [2] for an additional correction called 'tail",
339
+ "distribution' and mixtures involving negative binomial) so take these",
340
+ "results cum grano salis.\n [1]",
341
+ "http://anisette.ucs.louisiana.edu/Academic/Sciences/MATH/stage/stat2012.pdf\n",
342
+ "[2] Lindner et al, Bioinformatics, 2013."],
343
+ "help_arg": "",
344
+ "see_also": ["BlastTab.seqdepth.pl", "BlastTab.seqdepth_nomedian.pl"],
345
+ "options": [
346
+ "cat",
347
+ {
348
+ "name": "blast",
349
+ "arg": "in_file",
350
+ "multiple_sep": " ",
351
+ "mandatory": true,
352
+ "description": ["One or more Tabular BLAST files of reads vs genes",
353
+ "(or contigs)."]
354
+ },
355
+ "|",
356
+ { "arg": "task" },
357
+ {
358
+ "name": "genes_or_ctgs.fna",
359
+ "arg": "in_file",
360
+ "mandatory": true,
361
+ "description": ["A FastA file containing the genes or the contigs",
362
+ "(db)."]
363
+ },
364
+ ">",
365
+ {
366
+ "name": "genes_or_ctgs.cov",
367
+ "arg": "out_file",
368
+ "mandatory": true,
369
+ "description": ["Output file with the following columns:",
370
+ "(1) Subject ID.",
371
+ "(2) Estimated average sequencing depth (CMME lambda).",
372
+ "(3) Zero-inflation (CMME pi).",
373
+ "(4) Observed average sequencing depth.",
374
+ "(5) Observed median sequencing depth.",
375
+ "(6) Observed median sequencing depth excluding zeroes.",
376
+ "(7) Number of mapped reads.",
377
+ "(8) Length of the subject sequence."]
378
+ }
379
+ ]
380
+ },
381
+ {
382
+ "task": "BlastTab.seqdepth_nomedian.pl",
383
+ "description": ["Estimates the sequencing depth of subject",
384
+ "sequences. The values reported by this script may differ from those",
385
+ "of BlastTab.seqdepth.pl, because this script uses the aligned length",
386
+ "of the read while BlastTab.seqdepth.pl uses the aligned length of the",
387
+ "subject sequence."],
388
+ "help_arg": "",
389
+ "see_also": ["BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"],
390
+ "options": [
391
+ "cat",
392
+ {
393
+ "arg": "in_file",
394
+ "multiple_sep": " ",
395
+ "mandatory": true,
396
+ "description": ["One or more Tabular BLAST files of reads vs genes",
397
+ "(or contigs)."]
398
+ },
399
+ "|",
400
+ { "arg": "task" },
401
+ {
402
+ "name": "genes_or_ctgs.fna",
403
+ "arg": "in_file",
404
+ "mandatory": true,
405
+ "description": ["A FastA file containing the genes or the contigs",
406
+ "(db)."]
407
+ },
408
+ ">",
409
+ {
410
+ "name": "genes_or_ctgs.cov",
411
+ "arg": "out_file",
412
+ "mandatory": true,
413
+ "description": ["A tab-delimited file with the following columns:",
414
+ "(1) Subject ID. (2) Average sequencing depth. (3) Number of",
415
+ "mapped reads. (4) Length of the subject sequence."]
416
+ }
417
+ ]
418
+ },
419
+ {
420
+ "task": "BlastTab.subsample.pl",
421
+ "description": ["Filters a BLAST output including only the hits produced",
422
+ "by any of the given sequences as query."],
423
+ "help_arg": "",
424
+ "see_also": "BlastTab.filter.pl",
425
+ "options": [
426
+ {
427
+ "name": "blast.tab",
428
+ "mandatory": true,
429
+ "arg": "in_file",
430
+ "description": "BLAST output to be filtered (tabular format)."
431
+ },
432
+ {
433
+ "name": "sample.fa",
434
+ "mandatory": true,
435
+ "arg": "in_file",
436
+ "description": "Sequences to use as query (FastA format)."
437
+ },
438
+ ">",
439
+ {
440
+ "arg": "out_file",
441
+ "mandatory": true,
442
+ "description": "The filtered BLAST output (tabular format)."
443
+ }
444
+ ]
445
+ },
446
+ {
447
+ "task": "BlastTab.sumPerHit.pl",
448
+ "description": ["Sums the weights of all the queries hitting each",
449
+ "subject. Often (but not necessarily) the BLAST files contain only",
450
+ "best matches. The weights can be any number, but a common use of this",
451
+ "script is to add up counts (weights are integers). For example, in a",
452
+ "BLAST of predicted genes vs some annotation source, the weights could",
453
+ "be the number of reads recruited by each gene."],
454
+ "help_arg": "-h",
455
+ "options": [
456
+ {
457
+ "name": "Weights file",
458
+ "opt": "-w",
459
+ "arg": "in_file",
460
+ "description": ["A two-columns tab-delimited file containing the",
461
+ "name (column 1) and the weight (column 2) of each query."]
462
+ },
463
+ {
464
+ "name": "Minimum score",
465
+ "opt": "-s",
466
+ "arg": "float",
467
+ "default": 0.0
468
+ },
469
+ {
470
+ "name": "Minimum identity (%)",
471
+ "opt": "-i",
472
+ "arg": "float",
473
+ "default": 0.0
474
+ },
475
+ {
476
+ "name": "Queries",
477
+ "opt": "-m",
478
+ "arg": "integer",
479
+ "default": 0,
480
+ "description": "Maximum number of queries. Set to 0 for all."
481
+ },
482
+ {
483
+ "name": "Normalize",
484
+ "opt": "-n",
485
+ "description": "Normalize weights by the number of hits per query."
486
+ },
487
+ {
488
+ "name": "Include zeroes",
489
+ "opt": "-z",
490
+ "description": ["Add zero when weight is not found (by default:",
491
+ "doesn't list them)."]
492
+ },
493
+ {
494
+ "name": "Run quietly",
495
+ "opt": "-q"
496
+ },
497
+ {
498
+ "name": "blast",
499
+ "arg": "in_file",
500
+ "multiple_sep": " ",
501
+ "mandatory": true,
502
+ "description": "One or more BLAST files."
503
+ },
504
+ ">",
505
+ {
506
+ "arg": "out_file",
507
+ "mandatory": true,
508
+ "description": ["A two-columns tab-delimited file containing the",
509
+ "summed weights per hit."]
510
+ }
511
+ ]
512
+ },
513
+ {
514
+ "task": "BlastTab.taxid2taxrank.pl",
515
+ "description": ["Takes a BLAST with NCBI Taxonomy IDs as subjects and",
516
+ "replaces them by names at a given taxonomic rank."],
517
+ "help_arg": "",
518
+ "options": [
519
+ {
520
+ "name": "tax_blast.txt",
521
+ "mandatory": true,
522
+ "arg": "in_file",
523
+ "description": ["BLAST output, where subject IDs are NCBI Taxonomy",
524
+ "IDs."]
525
+ },
526
+ {
527
+ "name": "nodes.dmp",
528
+ "mandatory": true,
529
+ "arg": "in_file",
530
+ "description": "Nodes file from NCBI Taxonomy.",
531
+ "source_url": "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
532
+ },
533
+ {
534
+ "name": "names.dmp",
535
+ "mandatory": true,
536
+ "arg": "in_file",
537
+ "description": "Names file from NCBI Taxonomy.",
538
+ "source_url": "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
539
+ },
540
+ {
541
+ "name": "rank",
542
+ "arg": "string",
543
+ "mandatory": true,
544
+ "default": "genus",
545
+ "description": ["The rank to be reported. All the reported nodes",
546
+ "will have the same rank. To see supported values, run:\n",
547
+ "`cut -f 5 nodes.dmp | sort -u`."]
548
+ },
549
+ {
550
+ "name": "Best-hit",
551
+ "arg": "select",
552
+ "values": ["yes", "no"],
553
+ "default": "yes",
554
+ "description": ["Should it take into account the best hit per query",
555
+ "only? This is: should it filter by best-hit?"]
556
+ },
557
+ ">",
558
+ {
559
+ "name": "taxrank_list.txt",
560
+ "arg": "out_file",
561
+ "mandatory": true,
562
+ "description": ["BLAST-like output, where subject IDs are Taxonomy",
563
+ "names."]
564
+ }
565
+ ]
566
+ },
567
+ {
568
+ "task": "BlastTab.topHits_sorted.rb",
569
+ "description": "Reports the top-N best hits of a BLAST.",
570
+ "help_arg": "--help",
571
+ "see_also": "BlastTab.best_hit_sorted.pl",
572
+ "options": [
573
+ {
574
+ "name": "Sort",
575
+ "arg": "select",
576
+ "values": ["sort","cat"],
577
+ "mandatory": true,
578
+ "description": ["Use 'sort' if your BLAST is not pre-sorted by the",
579
+ "first column (or if you're not sure). Use 'cat' otherwise."]
580
+ },
581
+ {
582
+ "arg": "in_file",
583
+ "mandatory": true,
584
+ "description": "Tabular BLAST file."
585
+ },
586
+ "|",
587
+ { "arg": "task" },
588
+ "--blast",
589
+ "/dev/stdin",
590
+ {
591
+ "opt": "--top",
592
+ "arg": "integer",
593
+ "default": 5,
594
+ "description": "Maximum number of hits to report for each query."
595
+ },
596
+ {
597
+ "opt": "--sort-by",
598
+ "arg": "select",
599
+ "values": ["bitscore", "evalue", "identity", "length"],
600
+ "default": "bitscore",
601
+ "description": "Parameter used to detect the 'best' hits."
602
+ },
603
+ {
604
+ "opt": "--quiet",
605
+ "description": "Run quietly."
606
+ },
607
+ ">",
608
+ {
609
+ "arg": "out_file",
610
+ "mandatory": true,
611
+ "description": "Output (filtered) Tabular BLAST."
612
+ }
613
+ ]
614
+ },
615
+ {
616
+ "task": "BlastTab.recplot2.R",
617
+ "description": ["Produces recruitment plot objects provided that",
618
+ "BlastTab.catsbj.pl has been previously executed."],
619
+ "help_arg": "--help",
620
+ "requires": [
621
+ { "r_package": "optparse" },
622
+ { "r_package": "enveomics.R" }
623
+ ],
624
+ "options": [
625
+ {
626
+ "opt": "--prefix",
627
+ "arg": "in_file",
628
+ "mandatory": true,
629
+ "description": ["Path to the prefix of the BlastTab.catsbj.pl output",
630
+ "files. At least the files .rec and .lim must exist with this",
631
+ "prefix."]
632
+ },
633
+ {
634
+ "opt": "--pos-breaks",
635
+ "arg": "integer",
636
+ "default": 1000,
637
+ "description": ["Breaks in the positions histogram."]
638
+ },
639
+ {
640
+ "opt": "--id-breaks",
641
+ "arg": "integer",
642
+ "default": 300,
643
+ "description": ["Breaks in the identity histogram."]
644
+ },
645
+ {
646
+ "opt": "--id-metric",
647
+ "arg": "select",
648
+ "values": ["identity", "corrected identity", "bit score"],
649
+ "default": "identity",
650
+ "description": ["Metric of identity to be used (Y-axis). Corrected",
651
+ "identity is only supported if the original BLAST file included",
652
+ "sequence lengths."]
653
+ },
654
+ {
655
+ "opt": "--id-summary",
656
+ "arg": "string",
657
+ "default": "sum",
658
+ "description": "Function summarizing the identity bins."
659
+ },
660
+ {
661
+ "opt": "--id-cutoff",
662
+ "arg": "float",
663
+ "default": 95.0,
664
+ "description": ["Cutoff of identity metric above which the hits are",
665
+ "considered 'in-group'. The 95% identity corresponds to the",
666
+ "expectation of ANI>95% within species."]
667
+ },
668
+ {
669
+ "opt": "--threads",
670
+ "arg": "integer",
671
+ "default": 2,
672
+ "description": "Number of threads to use."
673
+ },
674
+ {
675
+ "opt": "--no-verbose",
676
+ "description": "Indicates if the function should report the advance."
677
+ },
678
+ {
679
+ "name": "R Object Output",
680
+ "arg": "out_file",
681
+ "mandatory": true,
682
+ "description": ["Recplot2 object that can be re-plotted using",
683
+ "R function plot."]
684
+ },
685
+ {
686
+ "name": "Graphical Output",
687
+ "arg": "out_file",
688
+ "description": "Recruitment plot in PDF."
689
+ },
690
+ {
691
+ "name": "Width",
692
+ "arg": "float",
693
+ "description": "Width of the plot in inches (7 by default)."
694
+ },
695
+ {
696
+ "name": "Height",
697
+ "arg": "float",
698
+ "description": "Height of the plot in inches (7 by default)."
699
+ }
700
+ ]
701
+ }
702
+ ]
703
+ }