RubyGems - miga-base - Versions diffs - 0.3.0.0 → 0.3.0.1 - Mend

miga-base 0.3.0.0 → 0.3.0.1

Files changed (260) hide show

checksums.yaml +4 -4
data/README.md +21 -4
data/actions/init.rb +258 -0
data/actions/run_local.rb +1 -2
data/actions/test_taxonomy.rb +4 -1
data/bin/miga +8 -1
data/lib/miga/dataset.rb +4 -4
data/lib/miga/dataset_result.rb +7 -4
data/lib/miga/version.rb +2 -2
data/scripts/_distances_noref_nomulti.bash +3 -1
data/scripts/clade_finding.bash +1 -1
data/scripts/init.bash +1 -1
data/scripts/miga.bash +1 -1
data/scripts/mytaxa.bash +78 -72
data/scripts/mytaxa_scan.bash +67 -62
data/scripts/ogs.bash +1 -1
data/scripts/trimmed_fasta.bash +4 -3
data/utils/enveomics/Examples/aai-matrix.bash +66 -0
data/utils/enveomics/Examples/ani-matrix.bash +66 -0
data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
data/utils/enveomics/LICENSE.txt +73 -0
data/utils/enveomics/Makefile +52 -0
data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
data/utils/enveomics/Manifest/Tasks/blasttab.json +703 -0
data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
data/utils/enveomics/Manifest/Tasks/fasta.json +571 -0
data/utils/enveomics/Manifest/Tasks/fastq.json +208 -0
data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
data/utils/enveomics/Manifest/Tasks/ogs.json +339 -0
data/utils/enveomics/Manifest/Tasks/other.json +746 -0
data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
data/utils/enveomics/Manifest/Tasks/sequence-identity.json +454 -0
data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
data/utils/enveomics/Manifest/categories.json +132 -0
data/utils/enveomics/Manifest/examples.json +154 -0
data/utils/enveomics/Manifest/tasks.json +4 -0
data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
data/utils/enveomics/README.md +40 -0
data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
data/utils/enveomics/Scripts/Aln.cat.rb +162 -0
data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
data/utils/enveomics/Scripts/BlastTab.addlen.rb +61 -0
data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
data/utils/enveomics/Scripts/BlastTab.catsbj.pl +106 -0
data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
data/utils/enveomics/Scripts/BlastTab.recplot2.R +40 -0
data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
data/utils/enveomics/Scripts/Chao1.pl +97 -0
data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
data/utils/enveomics/Scripts/FastA.interpose.pl +87 -0
data/utils/enveomics/Scripts/FastA.length.pl +38 -0
data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
data/utils/enveomics/Scripts/FastA.split.pl +55 -0
data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
data/utils/enveomics/Scripts/FastA.tag.rb +64 -0
data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
data/utils/enveomics/Scripts/HMM.essential.rb +254 -0
data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
data/utils/enveomics/Scripts/JPlace.to_iToL.rb +306 -0
data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
data/utils/enveomics/Scripts/SRA.download.bash +50 -0
data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
data/utils/enveomics/Scripts/Table.barplot.R +30 -0
data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
data/utils/enveomics/Scripts/Table.filter.pl +61 -0
data/utils/enveomics/Scripts/Table.merge.pl +77 -0
data/utils/enveomics/Scripts/Table.replace.rb +69 -0
data/utils/enveomics/Scripts/Table.round.rb +63 -0
data/utils/enveomics/Scripts/Table.split.pl +57 -0
data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
data/utils/enveomics/Scripts/aai.rb +373 -0
data/utils/enveomics/Scripts/ani.rb +362 -0
data/utils/enveomics/Scripts/gi2tax.rb +103 -0
data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
data/utils/enveomics/Scripts/lib/data/essential.hmm.gz +0 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +26 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
data/utils/enveomics/Scripts/ogs.rb +104 -0
data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
data/utils/enveomics/Scripts/rbm.rb +137 -0
data/utils/enveomics/Tests/Makefile +10 -0
data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
data/utils/enveomics/Tests/alkB.nwk +1 -0
data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
data/utils/enveomics/Tests/hiv1.faa +59 -0
data/utils/enveomics/Tests/hiv1.fna +134 -0
data/utils/enveomics/Tests/hiv2.faa +70 -0
data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
data/utils/enveomics/build_enveomics_r.bash +44 -0
data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
data/utils/enveomics/enveomics.R/NAMESPACE +35 -0
data/utils/enveomics/enveomics.R/R/autoprune.R +121 -0
data/utils/enveomics/enveomics.R/R/barplot.R +165 -0
data/utils/enveomics/enveomics.R/R/cliopts.R +119 -0
data/utils/enveomics/enveomics.R/R/df2dist.R +117 -0
data/utils/enveomics/enveomics.R/R/growthcurve.R +263 -0
data/utils/enveomics/enveomics.R/R/recplot.R +320 -0
data/utils/enveomics/enveomics.R/R/recplot2.R +745 -0
data/utils/enveomics/enveomics.R/R/tribs.R +423 -0
data/utils/enveomics/enveomics.R/R/utils.R +16 -0
data/utils/enveomics/enveomics.R/README.md +52 -0
data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +30 -0
data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +43 -0
data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +19 -0
data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +37 -0
data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +24 -0
data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +24 -0
data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +33 -0
data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +64 -0
data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +37 -0
data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +19 -0
data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +18 -0
data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +26 -0
data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +25 -0
data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +26 -0
data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +49 -0
data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +28 -0
data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +97 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +40 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +40 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +24 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +40 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +18 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +22 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +20 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +18 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +27 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +53 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -0
data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +44 -0
data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +21 -0
data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +15 -0
data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +43 -0
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +29 -0
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +30 -0
data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +71 -0
data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +18 -0
data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +18 -0
data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +18 -0
data/utils/enveomics/enveomics.R/man/z$-methods.Rd +27 -0
data/utils/enveomics/globals.mk +8 -0
data/utils/enveomics/manifest.json +9 -0
data/utils/index_metadata.rb +0 -0
data/utils/plot-taxdist.R +0 -0
data/utils/requirements.txt +19 -19
metadata +242 -2

data/utils/enveomics/Examples/essential-phylogeny.bash ADDED Viewed

@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# @author  Luis M. Rodriguez-R
+# @update  Mar-23-2016
+# @license artistic license 2.0
+#
+set -e # <- So it stops if there is an error
+function exists { [[ -e "$1" ]] ; } # <- To test *any* of many files (only the first glob match is checked)
+ORG=$1 # <- Organism (see help)
+THR=2 # <- Number of threads
+# This is just the help message
+if [[ "$ORG" == "" ]] ; then
+echo "
+Use case: Essential genes phylogeny of a species. The essential genes are a
+collection of genes typically found in single copy in archaeal and bacterial
+genomes
+IMPORTANT
+This script is functional, but it's mainly intended for illustrative purposes.
+Please take a look at the code first.
+Usage:
+$0 <organism>
+<organism>	The organism to use (e.g., Streptococcus_pneumoniae).
+" >&2
+exit
+fi
+# 00. Create environment
+export PATH=$(dirname $0)/../Scripts:$PATH
+if [[ -e $ORG ]] ; then
+   echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
+   exit 1
+fi
+mkdir $ORG
+for i in 01.proteome 02.essential 03.aln 04.cat 05.raxml 06.autoprune ; do
+   mkdir $ORG/$i
+done
+# 01. Download proteomes
+echo "[01/06] Downloading and guzipping data"
+RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
+rm $ORG/01.proteome/assembly_summary.txt
+for i in $ORG/01.proteome/* ; do
+   b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
+   if exists $i/*.faa.gz ; then
+      for j in $i/*.faa.gz ; do gunzip $j ; done
+      cat $i/*.faa > $ORG/01.proteome/$b.faa
+   fi
+   rm -R $i # <- Drop the downloaded directory; only the merged $b.faa (if any) is kept
+done
+# 02. Essential genes
+echo "[02/06] Idenfifying essential genes"
+N=0
+for i in $ORG/01.proteome/*.faa ; do # <- This loop could be parallelized
+   genomeA=$(basename $i .faa)
+   dir=$ORG/02.essential/$genomeA
+   mkdir $dir
+   HMM.essential.rb -i $i -m $dir/ -R $dir/log.txt -r $genomeA -t $THR
+   let N=$N+1 # <- Count the genomes processed, used below to define the core
+done
+# 03. Find core and align groups
+echo "[03/06] Identifying core essentials and aligning groups"
+CORE_ESS=$(basename -s .faa $ORG/02.essential/*/*.faa | sort | uniq -c \
+   | awk '$1=='$N'{print $2}') # <- Keep the names that occur exactly $N times
+for b in $CORE_ESS ; do # <- This loop could be parallelized
+   cat $ORG/02.essential/*/$b.faa > $ORG/03.aln/$b.faa
+   clustalo -i $ORG/03.aln/$b.faa -o $ORG/03.aln/$b.aln #--threads=$THR
+done
+# 04. Concatenate alignment
+echo "[04/06] Concatenating alignments and removing invariable sites"
+Aln.cat.rb -I -c $ORG/04.cat/essential.raxcoords -i '|' $ORG/03.aln/*.aln \
+   > $ORG/04.cat/essential.aln 2> $ORG/04.cat/essential.log
+# 05. Run RAxML
+echo "[05/06] Inferring phylogeny"
+# You REALLY should consider running the following with more threads (-T) and,
+# if possible, multi-nodes using MPI
+cd $ORG/05.raxml
+raxmlHPC-PTHREADS -T $THR -p 1234 \
+   -s ../04.cat/essential.aln -q ../04.cat/essential.raxcoords \
+   -m PROTCATGTR -n UNUS #  IMPORTANT:	Please read the documentation of RAxML
+   			 # 		before running this line, so you know
+			 #  that you're running what you really want. Check
+			 #  options for bootstrapping and the different
+			 #  algorithms (-f). Note that -m is required, but the
+			 #  file essential.raxcoords specifies "AUTO", so RAxML
+			 #  will attempt to find the model resulting in the
+			 #  highest likelihood.
+cd ../.. # <- Back to the directory the script was started from
+# 06. Autoprune
+echo "[06/06] Auto-pruning the tree"
+Newick.autoprune.R --t $ORG/05.raxml/RAxML_bestTree.UNUS --min_dist 0.001 \
+   $ORG/06.autoprune/essential-pruned.nwk

data/utils/enveomics/Examples/unus-genome-phylogeny.bash ADDED Viewed

@@ -0,0 +1,100 @@
+#!/bin/bash
+#
+# @author  Luis M. Rodriguez-R
+# @update  Oct-20-2015
+# @license artistic license 2.0
+#
+# Example pipeline: downloads the proteomes of an organism, computes reciprocal
+# best matches between them, compiles orthologous groups, aligns the groups
+# extracted in step 04, concatenates the alignments and runs RAxML on them.
+# All the output is written under a new directory named after the organism.
+#
+ORG=$1 # <- Organism (see help)
+THR=2 # <- Number of threads
+# This is just the help message
+if [[ "$ORG" == "" ]] ; then
+echo "
+Use case: Unus genome phylogeny of a species. The unus genome is the collection
+of orthologous groups in a set of genomes that has exactly one gene per genome,
+i.e., the core genome minus in-paralogs.
+IMPORTANT
+This script is functional, but it's mainly intended for illustrative purposes.
+Please take a look at the code first.
+Usage:
+$0 <organism>
+<organism>	The organism to use (e.g., Streptococcus_pneumoniae).
+" >&2
+exit
+fi
+# 00. Create environment
+export PATH=$(dirname $0)/../Scripts:$PATH
+if [[ -e $ORG ]] ; then
+   echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
+   exit 1
+fi
+mkdir $ORG
+for i in 01.proteome 02.rbm 03.ogs 04.aln 05.cat 06.raxml ; do
+   mkdir $ORG/$i
+done
+# 01. Download proteomes
+echo "[01/06] Downloading and guzipping data"
+RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
+rm $ORG/01.proteome/assembly_summary.txt
+for i in $ORG/01.proteome/* ; do
+   b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
+   for j in $i/*.faa.gz ; do gunzip $j ; done
+   cat $i/*.faa > $ORG/01.proteome/$b.faa.tmp
+   # The tagged proteome must go to $b.faa: the .tmp input is deleted right
+   # below, and every later step reads $ORG/01.proteome/*.faa
+   FastA.tag.rb -i $ORG/01.proteome/$b.faa.tmp -o $ORG/01.proteome/$b.faa -d
+   rm -R $i $ORG/01.proteome/$b.faa.tmp
+done
+# 02. Reciprocal Best Matches
+echo "[02/06] Idenfifying Reciprocal Best Matches"
+for i in $ORG/01.proteome/*.faa ; do # <- This nested loop could be parallelized
+   genomeA=$(basename $i .faa)
+   for j in $ORG/01.proteome/*.faa ; do
+      genomeB=$(basename $j .faa)
+      rbm.rb -1 $i -2 $j -t $THR > $ORG/02.rbm/$genomeA-$genomeB.rbm
+      # NOTE(review): as the last statement of the loop body this guard skips
+      # nothing, so every ordered pair (self-comparisons included) is computed.
+      # Confirm the intended skip rule before moving it above rbm.rb.
+      [[ "$i" == "$j" ]] && continue # <- Ignore if it simplifies distribution
+   done
+done
+# 03. Orthologous Groups
+echo "[03/06] Compiling Orthologous Groups"
+ogs.mcl.rb -d $ORG/02.rbm -o $ORG/03.ogs/pangenome.ogs -t $THR
+# 04. Extract unus genome and align groups
+echo "[04/06] Extracting unus genome and aligning OGs"
+ogs.extract.rb -i $ORG/03.ogs/pangenome.ogs -s $ORG/01.proteome/%s.faa \
+   -o $ORG/04.aln/ -c 1 -d 1 -p
+for i in $ORG/04.aln/*.fa ; do # <- This loop could be parallelized
+   b=$(basename $i .fa)
+   clustalo -i $i -o $ORG/04.aln/$b.aln --threads=$THR
+done
+# 05. Concatenate alignment
+echo "[05/06] Concatenating alignments and removing invariable sites"
+Aln.cat.rb -I -c $ORG/05.cat/unus.raxcoords -i - $ORG/04.aln/*.aln \
+   > $ORG/05.cat/unus.aln 2> $ORG/05.cat/unus.log
+# 06. Run RAxML
+echo "[06/06] Inferring phylogeny"
+# You REALLY should consider running the following with more threads (-T) and,
+# if possible, multi-nodes using MPI
+cd $ORG/06.raxml
+raxmlHPC-PTHREADS -T $THR -p 1234 \
+   -s ../05.cat/unus.aln -q ../05.cat/unus.raxcoords \
+   -m PROTCATGTR -n UNUS #  IMPORTANT:	Please read the documentation of RAxML
+   			 # 		before running this line, so you know
+			 # 		that you're running what you really
+			 #		want. Check options for bootstrapping
+			 #		and the different algorithms (-f). Note
+			 #		that -m is required, but the file
+			 #		unus.raxcoords specifies "AUTO", so
+			 #		RAxML will attempt to find the model
+			 #		resulting in the highest likelihood.

data/utils/enveomics/LICENSE.txt ADDED Viewed

@@ -0,0 +1,73 @@
+Artistic License 2.0
+Copyright (c) 2000-2006, The Perl Foundation.
+Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
+Preamble
+This license establishes the terms under which a given free software Package may be copied, modified, distributed, and/or redistributed. The intent is that the Copyright Holder maintains some artistic control over the development of that Package while still keeping the Package available as open source and free software.
+You are always permitted to make arrangements wholly outside of this license directly with the Copyright Holder of a given Package. If the terms of this license do not permit the full use that you propose to make of the Package, you should contact the Copyright Holder and seek a different licensing arrangement.
+Definitions
+"Copyright Holder" means the individual(s) or organization(s) named in the copyright notice for the entire Package.
+"Contributor" means any party that has contributed code or other material to the Package, in accordance with the Copyright Holder's procedures.
+"You" and "your" means any person who would like to copy, distribute, or modify the Package.
+"Package" means the collection of files distributed by the Copyright Holder, and derivatives of that collection and/or of those files. A given Package may consist of either the Standard Version, or a Modified Version.
+"Distribute" means providing a copy of the Package or making it accessible to anyone else, or in the case of a company or organization, to others outside of your company or organization.
+"Distributor Fee" means any fee that you charge for Distributing this Package or providing support for this Package to another party. It does not mean licensing fees.
+"Standard Version" refers to the Package if it has not been modified, or has been modified only in ways explicitly requested by the Copyright Holder.
+"Modified Version" means the Package, if it has been changed, and such changes were not explicitly requested by the Copyright Holder.
+"Original License" means this Artistic License as Distributed with the Standard Version of the Package, in its current version or as it may be modified by The Perl Foundation in the future.
+"Source" form means the source code, documentation source, and configuration files for the Package.
+"Compiled" form means the compiled bytecode, object code, binary, or any other form resulting from mechanical transformation or translation of the Source form.
+Permission for Use and Modification Without Distribution
+(1) You are permitted to use the Standard Version and create and use Modified Versions for any purpose without restriction, provided that you do not Distribute the Modified Version.
+Permissions for Redistribution of the Standard Version
+(2) You may Distribute verbatim copies of the Source form of the Standard Version of this Package in any medium without restriction, either gratis or for a Distributor Fee, provided that you duplicate all of the original copyright notices and associated disclaimers. At your discretion, such verbatim copies may or may not include a Compiled form of the Package.
+(3) You may apply any bug fixes, portability changes, and other modifications made available from the Copyright Holder. The resulting Package will still be considered the Standard Version, and as such will be subject to the Original License.
+Distribution of Modified Versions of the Package as Source
+(4) You may Distribute your Modified Version as Source (either gratis or for a Distributor Fee, and with or without a Compiled form of the Modified Version) provided that you clearly document how it differs from the Standard Version, including, but not limited to, documenting any non-standard features, executables, or modules, and provided that you do at least ONE of the following:
+(a) make the Modified Version available to the Copyright Holder of the Standard Version, under the Original License, so that the Copyright Holder may include your modifications in the Standard Version.
+(b) ensure that installation of your Modified Version does not prevent the user installing or running the Standard Version. In addition, the Modified Version must bear a name that is different from the name of the Standard Version.
+(c) allow anyone who receives a copy of the Modified Version to make the Source form of the Modified Version available to others under
+(i) the Original License or
+(ii) a license that permits the licensee to freely copy, modify and redistribute the Modified Version using the same licensing terms that apply to the copy that the licensee received, and requires that the Source form of the Modified Version, and of any works derived from it, be made freely available in that license fees are prohibited but Distributor Fees are allowed.
+Distribution of Compiled Forms of the Standard Version or Modified Versions without the Source
+(5) You may Distribute Compiled forms of the Standard Version without the Source, provided that you include complete instructions on how to get the Source of the Standard Version. Such instructions must be valid at the time of your distribution. If these instructions, at any time while you are carrying out such distribution, become invalid, you must provide new instructions on demand or cease further distribution. If you provide valid instructions or cease distribution within thirty days after you become aware that the instructions are invalid, then you do not forfeit any of your rights under this license.
+(6) You may Distribute a Modified Version in Compiled form without the Source, provided that you comply with Section 4 with respect to the Source of the Modified Version.
+Aggregating or Linking the Package
+(7) You may aggregate the Package (either the Standard Version or Modified Version) with other packages and Distribute the resulting aggregation provided that you do not charge a licensing fee for the Package. Distributor Fees are permitted, and licensing fees for other components in the aggregation are permitted. The terms of this license apply to the use and Distribution of the Standard or Modified Versions as included in the aggregation.
+(8) You are permitted to link Modified and Standard Versions with other works, to embed the Package in a larger work of your own, or to build stand-alone binary or bytecode versions of applications that include the Package, and Distribute the result without restriction, provided the result does not expose a direct interface to the Package.
+Items That are Not Considered Part of a Modified Version
+(9) Works (including, but not limited to, modules and scripts) that merely extend or make use of the Package, do not, by themselves, cause the Package to be a Modified Version. In addition, such works are not considered parts of the Package itself, and are not subject to the terms of this license.
+General Provisions
+(10) Any use, modification, and distribution of the Standard or Modified Versions is governed by this Artistic License. By using, modifying or distributing the Package, you accept this license. Do not use, modify, or distribute the Package, if you do not accept this license.
+(11) If your Modified Version has been derived from a Modified Version made by someone other than you, you are nevertheless required to ensure that your Modified Version complies with the requirements of this license.
+(12) This license does not grant you the right to use any trademark, service mark, tradename, or logo of the Copyright Holder.
+(13) This license includes the non-exclusive, worldwide, free-of-charge patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Package with respect to any patent claims licensable by the Copyright Holder that are necessarily infringed by the Package. If you institute patent litigation (including a cross-claim or counterclaim) against any party alleging that the Package constitutes direct or contributory patent infringement, then this Artistic License to you shall terminate on the date that such litigation is filed.
+(14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/utils/enveomics/Makefile ADDED Viewed

@@ -0,0 +1,52 @@
+# Makefile for the Enve-omics collection
+# @update Oct 13 2013
+# @author Luis M. Rodriguez-R <lmrodriguez at gmail dot com>
+#
+# NOTE(review): R, bindir and SCRIPTS are not defined in this file; they are
+# presumably provided by globals.mk -- confirm.
+# Recipes use only POSIX test syntax ([ ... ]) so that they also work when
+# make runs them with its default shell (/bin/sh) instead of bash.
+include globals.mk
+TEST=Tests
+enveomics_r=enveomics.R
+enveomics_r_v=enveomics.R_1.1.5
+.PHONY: test install install-scripts install-r uninstall install-deps
+# Run the tests in $(TEST), then check the built R package tarball
+test: $(enveomics_r_v).tar.gz
+	@echo
+	@echo Testing
+	cd $(TEST) && $(MAKE)
+	@echo
+	@echo Testing $(enveomics_r)
+	$(R) CMD check --as-cran $(enveomics_r_v).tar.gz
+install: install-r install-scripts
+# Symlink every script and the enveomics_rb library into $(bindir)
+install-scripts:
+	mkdir -p $(bindir)/lib
+	ln -s $(foreach file,$(SCRIPTS),$(shell pwd)/$(file)) $(bindir)
+	ln -s $(shell pwd)/Scripts/lib/enveomics_rb $(bindir)/lib/
+	@echo
+	@echo Important note:
+	@echo This installation has simply created symbolic links to Scripts.
+	@echo If you need to move this folder, use uninstall/install afterwards.
+	@echo
+install-r:
+	$(R) CMD INSTALL $(enveomics_r)/
+# Remove only what install created: entries that are symbolic links.
+# Every step is prefixed with '-' so a missing link does not abort the rest.
+uninstall:
+	-for file in $(foreach f,$(SCRIPTS),$(bindir)/$(notdir $f)) ; do \
+	   [ -h "$$file" ] && rm -r "$$file" ; \
+	done
+	-[ -h $(bindir)/lib/enveomics_rb ] && rm -r $(bindir)/lib/enveomics_rb
+	-$(R) CMD REMOVE $(enveomics_r)
+# Build the R package tarball and install the package
+$(enveomics_r_v).tar.gz: install-deps
+	-rm -r $(enveomics_r).tar.gz
+	./build_enveomics_r.bash
+	$(R) CMD build $(enveomics_r)/
+	$(MAKE) install-r
+# Tools needed to build and check the R package (installed through Homebrew)
+install-deps: /usr/local/bin/brew /Library/TeX/texbin/pdflatex
+	pandoc -v %%>/dev/null || brew install pandoc
+	#qpdf -v %%>/dev/null || brew install qpdf
+	[ -d /usr/local/opt/texinfo/bin ] || brew install texinfo

data/utils/enveomics/Manifest/Tasks/aasubs.json ADDED Viewed

@@ -0,0 +1,103 @@
+{
+  "tasks": [
+    {
+      "task": "AAsubs.log2ratio.rb",
+      "description": ["Estimates the log2-ratio of different amino acids in",
+        "homologous sites using an AAsubs file (see BlastPairwise.AAsubs.pl).",
+	"It provides the point estimation (.obs file), the bootstrap of the",
+	"estimation (.boot file) and the null model based on label-permutation",
+	"(.null file)."],
+      "see_also": ["BlastPairwise.AAsubs.pl"],
+      "cite": [["Konstantinidis et al, 2009, AEM",
+        "http://dx.doi.org/10.1128%2FAEM.00473-09"]],
+      "help_arg": "--help",
+      "options": [
+        {
+	  "opt": "--input",
+	  "arg": "in_file",
+	  "mandatory": true,
+	  "description": ["Input file in AAsubs format. It's a tab-delimited",
+	    "table where each line corresponds to a substitution, the first",
+	    "column corresponds to the compared protein IDs, the second",
+	    "and third columns correspond to the AA on each protein, and the",
+	    "fourth column indicates the length of the protein (not used by",
+	    "this script)."]
+	},
+	{
+	  "opt": "--obs-file",
+	  "arg": "out_file",
+	  "description": ["Output file with the log2-ratios per amino acid.",
+	    "By default, 'Input value'.obs."]
+	},
+	{
+	  "opt": "--bootstrap-file",
+	  "arg": "out_file",
+	  "description": ["Output file with the bootstrap results of",
+	    "log2-ratios per amino acid. By default, 'Input value'.boot."]
+	},
+	{
+	  "opt": "--null-file",
+	  "arg": "out_file",
+	  "description": ["Output file with the permutation results of",
+	    "log2-ratios per amino acid. By default, 'Input value'.null."]
+	},
+	{
+	  "opt": "--overwrite",
+	  "description": ["Overwrite existing files. By default, skip steps if",
+	    "the files already exist."]
+	},
+	{
+	  "opt": "--bootstraps",
+	  "arg": "integer",
+	  "default": 1000,
+	  "description": "Number of bootstraps to run."
+	},
+	{
+	  "opt": "--permutations",
+	  "arg": "integer",
+	  "default": 1000,
+	  "description": "Number of permutations to run."
+	},
+	{
+	  "opt": "--quiet",
+	  "description": "Run quietly (no STDERR output)."
+	}
+      ]
+    },
+    {
+      "task": "BlastPairwise.AAsubs.pl",
+      "description": ["Counts the different AA substitutions in the best hit",
+        "blast alignments, from a BLASTP pairwise format output (-outfmt 0 in",
+	"BLAST+, -m 0 in legacy BLAST)."],
+      "see_also": ["AAsubs.log2ratio.rb"],
+      "cite": [["Konstantinidis et al, 2009, AEM",
+        "http://dx.doi.org/10.1128%2FAEM.00473-09"]],
+      "help_arg": "",
+      "options": [
+        {
+	  "name": "Cigar char",
+	  "arg": "select",
+	  "values": ["+","_"],
+	  "mandatory": true,
+	  "description": ["Use '+' for similar substitutions, use '_' for",
+	    "non-similar substitutions."]
+	},
+	{
+	  "name": "Blast M0",
+	  "arg": "in_file",
+	  "mandatory": true,
+	  "description": "Blast in 'pairwise text' format (-outfmt/-m 0)."
+	},
+	">",
+	{
+	  "name": "AA subs",
+	  "arg": "out_file",
+	  "mandatory": true,
+	  "description": ["A tab-delimited raw file with one substitution per",
+	    "row and columns: (1) Name-of-query_Name-of-subject, (2)",
+	    "AA-in-subject, (3) AA-in-query, (4) Total-Align-Length."]
+	}
+      ]
+    }
+  ]
+}

data/utils/enveomics/Manifest/Tasks/blasttab.json ADDED Viewed

@@ -0,0 +1,703 @@
+{
+  "tasks": [
+    {
+      "task": "BlastTab.addlen.rb",
+      "description": ["Appends an extra column to a tabular BLAST with the",
+        "length of the query or the subject sequence."],
+      "help_arg": "--help",
+      "options": [
+        {
+          "opt": "--fasta",
+          "arg": "in_file",
+          "description": "FastA file of the query or the subject.",
+          "mandatory": true
+        },
+        {
+          "opt": "--subject",
+          "description": ["Use the subject column of the BLAST, by default the",
+            "query column is used."],
+          "note": "If used, the input FastA must contain subject sequences."
+        },
+        {
+          "opt": "--quiet",
+          "description": "Run quietly (no STDERR output)."
+        },
+        "<",
+        {
+          "arg": "in_file",
+          "description": "Input tabular BLAST file.",
+          "mandatory": true
+        },
+        ">",
+        {
+          "arg": "out_file",
+          "description": "Output tabular BLAST file with additional column.",
+          "mandatory": true
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.advance.bash",
+      "description": ["Calculates the percentage of a partial BLAST result.",
+        "The value produced slightly underestimates the actual advance, due to",
+        "un-flushed output and trailing queries that could be processed but",
+        "generate no results."],
+      "help_arg": "",
+      "requires": [ { "interpreter": "awk" } ],
+      "options": [
+        {
+          "name": "Blast",
+          "arg": "in_file",
+          "description": "Incomplete Tabular BLAST output.",
+          "mandatory": true
+        },
+        {
+          "name": "Query FastA",
+          "arg": "in_file",
+          "description": "FastA file with query sequences.",
+          "mandatory": true
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.best_hit_sorted.pl",
+      "description": "Filters a tabular BLAST to retain only the best matches.",
+      "help_arg": "--help",
+      "see_also": ["BlastTab.topHits_sorted.rb"],
+      "options": [
+        {
+          "name": "Sort",
+          "arg": "select",
+          "values": ["sort","cat"],
+          "mandatory": true,
+          "description": ["Use 'sort' if your BLAST is not pre-sorted by the",
+            "first column (or if you're not sure). Use 'cat' otherwise."]
+        },
+        {
+          "name": "Input BLAST",
+          "arg": "in_file",
+          "multiple_sep": " ",
+          "mandatory": true,
+          "description": "Tabular BLAST file to filter."
+        },
+        "|",
+        { "arg": "task" },
+        ">",
+        {
+          "name": "Output BLAST",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": "Filtered tabular BLAST output."
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.catsbj.pl",
+      "description": ["Generates a list of hits from a BLAST result",
+        "concatenating the subject sequences. This can be used, e.g., to",
+        "analyze BLAST results against draft genomes. This script creates two",
+        "files using <map.bls> as prefix with extensions .rec (for the",
+        "recruitment plot) and .lim (for the limits of the different sequences",
+        "in <seq.fa>)."],
+      "help_arg": "-h",
+      "options": [
+         {
+           "opt": "-i",
+           "name": "Identity",
+           "description": "Minimum identity (in %) to report a result.",
+           "arg": "float",
+           "default": 70.0
+         },
+         {
+           "opt": "-l",
+           "name": "Length",
+           "description": "Minimum alignment length to report a result.",
+           "default": 60.0,
+           "arg": "float"
+         },
+         {
+           "opt": "-s",
+           "name": "Subset",
+           "description": ["The FastA provided is to be treated as a subset of",
+             "the subject. By default, it expects all the subjects to be",
+             "present in the BLAST."]
+         },
+         {
+           "opt": "-q",
+           "name": "Quiet",
+           "description": "Run quietly."
+         },
+         {
+           "name": "seq.fa",
+           "description": "Subject sequences (ref) in FastA format.",
+           "mandatory": true,
+           "arg": "in_file"
+         },
+         {
+           "name": "map.bls",
+           "description": ["Mapping of the reads to the reference in Tabular",
+             "BLAST format."],
+           "mandatory": true,
+           "arg": "in_file"
+         }
+      ]
+    },
+    {
+      "task": "BlastTab.cogCat.rb",
+      "description": ["Replaces the COG gene IDs in a BLAST with the COG",
+        "category."],
+      "help_arg": "--help",
+      "options": [
+        {
+          "opt": "--whog",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "COG's 'whog' file."
+        },
+        {
+          "opt": "--blast",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "Tabular BLAST file with COG IDs as subject."
+        },
+        {
+          "opt": "--cog",
+          "description": "If set, returns the COG ID, not the COG category."
+        },
+        {
+          "opt": "--desc",
+          "description": "Includes COG description (requires --cog)."
+        },
+        {
+          "opt": "--noverbose",
+          "description": "Run quietly, but show warnings."
+        },
+        {
+          "opt": "--quiet",
+          "description": "Run quietly."
+        },
+        ">",
+        {
+          "arg": "out_file",
+          "name": "COG Blast",
+          "mandatory": true,
+          "description": "Tabular BLAST with COG ID's or categories as subject."
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.filter.pl",
+      "description": ["Extracts a subset of hits (queries or subjects) from a",
+        "tabular BLAST."],
+      "help_arg": "",
+      "see_also": "BlastTab.subsample.pl",
+      "options": [
+        {
+          "name": "Subject",
+          "opt": "-s",
+          "description": ["If set, assumes that list.txt contains subject IDs.",
+            "By default: assumes query IDs."]
+        },
+        {
+          "name": "Inverse",
+          "opt": "-i",
+          "description": ["If set, reports the inverse of the list (i.e.,",
+            "reports only hits absent in the list)."]
+        },
+        {
+          "name": "list.txt",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "List of IDs to extract."
+        },
+        {
+          "name": "blast.txt",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "Tabular BLAST file containing the superset of hits."
+        },
+        ">",
+        {
+          "name": "subset.txt",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": "Tabular BLAST file to be created."
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.pairedHits.rb",
+      "description": "Identifies the best hits of paired-reads.",
+      "help_arg": "--help",
+      "options": [
+        {
+          "opt": "--blast",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "Input Tabular BLAST file.",
+          "note": ["This script assumes that paired hits are next to each",
+            "other. If this is not the case (e.g., because the blast was",
+            "concatenated), you must sort the input before running this",
+            "script."]
+        },
+        {
+          "name": "Min score",
+          "opt": "--minscore",
+          "arg": "float",
+          "default": 0.0,
+          "description": "Minimum (summed) Bit-Score to consider a pair-match."
+        },
+        {
+          "name": "Best hits",
+          "opt": "--besthits",
+          "arg": "integer",
+          "default": 0,
+          "description": ["Outputs top best-hits only (use 0 to output all the",
+            "paired hits)."]
+        },
+        {
+          "name": "Orientation",
+          "opt": "--orient",
+          "arg": "select",
+          "values": [0,1,2,3,4],
+          "default": 0,
+          "description": ["Checks the orientation of the hit. Values are: 0,",
+            "no checking; 1, same direction; 2, inwards; 3, outwards; 4,",
+            "different direction (i.e., 2 or 3)."]
+        },
+        {
+          "name": "Sister prefix",
+          "opt": "--sisprefix",
+          "arg": "string",
+          "default": "_",
+          "description": ["Sister read number prefix in the name of the reads.",
+            "Escape characters such as dots (\\.), parentheses (\\(, \\), \\[,",
+            "\\]), other characters with special meaning in regular",
+	    "expressions (\\*, \\+, \\^, \\$, \\|). This prefix allows regular",
+	    "expressions (for example, use ':|\\.' to use any of colon or",
+	    "dot). Note that the prefix will not be included in the base name",
+	    "reported in the output."]
+        },
+        ">",
+        {
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["Tab-delimited flat file, with the following",
+            "columns: (1) Query ID (without the \"sister\" identifier). (2)",
+            "Subject ID. (3) Bit score (summed from both sister reads). (4/5)",
+            "From/To (subject) coordinates for read 1. (6/7) From/To (subject)",
+            "coordinates for read 2. (8) Reads orientation (1: same direction,",
+            "2: inwards, 3: outwards). (9) Estimated insert size."]
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.seqdepth.pl",
+      "description": "Estimates the sequencing depth of subject sequences.",
+      "help_arg": "",
+      "see_also": ["BlastTab.seqdepth_ZIP.pl", "BlastTab.seqdepth_nomedian.pl"],
+      "options": [
+        "cat",
+        {
+          "arg": "in_file",
+          "multiple_sep": " ",
+          "mandatory": true,
+          "description": ["One or more Tabular BLAST files of reads vs genes",
+            "(or contigs)."]
+        },
+        "|",
+	{ "arg": "task" },
+        {
+          "name": "genes_or_ctgs.fna",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": ["A FastA file containing the genes or the contigs",
+            "(db)."]
+        },
+        ">",
+        {
+          "name": "genes_or_ctgs.cov",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["A tab-delimited file with the following columns:",
+            "(1) Subject ID. (2) Average sequencing depth. (3) Median",
+            "sequencing depth. (4) Number of mapped reads. (5) Length of the",
+            "subject sequence."]
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.seqdepth_ZIP.pl",
+      "description": ["Estimates the average sequencing depth of subject",
+        "sequences (genes or contigs) assuming a Zero-Inflated Poisson",
+        "distribution (ZIP) to correct for non-covered positions. It uses the",
+        "corrected method of moments estimators (CMMEs) as described by",
+        "Beckett et al [1]. Note that [1] has a mistake in eq. (2.4), that",
+        "should be: pi-hat-MM = 1 - (X-bar / lambda-hat-MM). Also note that a",
+        "more elaborated mixture distribution can arise from coverage",
+        "more elaborate mixture distribution can arise from coverage",
+        "distribution' and mixtures involving negative binomial) so take these",
+        "results cum grano salis.\n [1]",
+        "http://anisette.ucs.louisiana.edu/Academic/Sciences/MATH/stage/stat2012.pdf\n",
+        "[2] Lindner et al, Bioinformatics, 2013."],
+      "help_arg": "",
+      "see_also": ["BlastTab.seqdepth.pl", "BlastTab.seqdepth_nomedian.pl"],
+      "options": [
+        "cat",
+        {
+          "name": "blast",
+          "arg": "in_file",
+          "multiple_sep": " ",
+          "mandatory": true,
+          "description": ["One or more Tabular BLAST files of reads vs genes",
+            "(or contigs)."]
+        },
+        "|",
+        { "arg": "task" },
+        {
+          "name": "genes_or_ctgs.fna",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": ["A FastA file containing the genes or the contigs",
+            "(db)."]
+        },
+        ">",
+        {
+          "name": "genes_or_ctgs.cov",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["Output file with the following columns:",
+            "(1) Subject ID.",
+            "(2) Estimated average sequencing depth (CMME lambda).",
+            "(3) Zero-inflation (CMME pi).",
+            "(4) Observed average sequencing depth.",
+            "(5) Observed median sequencing depth.",
+            "(6) Observed median sequencing depth excluding zeroes.",
+            "(7) Number of mapped reads.",
+            "(8) Length of the subject sequence."]
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.seqdepth_nomedian.pl",
+      "description": ["Estimates the sequencing depth of subject",
+        "sequences. The values reported by this script may differ from those",
+	"of BlastTab.seqdepth.pl, because this script uses the aligned length",
+	"of the read while BlastTab.seqdepth.pl uses the aligned length of the",
+	"subject sequence."],
+      "help_arg": "",
+      "see_also": ["BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"],
+      "options": [
+        "cat",
+	{
+	  "arg": "in_file",
+          "multiple_sep": " ",
+          "mandatory": true,
+          "description": ["One or more Tabular BLAST files of reads vs genes",
+            "(or contigs)."]
+	},
+	"|",
+	{ "arg": "task" },
+        {
+          "name": "genes_or_ctgs.fna",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": ["A FastA file containing the genes or the contigs",
+            "(db)."]
+        },
+        ">",
+        {
+          "name": "genes_or_ctgs.cov",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["A tab-delimited file with the following columns:",
+            "(1) Subject ID. (2) Average sequencing depth. (3) Number of",
+	    "mapped reads. (4) Length of the subject sequence."]
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.subsample.pl",
+      "description": ["Filters a BLAST output including only the hits produced",
+        "by any of the given sequences as query."],
+      "help_arg": "",
+      "see_also": "BlastTab.filter.pl",
+      "options": [
+        {
+	  "name": "blast.tab",
+	  "mandatory": true,
+	  "arg": "in_file",
+	  "description": "BLAST output to be filtered (tabular format)."
+	},
+	{
+	  "name": "sample.fa",
+	  "mandatory": true,
+	  "arg": "in_file",
+	  "description": "Sequences to use as query (FastA format)."
+	},
+	">",
+	{
+	  "arg": "out_file",
+	  "mandatory": true,
+	  "description": "The filtered BLAST output (tabular format)."
+	}
+      ]
+    },
+    {
+      "task": "BlastTab.sumPerHit.pl",
+      "description": ["Sums the weights of all the queries hitting each",
+        "subject. Often (but not necessarily) the BLAST files contain only",
+        "best matches. The weights can be any number, but a common use of this",
+        "script is to add up counts (weights are integers). For example, in a",
+        "BLAST of predicted genes vs some annotation source, the weights could",
+        "be the number of reads recruited by each gene."],
+      "help_arg": "-h",
+      "options": [
+        {
+          "name": "Weights file",
+          "opt": "-w",
+          "arg": "in_file",
+          "description": ["A two-columns tab-delimited file containing the",
+            "name (column 1) and the weight (column 2) of each query."]
+        },
+        {
+          "name": "Minimum score",
+          "opt": "-s",
+          "arg": "float",
+          "default": 0.0
+        },
+        {
+          "name": "Minimum identity (%)",
+          "opt": "-i",
+          "arg": "float",
+          "default": 0.0
+        },
+        {
+          "name": "Queries",
+          "opt": "-m",
+          "arg": "integer",
+          "default": 0,
+          "description": "Maximum number of queries. Set to 0 for all."
+        },
+        {
+          "name": "Normalize",
+          "opt": "-n",
+          "description": "Normalize weights by the number of hits per query."
+        },
+        {
+          "name": "Include zeroes",
+          "opt": "-z",
+          "description": ["Add zero when weight is not found (by default:",
+            "doesn't list them)."]
+        },
+        {
+          "name": "Run quietly",
+          "opt": "-q"
+        },
+        {
+          "name": "blast",
+          "arg": "in_file",
+          "multiple_sep": " ",
+          "mandatory": true,
+          "description": "One or more BLAST files."
+        },
+        ">",
+        {
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["A two-columns tab-delimited file containing the",
+            "summed weights per hit."]
+        }
+      ]
+    },
+    {
+      "task": "BlastTab.taxid2taxrank.pl",
+      "description": ["Takes a BLAST with NCBI Taxonomy IDs as subjects and",
+        "replaces them by names at a given taxonomic rank."],
+      "help_arg": "",
+      "options": [
+        {
+	  "name": "tax_blast.txt",
+	  "mandatory": true,
+	  "arg": "in_file",
+	  "description": ["BLAST output, where subject IDs are NCBI Taxonomy",
+	    "IDs."]
+	},
+	{
+	  "name": "nodes.dmp",
+	  "mandatory": true,
+	  "arg": "in_file",
+	  "description": "Nodes file from NCBI Taxonomy.",
+	  "source_url": "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
+	},
+	{
+	  "name": "names.dmp",
+	  "mandatory": true,
+	  "arg": "in_file",
+	  "description": "Names file from NCBI Taxonomy.",
+	  "source_url": "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
+	},
+	{
+	  "name": "rank",
+	  "arg": "string",
+	  "mandatory": true,
+	  "default": "genus",
+	  "description": ["The rank to be reported. All the reported nodes",
+	    "will have the same rank. To see supported values, run:\n",
+	    "`cut -f 5 nodes.dmp | sort -u`."]
+	},
+	{
+	  "name": "Best-hit",
+	  "arg": "select",
+	  "values": ["yes", "no"],
+	  "default": "yes",
+	  "description": ["Should it take into account the best hit per query",
+	    "only? This is: should it filter by best-hit?"]
+	},
+	">",
+	{
+	  "name": "taxrank_list.txt",
+	  "arg": "out_file",
+	  "mandatory": true,
+	  "description": ["BLAST-like output, where subject IDs are Taxonomy",
+	    "names."]
+	}
+      ]
+    },
+    {
+      "task": "BlastTab.topHits_sorted.rb",
+      "description": "Reports the top-N best hits of a BLAST.",
+      "help_arg": "--help",
+      "see_also": "BlastTab.best_hit_sorted.pl",
+      "options": [
+        {
+          "name": "Sort",
+          "arg": "select",
+          "values": ["sort","cat"],
+          "mandatory": true,
+          "description": ["Use 'sort' if your BLAST is not pre-sorted by the",
+            "first column (or if you're not sure). Use 'cat' otherwise."]
+	},
+	{
+	  "arg": "in_file",
+	  "mandatory": true,
+	  "description": "Tabular BLAST file."
+	},
+	"|",
+	{ "arg": "task" },
+	"--blast",
+	"/dev/stdin",
+	{
+	  "opt": "--top",
+	  "arg": "integer",
+	  "default": 5,
+	  "description": "Maximum number of hits to report for each query."
+	},
+	{
+	  "opt": "--sort-by",
+	  "arg": "select",
+	  "values": ["bitscore", "evalue", "identity", "length"],
+	  "default": "bitscore",
+	  "description": "Parameter used to detect the 'best' hits."
+	},
+	{
+	  "opt": "--quiet",
+	  "description": "Run quietly."
+	},
+	">",
+	{
+	  "arg": "out_file",
+	  "mandatory": true,
+	  "description": "Output (filtered) Tabular BLAST."
+	}
+      ]
+    },
+    {
+      "task": "BlastTab.recplot2.R",
+      "description": ["Produce recruitment plot objects provided that",
+        "BlastTab.catsbj.pl has been previously executed."],
+      "help_arg": "--help",
+      "requires": [
+        { "r_package": "optparse" },
+        { "r_package": "enveomics.R" }
+      ],
+      "options": [
+        {
+	  "opt": "--prefix",
+	  "arg": "in_file",
+	  "mandatory": true,
+	  "description": ["Path to the prefix of the BlastTab.catsbj.pl output",
+	    "files. At least the files .rec and .lim must exist with this",
+	    "prefix."]
+	},
+	{
+	  "opt": "--pos-breaks",
+	  "arg": "integer",
+	  "default": 1000,
+	  "description": ["Breaks in the positions histogram."]
+	},
+	{
+	  "opt": "--id-breaks",
+	  "arg": "integer",
+	  "default": 300,
+	  "description": ["Breaks in the identity histogram."]
+	},
+	{
+	  "opt": "--id-metric",
+	  "arg": "select",
+	  "values": ["identity", "corrected identity", "bit score"],
+	  "default": "identity",
+	  "description": ["Metric of identity to be used (Y-axis). Corrected",
+	    "identity is only supported if the original BLAST file included",
+	    "sequence lengths."]
+	},
+	{
+	  "opt": "--id-summary",
+	  "arg": "string",
+	  "default": "sum",
+	  "description": "Function summarizing the identity bins."
+	},
+	{
+	  "opt": "--id-cutoff",
+	  "arg": "float",
+	  "default": 95.0,
+	  "description": ["Cutoff of identity metric above which the hits are",
+	    "considered 'in-group'. The 95% identity corresponds to the",
+	    "expectation of ANI>95% within species."]
+	},
+	{
+	  "opt": "--threads",
+	  "arg": "integer",
+	  "default": 2,
+	  "description": "Number of threads to use."
+	},
+	{
+	  "opt": "--no-verbose",
+	  "description": "If set, the function does not report the advance."
+	},
+	{
+	  "name": "R Object Output",
+	  "arg": "out_file",
+	  "mandatory": true,
+	  "description": ["Recplot2 object that can be re-plotted using",
+	    "R function plot."]
+	},
+	{
+	  "name": "Graphical Output",
+	  "arg": "out_file",
+	  "description": "Recruitment plot in PDF."
+	},
+	{
+	  "name": "Width",
+	  "arg": "float",
+	  "description": "Width of the plot in inches (7 by default)."
+	},
+	{
+	  "name": "Height",
+	  "arg": "float",
+	  "description": "Height of the plot in inches (7 by default)."
+	}
+      ]
+    }
+  ]
+}