miga-base 1.2.17.0 → 1.2.17.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +2 -2
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI +3659 -0
- data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
- data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +165 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +906 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +356 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +165 -0
- data/utils/enveomics/Manifest/examples.json +162 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +138 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +67 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +421 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +108 -0
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +167 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +203 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +141 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +192 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +349 -0
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +419 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1698 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +638 -0
- data/utils/enveomics/enveomics.R/R/utils.R +90 -0
- data/utils/enveomics/enveomics.R/README.md +81 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +111 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +34 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +63 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +78 -0
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +147 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +52 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +34 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +31 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +81 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +49 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +48 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +22 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- metadata +268 -6
@@ -0,0 +1,88 @@
|
|
1
|
+
#!/usr/bin/env ruby

#
# @author: Luis M. Rodriguez-R
# @update: Mar-23-2015
# @license: artistic license 2.0
#
# Attaches annotations to a table of pre-computed Orthology Groups (OGs).
#
# The OG table (-i) is read as tab-delimited text: its first line is handed
# to every OG as the header, and each following line becomes one OG. Each
# annotation file (-a) is also tab-delimited; the first column is used as the
# gene and the second as the annotation text. The genome name of an
# annotation file is the first capture group of the -f regular expression
# applied to the file path. The annotated collection is written to -o.
#

$:.push File.expand_path(File.dirname(__FILE__) + '/lib')
require 'enveomics_rb/og'
require 'optparse'

# Defaults. The lowercase literals are used because the TRUE/FALSE constants
# are deprecated and no longer defined in current Ruby releases.
o = {:q=>false, :f=>"(\\S+)\\.txt", :consolidate=>true, :pre=>[]}
ARGV << '-h' if ARGV.size==0
OptionParser.new do |opts|
  opts.banner = "
Annotates Orthology Groups (OGs) using one or more reference genomes.

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-i", "--in FILE", "Input file containing the OGs (as generated by ogs.rb)."){ |v| o[:in]=v }
  opts.on("-o", "--out FILE", "Output file containing the annotated OGs."){ |v| o[:out]=v }
  opts.on("-a FILE1,FILE2,...", Array, "Input file(s) containing the annotations. One or more tab-delimited files",
    "with the gene names in the first column and the annotation in the second."){ |v| o[:annotations]=v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-f","--format STRING", "Format of the filenames for the annotation files, using regex syntax.",
    "By default: '#{o[:f]}'."){ |v| o[:f]=v }
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-i is mandatory" if o[:in].nil?
abort "-o is mandatory" if o[:out].nil?
abort "-a is mandatory" if o[:annotations].nil?

##### MAIN:
begin
  # Read the pre-computed OGs
  collection = OGCollection.new
  $stderr.puts "Reading pre-computed OGs in '#{o[:in]}'." unless o[:q]
  File.open(o[:in], 'r') do |f|
    # The first line is the header shared by every OG row
    h = f.gets.chomp.split(/\t/)
    while ln = f.gets
      collection << OG.new(h, ln.chomp.split(/\t/))
    end
  end
  $stderr.puts " Loaded OGs: #{collection.ogs.length}." unless o[:q]

  # Read annotations
  o[:annotations].each do |annot|
    # The first capture group of the filename format is the genome name
    m = /#{o[:f]}/.match(annot)
    if m.nil? or m[1].nil?
      warn "Cannot parse filename: #{annot} (doesn't match /#{o[:f]}/)."
      next
    end
    # Block form guarantees the handle is closed once the file is consumed
    File.open(annot, 'r') do |f|
      no_og = 0
      collection.add_note_src m[1]+' annotation'
      while ln=f.gets
        r = ln.chomp.split(/\t/)
        g = Gene.new m[1], r[0]
        og = collection.get_og g
        if og.nil?
          # Gene not present in any loaded OG: only counted, reported below
          no_og += 1
        else
          # The note is filed under the note source registered just above
          og.add_note g.id + ': ' + r[1], collection.note_srcs.length-1
        end
      end
      warn "Warning: Cannot find #{no_og} genes from #{m[1]} in OG collection." if no_og>0
    end
  end

  # Save the output matrix
  $stderr.puts "Saving annotated OGs into '#{o[:out]}'." unless o[:q]
  File.open(o[:out], "w") do |f|
    f.puts collection.to_s
  end
  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  # Report the failure and its backtrace on STDERR; the exception is not
  # re-raised.
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end
|
87
|
+
|
88
|
+
|
@@ -0,0 +1,160 @@
|
|
1
|
+
#!/usr/bin/env ruby

#
# @author: Luis M. Rodriguez-R
# @license: artistic-2.0
#
# Estimates core-genome and pangenome size trends by subsampling genomes.
#
# The input table (--ogs) is tab-delimited: the number of columns in the
# first line is the number of genomes, and every following line is one OG in
# which a cell equal to "-" marks a genome without genes in that OG. Each
# replicate starts from all genomes and removes them one at a time in random
# order, counting at every step the OGs found in all remaining genomes (core)
# and in at least one of them (pan). Replicates run in forked child
# processes that hand their results back through files in a temporary
# directory.
#

$:.push File.expand_path("../lib", __FILE__)
require "optparse"
require "json"
require "tmpdir"

o = {q:false, n:100, thr:2}
ARGV << "-h" if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Subsamples the genomes in a set of Orthology Groups (OGs) and estimates the
trend of core genome and pangenome sizes.

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-o", "--ogs FILE",
    "Input file containing the precomputed OGs."){ |v| o[:ogs]=v }
  opts.separator ""
  opts.separator "Output Options"
  opts.on("-s", "--summary FILE",
    "Output file in tabular format with summary statistics."){ |v| o[:summ]=v }
  opts.on("-t", "--tab FILE","Output file in tabular format."){ |v| o[:tab]=v }
  opts.on("-j", "--json FILE", "Output file in JSON format."){ |v| o[:json]=v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-n", "--replicates INT",
    "Number of replicates to estimate. By default: #{o[:n]}."
    ){ |v| o[:n]=v.to_i }
  opts.on("--threads INT",
    "Children threads to spawn. By default: #{o[:thr]}."){ |v| o[:thr]=v.to_i}
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-o is mandatory" if o[:ogs].nil?

##### MAIN:
begin
  # Load the OG table as one presence/absence row per OG
  $stderr.puts "Reading pre-computed OGs in '#{o[:ogs]}'." unless o[:q]
  presence = []
  genomes_n = nil
  File.open(o[:ogs], "r") do |ifh|
    # Only the column count of the header line is needed
    genomes_n = ifh.gets.chomp.split("\t").size
    ifh.each_line do |row|
      presence << row.chomp.split("\t").map { |cell| cell != "-" }
    end
  end
  $stderr.puts " Loaded OGs: #{presence.size}." unless o[:q]
  # Encode every OG as an integer bitmask, one bit per genome column
  masks = presence.map do |og|
    og.map { |present| present ? "1" : "0" }.join("").to_i(2)
  end

  # Run the replicates, each one in its own child process
  size = {core:[], pan:[]}
  Dir.mktmpdir do |dir|
    running = 0
    o[:n].times do |rep|
      fork do
        # Random removal order and the bitmask of genomes still included
        order = (0 ... genomes_n).to_a.shuffle
        remaining = (2 ** genomes_n) - 1
        core = []
        pan = []
        until order.empty?
          # New counts go to the front, so the finished arrays run from the
          # smallest genome subset to the full set
          core.unshift 0
          pan.unshift 0
          # Count the OGs seen in the remaining genomes and drop the rest,
          # since they cannot reappear in a smaller subset
          masks.select! do |og|
            shared = og & remaining
            next false unless shared > 0
            core[0] += 1 if shared == remaining
            pan[0] += 1
            true
          end
          remaining ^= 2 ** order.pop
        end
        abort "UNEXPECTED ERROR: Final genomes_b=#{remaining}." if remaining > 0
        # Hand the trajectory back to the parent through a per-replicate file
        File.open("#{dir}/#{rep}", "w") do |tfh|
          tfh.puts JSON.generate({core:core, pan:pan})
        end
      end # fork
      # Once the limit of simultaneous children is reached, wait for one
      running += 1
      next if running < o[:thr]
      Process.wait
      running -= 1
    end
    Process.waitall
    # Collect the trajectories written by the children
    o[:n].times do |rep|
      traj = JSON.parse(File.read("#{dir}/#{rep}"), {:symbolize_names=>true})
      size[:core][rep] = traj[:core]
      size[:pan][rep] = traj[:pan]
    end
  end # Dir.mktmpdir

  # Reports
  $stderr.puts "Generating reports." unless o[:q]

  # JSON report: all trajectories
  if o[:json]
    File.open(o[:json], "w") do |ofh|
      ofh.puts JSON.pretty_generate(size)
    end
  end

  # Tabular report: two rows (core and pan) per replicate
  if o[:tab]
    File.open(o[:tab], "w") do |ofh|
      ofh.puts((%w{replicate metric} + (1 .. genomes_n).to_a).join("\t"))
      o[:n].times do |rep|
        ofh.puts(([rep+1, "core"] + size[:core][rep]).join("\t"))
        ofh.puts(([rep+1, "pan"] + size[:pan][rep]).join("\t"))
      end
    end
  end

  # Summary report: one row per number of genomes
  if o[:summ]
    File.open(o[:summ], "w") do |ofh|
      ofh.puts %w{genomes core_avg core_sd core_q1 core_q2 core_q3
        pan_avg pan_sd pan_q1 pan_q2 pan_q3}.join("\t")
      genomes_n.times do |i|
        row = [ i+1 ]
        [:core, :pan].each do |met|
          # Sorted values of this metric at position i across replicates
          vals = size[met].map { |traj| traj[i] }.sort
          mean = vals.inject(0, :+).to_f / vals.size
          mean_sq = vals.map { |v| v**2 }.inject(0, :+).to_f / vals.size
          # Standard deviation from the mean of squares minus squared mean
          sd = Math.sqrt(mean_sq - mean**2)
          # Quartiles are read directly from the sorted values
          quartiles = [1, 2, 3].map { |k| vals[ vals.size*k/4 ] }
          row += [mean, sd] + quartiles
        end
        ofh.puts row.join("\t")
      end
    end
  end

  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  # Print the error and its backtrace on STDERR without re-raising
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end
|
160
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# @author Luis M. Rodriguez-R
# @license artistic license 2.0
#
# Extracts the sequences of Orthology Groups (OGs) from FastA files.
#
# The OG table (-i) is read as tab-delimited text: its first line is handed
# to every OG as the header, and each following line becomes one OG. The
# collection can be filtered with --core and --duplicates. For every genome,
# the FastA file at the --seqs path (with '%s' replaced by the genome name)
# is scanned, and each entry whose identifier belongs to an OG is copied
# into the output directory (-o), into one file per OG or, with
# --per-genome, one file per genome.
#

$:.push File.expand_path("lib", File.dirname(__FILE__))
# Required explicitly: OptionParser is used below
require "optparse"
require "enveomics_rb/enveomics"
require "enveomics_rb/og"

o = {q:false, pergenome:false, prefix:false, first:false, rand:false,
  core:0.0, dups:0}
OptionParser.new do |opts|
  opts.banner = "
Extracts sequences of Orthology Groups (OGs) from genomes (proteomes).

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-i", "--in FILE",
    "Input file containing the OGs (as generated by ogs.rb)."){ |v| o[:in]=v }
  opts.on("-o", "--out FILE",
    "Output directory where to place extracted sequences."){ |v| o[:out]=v }
  opts.on("-s", "--seqs STRING",
    "Path to the proteomes in FastA format, using '%s' to denote the genome.",
    "For example: /path/to/seqs/%s.faa."){ |v| o[:seqs]=v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-c", "--core FLOAT",
    "Use only OGs present in at least this fraction of the genomes.",
    "To use only the strict core genome*, use -c 1."){ |v| o[:core]=v.to_f }
  opts.on("-d", "--duplicates INT",
    "Use only OGs with less than this number of in-paralogs in a genome.",
    "To use only genes without in-paralogs*, use -d 1."
    ){ |v| o[:dups]=v.to_i }
  opts.on("-g", "--per-genome",
    "If set, the output is generated per genome.",
    "By default, the output is per OG."){ |v| o[:pergenome]=v }
  opts.on("-p", "--prefix",
    "If set, each sequence is prefixed with the genome name",
    "(or OG number, if --per-genome) and a dash."){ |v| o[:prefix]=v }
  opts.on("-r", "--rand",
    "Get only one gene per genome per OG (random) regardless of in-paralogs.",
    "By default all genes are extracted."){ |v| o[:rand]=v }
  opts.on("-f", "--first",
    "Get only one gene per genome per OG (first) regardless of in-paralogs.",
    "By default all genes are extracted. Takes precedence over --rand."
    ){ |v| o[:first]=v }
  # Lowercase literal: the TRUE constant is deprecated and no longer defined
  # in current Ruby releases
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
  opts.separator " * To use only the unus genome (OGs with exactly one " +
    "gene per genome), use: -c 1 -d 1."
  opts.separator ""
end.parse!
abort "-i is mandatory" if o[:in].nil?
abort "-o is mandatory" if o[:out].nil?
abort "-s is mandatory" if o[:seqs].nil?

##### MAIN:
begin
  # Read the pre-computed OGs
  collection = OGCollection.new
  $stderr.puts "Reading pre-computed OGs in '#{o[:in]}'." unless o[:q]
  File.open(o[:in], "r") do |f|
    # The first line is the header shared by every OG row
    h = f.gets.chomp.split(/\t/)
    while ln = f.gets
      collection << OG.new(h, ln.chomp.split(/\t/))
    end
  end
  $stderr.puts " Loaded OGs: #{collection.ogs.size}." unless o[:q]
  $stderr.puts " Reported Genomes: #{Gene.genomes.size}." unless o[:q]

  # Filter core/in-paralog genes
  collection.filter_core! o[:core] unless o[:core]==0.0
  collection.remove_inparalogs! o[:dups] unless o[:dups]==0
  $stderr.puts " Filtered OGs: #{collection.ogs.size}." unless
    o[:q] or o[:core]==0.0

  # Open outputs: one handle per genome (--per-genome) or one per OG
  $stderr.puts "Initializing output files." unless o[:q]
  Dir.mkdir(o[:out]) unless Dir.exist? o[:out]
  ofhs = o[:pergenome] ?
    Gene.genomes.map{|g| File.open("#{o[:out]}/#{g}.fa", "w")} :
    ( (1 .. collection.ogs.size).map do |og|
      File.open("#{o[:out]}/OG#{og}.fa", "w")
    end )
  $stderr.puts " Created files: #{ofhs.size}." unless o[:q]

  # Read genomes
  $stderr.puts "Filtering genes." unless o[:q]
  genome_i = -1
  Gene.genomes.each do |genome|
    genome_i = Gene.genomes.index(genome)
    $stderr.print " Genome #{genome_i+1}. \r" unless o[:q]
    # Genes to extract from this genome, one list per OG; --first and --rand
    # reduce each list to a single gene
    genes = ( collection.get_genome_genes(genome).map do |og|
      o[:first] ? [og.first] : (o[:rand] ? [og.sample] : og)
    end )
    # Index in ofhs of the output for the current entry (nil = skip it)
    hand = nil
    # Block form guarantees the input handle is closed after each genome
    File.open(sprintf(o[:seqs], genome), "r") do |ifh|
      ifh.each do |ln|
        if ln =~ /^>(\S+)/
          # Header line: find which OG (if any) lists this sequence ID
          seq_id = $1
          og = genes.index{|g| g.include? seq_id}
          hand = og.nil? ? nil : ( o[:pergenome] ? genome_i : og )
          ln.sub!(/^>/, ">#{o[:pergenome] ? "OG#{og}" : genome}-") if
            o[:prefix] and not hand.nil?
        end
        # Header and sequence lines follow the handle chosen at the header
        ofhs[hand].puts(ln) unless hand.nil?
      end
    end
  end
  $stderr.puts " #{genome_i+1} genomes processed." unless o[:q]

  # Close outputs
  $stderr.puts "Closing output files." unless o[:q]
  ofhs.each{|h| h.close}
  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  # Report the failure and its backtrace on STDERR; the exception is not
  # re-raised.
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end
|
124
|
+
|
125
|
+
|
@@ -0,0 +1,186 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author: Luis M. Rodriguez-R
|
5
|
+
# @update: Sep-11-2015
|
6
|
+
# @license: artistic license 2.0
|
7
|
+
#
|
8
|
+
|
9
|
+
$:.push File.expand_path(File.dirname(__FILE__) + "/lib")
|
10
|
+
require 'enveomics_rb/og'
|
11
|
+
require 'optparse'
|
12
|
+
require 'tmpdir'
|
13
|
+
|
14
|
+
# Default settings; each key can be overridden from the command line below.
o = {
  q: false, f: "(\\S+)-(\\S+)\\.rbm", mcl: "", inflation: 1.5, blind: false,
  evalue: false, thr: 2, identity: false, bestmatch: false
}

# Without any argument, behave as if the help screen had been requested.
ARGV.push("-h") if ARGV.empty?

cli = OptionParser.new do |parser|
  parser.banner = "
Identifies Orthology Groups (OGs) in Reciprocal Best Matches (RBM)
between all pairs in a collection of genomes, using the Markov Cluster
Algorithm.

Requires MCL (see http://www.micans.org/mcl).

Usage: #{$0} [options]"
  parser.separator ""
  parser.separator "Mandatory"
  parser.on(
    "-o", "--out FILE",
    "Output file containing the detected OGs."
  ) { |path| o[:out] = path }
  parser.on(
    "-d", "--dir DIR",
    "Directory containing the RBM files.",
    "Becomes optional iff --abc is set to a non-empty file."
  ) { |path| o[:dir] = path }
  parser.separator ""
  parser.separator "Other Options"
  parser.on(
    "-f", "--format STRING",
    "Format of the filenames for the RBM files (within -d), using regex syntax.",
    "By default: '#{o[:f]}'."
  ) { |pattern| o[:f] = pattern }
  parser.on(
    "-I", "--inflation FLOAT",
    "Inflation parameter for MCL clustering. By default: #{o[:inflation]}."
  ) { |value| o[:inflation] = value.to_f }
  parser.on(
    "-b", "--blind",
    "If set, computes clusters without taking bitscore into account."
  ) { |flag| o[:blind] = flag }
  parser.on(
    "-e", "--evalue",
    "If set, uses the e-value to weight edges, instead of the default Bit-Score."
  ) { |flag| o[:evalue] = flag }
  parser.on(
    "-i", "--identity",
    "If set, uses the identity to weight edges, instead of the default Bit-Score."
  ) { |flag| o[:identity] = flag }
  parser.on(
    "-B", "--best-match",
    "If set, it assumes best-matches instead reciprocal best matches."
  ) { |flag| o[:bestmatch] = flag }
  # The trailing slash lets the main routine prepend o[:mcl] directly to the
  # binary names.
  parser.on(
    "-m", "--mcl-bin DIR",
    "Path to the directory containing the mcl binaries.",
    "By default, assumed to be in the PATH."
  ) { |bin_dir| o[:mcl] = "#{bin_dir}/" }
  parser.on(
    "--abc FILE",
    "Use this abc file instead of a temporal file."
  ) { |path| o[:abc] = path }
  parser.on(
    "-t", "--threads INT",
    "Number of threads to use. By default: #{o[:thr]}."
  ) { |value| o[:thr] = value.to_i }
  parser.on("-q", "--quiet", "Run quietly (no STDERR output).") do
    o[:q] = true
  end
  parser.on("-h", "--help", "Display this screen.") do
    puts parser
    exit
  end
  parser.separator ""
end
cli.parse!

abort "-o is mandatory" if o[:out].nil?

# The weighting modes are mutually exclusive: --blind overrides both
# alternatives, and --identity overrides --evalue.
o[:evalue] = false if o[:identity] || o[:blind]
o[:identity] = false if o[:blind]
|
71
|
+
|
72
|
+
##### MAIN:
|
73
|
+
# Main routine. Uses the option hash `o` filled in by the option parser and:
#   1. Builds an abc file with one weighted edge per RBM line (or reuses a
#      non-empty file given through --abc).
#   2. Runs `mcxload` and `mcl` on it inside a temporary directory.
#   3. Loads the resulting gene table and clusters as Gene / OG objects.
#   4. Writes the OG collection to o[:out].
# Any StandardError is reported on STDERR together with its backtrace.
begin
  Dir.mktmpdir do |dir|
    o[:abc] = "#{dir}/rbms.abc" if o[:abc].nil?
    # -d is only required when there is no usable abc file. File.directory?
    # is already false for missing paths, so no separate existence test is
    # needed (File.exists? is gone as of Ruby 3.2).
    have_abc = File.size?(o[:abc])
    have_dir = !o[:dir].nil? && File.directory?(o[:dir])
    abort "-d must exist and be a directory" unless have_abc || have_dir

    # Traverse the whole directory
    if File.size? o[:abc]
      $stderr.puts "Reusing existing abc file '#{o[:abc]}'." unless o[:q]
    else
      file_i = 0
      ln_i = 0
      $stderr.puts "Reading RBM files within '#{o[:dir]}'." unless o[:q]
      # Edges are written to '<abc>.tmp', which is renamed to the final name
      # only after every RBM file has been processed. Block-form opens make
      # sure the handles are closed even if an exception interrupts the loop.
      File.open(o[:abc] + ".tmp", "w") do |abc|
        Dir.entries(o[:dir]).each do |rbm_file|
          rbm_path = o[:dir] + "/" + rbm_file
          next unless File.file?(rbm_path)

          # Parse the filename to identify the genomes
          m = /#{o[:f]}/.match(rbm_file)
          if m.nil? || m[2].nil?
            warn "Ignoring #{rbm_file}: doesn't match /#{o[:f]}/."
            next
          end
          file_i += 1

          # Read the RBMs list
          File.open(rbm_path, "r") do |f|
            f.each_line do |ln|
              # Add the RBM to the abc file. Node labels are "genome>gene";
              # the edge weight is a constant (--blind), column 11
              # (--evalue), column 3 (--identity) or column 12 (default).
              row = ln.split(/\t/)
              weight =
                if o[:blind] then "1"
                elsif o[:evalue] then row[10]
                elsif o[:identity] then row[2]
                else row[11]
                end
              abc.puts [m[1] + ">" + row[0], m[2] + ">" + row[1],
                        weight].join("\t")
              ln_i += 1
            end
          end
          $stderr.print " Scanned files: #{file_i}. " +
            "Found RBMs: #{ln_i}. \r" unless o[:q]
        end
      end
      File.rename(o[:abc] + ".tmp", o[:abc])
      $stderr.print "\n" unless o[:q]
    end # if File.size? o[:abc] ... else

    # Build .mci file (mcxload) and compute .mccl file (mcl)
    $stderr.puts "Markov-Clustering" unless o[:q]
    # Backticks run through /bin/sh, so use the portable redirection form:
    # with a POSIX shell, `&>/dev/null` would put mcxload in the background
    # and mcl could start before the .mci file is complete.
    `'#{o[:mcl]}mcxload' #{"--stream-mirror" unless o[:bestmatch]} \
      -abc '#{o[:abc]}' -o '#{dir}/rbms.mci' --write-binary \
      -write-tab '#{dir}/genes.tab' #{"--stream-neg-log10" if o[:evalue]} \
      >/dev/null 2>&1`
    `'#{o[:mcl]}mcl' '#{dir}/rbms.mci' -V all -I #{o[:inflation].to_s} \
      -o '#{dir}/ogs.mcl' -te #{o[:thr].to_s}`

    # Load .tab as Gene objects
    $stderr.puts "Loading gene table from '#{dir}/genes.tab'." unless o[:q]
    genes = []
    File.open("#{dir}/genes.tab", "r") do |tab|
      tab.each_line do |ln|
        # Each line is split on TAB or '>' into the numeric index used by
        # mcl and the two parts of the "genome>gene" label written above.
        r = ln.chomp.split(/\t|>/)
        genes[r[0].to_i] = Gene.new(r[1], r[2])
      end
    end
    $stderr.puts " Got " + genes.size.to_s + " genes in " +
      Gene.genomes.size.to_s + " genomes." unless o[:q]

    # Load .mcl file as OGCollection
    $stderr.puts "Loading clusters from '#{dir}/ogs.mcl'." unless o[:q]
    collection = OGCollection.new
    File.open("#{dir}/ogs.mcl", "r") do |mcl|
      in_matrix = false
      my_genes = nil # Gene indexes of the cluster being read, nil if none
      mcl.each_line do |ln|
        ln.chomp!
        if ln =~ /^\(mclmatrix$/
          in_matrix = true
          next
        end
        next if ln =~ /^begin$/
        next unless in_matrix
        break if ln =~ /^\)$/

        # A leading integer opens a new cluster; lines without it continue
        # the cluster opened by a previous line.
        if ln =~ /^\d+\s+/
          ln.sub!(/^\d+\s+/, "")
          my_genes = []
        end
        ln.sub!(/^\s+/, "")
        raise "Incomplete mcl matrix, offending line: #{mcl.lineno}: #{ln}" if
          my_genes.nil?
        my_genes += ln.split(/\s/)
        # A trailing "$" closes the cluster
        next unless my_genes.last == "$"

        my_genes.pop
        og = OG.new
        my_genes.each { |i| og << genes[i.to_i] }
        collection << og
        my_genes = nil
      end
    end
    $stderr.puts " Got #{collection.ogs.size} clusters." unless o[:q]

    # Save the output matrix
    $stderr.puts "Saving matrix into '#{o[:out]}'." unless o[:q]
    File.open(o[:out], "w") { |f| f.puts collection.to_s }
    $stderr.puts "Done.\n" unless o[:q]
  end
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end
|
185
|
+
|
186
|
+
|
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author: Luis M. Rodriguez-R
|
5
|
+
# @update: Apr-29-2015
|
6
|
+
# @license: artistic license 2.0
|
7
|
+
#
|
8
|
+
|
9
|
+
$:.push File.expand_path(File.dirname(__FILE__) + '/lib')
|
10
|
+
require 'enveomics_rb/og'
|
11
|
+
require 'optparse'
|
12
|
+
|
13
|
+
# Default settings; each key can be overridden from the command line below.
# Uses the `true`/`false` literals: the upper-case TRUE/FALSE constants were
# deprecated in Ruby 2.4 and are not defined in current Ruby, where they
# raise NameError.
o = { q: false, f: "(\\S+)-(\\S+)\\.rbm", consolidate: true, pre: [] }

# Without any argument, behave as if the help screen had been requested.
ARGV << '-h' if ARGV.empty?

OptionParser.new do |opts|
  opts.banner = "
***IMPORTANT NOTE***
This script suffers from chaining effect and is very sensitive to spurious connections,
because it applies a greedy clustering algorithm. For most practical purposes, the use
of this script is discouraged and `ogs.mcl.rb` should be preferred. [ Apr-29-2015 ]

Identifies Orthology Groups (OGs) in Reciprocal Best Matches (RBM)
between all pairs in a collection of genomes.

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-o", "--out FILE",
          "Output file containing the detected OGs.") { |v| o[:out] = v }
  opts.separator ""
  opts.separator "Other Options"
  opts.on("-d", "--dir DIR",
          "Directory containing the RBM files.") { |v| o[:dir] = v }
  # The Array coercion makes OptionParser split the value on commas.
  opts.on("-p", "--pre-ogs FILE1,FILE2,...", Array,
          "Pre-computed OGs file(s), separated by commas.") { |v| o[:pre] = v }
  opts.on("-n", "--unchecked",
          "Do not check internal redundancy in OGs.") { o[:consolidate] = false }
  opts.on("-f", "--format STRING",
          "Format of the filenames for the RBM files (within -d), using regex syntax. By default: '#{o[:f]}'.") { |v| o[:f] = v }
  opts.on("-q", "--quiet", "Run quietly (no STDERR output).") { o[:q] = true }
  opts.on("-h", "--help", "Display this screen.") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-o is mandatory" if o[:out].nil?
|
43
|
+
|
44
|
+
##### MAIN:
|
45
|
+
# Main routine. Uses the option hash `o` filled in by the option parser and:
#   1. Loads any pre-computed OG tables passed with -p.
#   2. Adds every RBM pair found in the files of the -d directory.
#   3. Consolidates the collection unless -n was passed.
#   4. Writes the OG collection to o[:out].
# Any StandardError is reported on STDERR together with its backtrace.
begin
  # Initialize the collection of OGs.
  collection = OGCollection.new

  # Read the pre-computed OGs (if -p is passed).
  o[:pre].each do |pre|
    $stderr.puts "Reading pre-computed OGs in '#{pre}'." unless o[:q]
    # Block-form open closes the handle even if a row fails to load.
    File.open(pre, 'r') do |f|
      # The first line is the header passed to every OG built from this file.
      header = f.gets
      raise "Empty pre-computed OGs file: #{pre}" if header.nil?

      h = header.chomp.split(/\t/)
      f.each_line { |ln| collection << OG.new(h, ln.chomp.split(/\t/)) }
    end
    $stderr.puts " Loaded OGs: #{collection.ogs.length}." unless o[:q]
  end

  # Read the RBM files in the directory (if -d is passed).
  unless o[:dir].nil?
    # File.directory? is already false for missing paths, so no separate
    # existence test is needed (File.exists? is gone as of Ruby 3.2).
    abort "-d must exist and be a directory" unless File.directory?(o[:dir])

    # Traverse the whole directory.
    file_i = 0
    $stderr.puts "Reading RBM files within '#{o[:dir]}'." unless o[:q]
    Dir.entries(o[:dir]).each do |rbm_file|
      rbm_path = o[:dir] + "/" + rbm_file
      next unless File.file?(rbm_path)

      # Parse the filename to identify the genomes.
      m = /#{o[:f]}/.match(rbm_file)
      if m.nil? || m[2].nil?
        warn "Cannot parse filename: #{rbm_file} (doesn't match /#{o[:f]}/)."
        next
      end
      file_i += 1

      # Read the RBMs list
      File.open(rbm_path, "r") do |f|
        f.each_line do |ln|
          # Add the RBM to the collection of OGs. Only the first two columns
          # are used.
          row = ln.split(/\t/)
          collection.add_rbm(Gene.new(m[1], row[0]), Gene.new(m[2], row[1]))
        end
      end
      $stderr.print " Scanned files: #{file_i}. Found OGs: #{collection.ogs.length}. \r" unless o[:q]
    end
    $stderr.print "\n" unless o[:q]
  end

  # Evaluate internal consistency merging linked OGs (unless -n is passed).
  if o[:consolidate]
    $stderr.puts "Evaluating internal consistency." unless o[:q]
    collection.consolidate!
    $stderr.puts " Final OGs: #{collection.ogs.length}." unless o[:q]
  end

  # Save the output matrix
  $stderr.puts "Saving matrix into '#{o[:out]}'." unless o[:q]
  File.open(o[:out], "w") { |f| f.puts collection.to_s }
  $stderr.puts "Done.\n" unless o[:q]
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end
|
103
|
+
|
104
|
+
|