miga-base 1.2.15.2 → 1.2.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/download/gtdb.rb +4 -1
- data/lib/miga/cli/action/gtdb_get.rb +4 -0
- data/lib/miga/daemon.rb +4 -1
- data/lib/miga/lair.rb +6 -4
- data/lib/miga/remote_dataset/download.rb +3 -2
- data/lib/miga/remote_dataset.rb +25 -7
- data/lib/miga/taxonomy.rb +6 -0
- data/lib/miga/version.rb +2 -2
- metadata +6 -302
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +0 -41964
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +0 -32439
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -62056
- data/utils/FastAAI/FastAAI +0 -3659
- data/utils/FastAAI/FastAAI-legacy/FastAAI +0 -1336
- data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +0 -1296
- data/utils/FastAAI/README.md +0 -84
- data/utils/enveomics/Docs/recplot2.md +0 -244
- data/utils/enveomics/Examples/aai-matrix.bash +0 -66
- data/utils/enveomics/Examples/ani-matrix.bash +0 -66
- data/utils/enveomics/Examples/essential-phylogeny.bash +0 -105
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +0 -100
- data/utils/enveomics/LICENSE.txt +0 -73
- data/utils/enveomics/Makefile +0 -52
- data/utils/enveomics/Manifest/Tasks/aasubs.json +0 -103
- data/utils/enveomics/Manifest/Tasks/blasttab.json +0 -790
- data/utils/enveomics/Manifest/Tasks/distances.json +0 -161
- data/utils/enveomics/Manifest/Tasks/fasta.json +0 -802
- data/utils/enveomics/Manifest/Tasks/fastq.json +0 -291
- data/utils/enveomics/Manifest/Tasks/graphics.json +0 -126
- data/utils/enveomics/Manifest/Tasks/mapping.json +0 -137
- data/utils/enveomics/Manifest/Tasks/ogs.json +0 -382
- data/utils/enveomics/Manifest/Tasks/other.json +0 -906
- data/utils/enveomics/Manifest/Tasks/remote.json +0 -355
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +0 -650
- data/utils/enveomics/Manifest/Tasks/tables.json +0 -308
- data/utils/enveomics/Manifest/Tasks/trees.json +0 -68
- data/utils/enveomics/Manifest/Tasks/variants.json +0 -111
- data/utils/enveomics/Manifest/categories.json +0 -165
- data/utils/enveomics/Manifest/examples.json +0 -162
- data/utils/enveomics/Manifest/tasks.json +0 -4
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
- data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
- data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
- data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
- data/utils/enveomics/README.md +0 -42
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +0 -171
- data/utils/enveomics/Scripts/Aln.cat.rb +0 -221
- data/utils/enveomics/Scripts/Aln.convert.pl +0 -35
- data/utils/enveomics/Scripts/AlphaDiversity.pl +0 -152
- data/utils/enveomics/Scripts/BedGraph.tad.rb +0 -93
- data/utils/enveomics/Scripts/BedGraph.window.rb +0 -71
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +0 -102
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +0 -63
- data/utils/enveomics/Scripts/BlastTab.advance.bash +0 -48
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +0 -55
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +0 -104
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +0 -76
- data/utils/enveomics/Scripts/BlastTab.filter.pl +0 -47
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +0 -194
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +0 -104
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +0 -157
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +0 -48
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +0 -86
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +0 -119
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +0 -86
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +0 -47
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +0 -114
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +0 -90
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +0 -123
- data/utils/enveomics/Scripts/Chao1.pl +0 -97
- data/utils/enveomics/Scripts/CharTable.classify.rb +0 -234
- data/utils/enveomics/Scripts/EBIseq2tax.rb +0 -83
- data/utils/enveomics/Scripts/FastA.N50.pl +0 -60
- data/utils/enveomics/Scripts/FastA.extract.rb +0 -152
- data/utils/enveomics/Scripts/FastA.filter.pl +0 -52
- data/utils/enveomics/Scripts/FastA.filterLen.pl +0 -28
- data/utils/enveomics/Scripts/FastA.filterN.pl +0 -60
- data/utils/enveomics/Scripts/FastA.fragment.rb +0 -100
- data/utils/enveomics/Scripts/FastA.gc.pl +0 -42
- data/utils/enveomics/Scripts/FastA.interpose.pl +0 -93
- data/utils/enveomics/Scripts/FastA.length.pl +0 -38
- data/utils/enveomics/Scripts/FastA.mask.rb +0 -89
- data/utils/enveomics/Scripts/FastA.per_file.pl +0 -36
- data/utils/enveomics/Scripts/FastA.qlen.pl +0 -57
- data/utils/enveomics/Scripts/FastA.rename.pl +0 -65
- data/utils/enveomics/Scripts/FastA.revcom.pl +0 -23
- data/utils/enveomics/Scripts/FastA.sample.rb +0 -98
- data/utils/enveomics/Scripts/FastA.slider.pl +0 -85
- data/utils/enveomics/Scripts/FastA.split.pl +0 -55
- data/utils/enveomics/Scripts/FastA.split.rb +0 -79
- data/utils/enveomics/Scripts/FastA.subsample.pl +0 -131
- data/utils/enveomics/Scripts/FastA.tag.rb +0 -65
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +0 -69
- data/utils/enveomics/Scripts/FastA.wrap.rb +0 -48
- data/utils/enveomics/Scripts/FastQ.filter.pl +0 -54
- data/utils/enveomics/Scripts/FastQ.interpose.pl +0 -90
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +0 -89
- data/utils/enveomics/Scripts/FastQ.offset.pl +0 -90
- data/utils/enveomics/Scripts/FastQ.split.pl +0 -53
- data/utils/enveomics/Scripts/FastQ.tag.rb +0 -70
- data/utils/enveomics/Scripts/FastQ.test-error.rb +0 -81
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +0 -24
- data/utils/enveomics/Scripts/GFF.catsbj.pl +0 -127
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +0 -84
- data/utils/enveomics/Scripts/HMM.essential.rb +0 -351
- data/utils/enveomics/Scripts/HMM.haai.rb +0 -168
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +0 -83
- data/utils/enveomics/Scripts/JPlace.distances.rb +0 -88
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +0 -320
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +0 -81
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +0 -198
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +0 -35
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +0 -49
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +0 -92
- data/utils/enveomics/Scripts/Newick.autoprune.R +0 -27
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +0 -228
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +0 -32
- data/utils/enveomics/Scripts/RefSeq.download.bash +0 -48
- data/utils/enveomics/Scripts/SRA.download.bash +0 -55
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +0 -36
- data/utils/enveomics/Scripts/TRIBS.test.R +0 -39
- data/utils/enveomics/Scripts/Table.barplot.R +0 -31
- data/utils/enveomics/Scripts/Table.df2dist.R +0 -30
- data/utils/enveomics/Scripts/Table.filter.pl +0 -61
- data/utils/enveomics/Scripts/Table.merge.pl +0 -77
- data/utils/enveomics/Scripts/Table.prefScore.R +0 -60
- data/utils/enveomics/Scripts/Table.replace.rb +0 -69
- data/utils/enveomics/Scripts/Table.round.rb +0 -63
- data/utils/enveomics/Scripts/Table.split.pl +0 -57
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +0 -227
- data/utils/enveomics/Scripts/VCF.KaKs.rb +0 -147
- data/utils/enveomics/Scripts/VCF.SNPs.rb +0 -88
- data/utils/enveomics/Scripts/aai.rb +0 -421
- data/utils/enveomics/Scripts/ani.rb +0 -362
- data/utils/enveomics/Scripts/anir.rb +0 -137
- data/utils/enveomics/Scripts/clust.rand.rb +0 -102
- data/utils/enveomics/Scripts/gi2tax.rb +0 -103
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +0 -96
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +0 -293
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +0 -175
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +0 -24
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +0 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +0 -30
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +0 -253
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +0 -88
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +0 -182
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +0 -49
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +0 -74
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +0 -237
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +0 -31
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +0 -152
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +0 -3
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +0 -74
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +0 -135
- data/utils/enveomics/Scripts/ogs.annotate.rb +0 -88
- data/utils/enveomics/Scripts/ogs.core-pan.rb +0 -160
- data/utils/enveomics/Scripts/ogs.extract.rb +0 -125
- data/utils/enveomics/Scripts/ogs.mcl.rb +0 -186
- data/utils/enveomics/Scripts/ogs.rb +0 -104
- data/utils/enveomics/Scripts/ogs.stats.rb +0 -131
- data/utils/enveomics/Scripts/rbm-legacy.rb +0 -172
- data/utils/enveomics/Scripts/rbm.rb +0 -108
- data/utils/enveomics/Scripts/sam.filter.rb +0 -148
- data/utils/enveomics/Tests/Makefile +0 -10
- data/utils/enveomics/Tests/Mgen_M2288.faa +0 -3189
- data/utils/enveomics/Tests/Mgen_M2288.fna +0 -8282
- data/utils/enveomics/Tests/Mgen_M2321.fna +0 -8288
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +0 -2970
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +0 -7
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +0 -17
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +0 -137
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +0 -123
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +0 -200
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +0 -55
- data/utils/enveomics/Tests/alkB.nwk +0 -1
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +0 -13
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +0 -17
- data/utils/enveomics/Tests/hiv1.faa +0 -59
- data/utils/enveomics/Tests/hiv1.fna +0 -134
- data/utils/enveomics/Tests/hiv2.faa +0 -70
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +0 -233
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +0 -1
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +0 -233
- data/utils/enveomics/Tests/phyla_counts.tsv +0 -10
- data/utils/enveomics/Tests/primate_lentivirus.ogs +0 -11
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +0 -9
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +0 -8
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +0 -6
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +0 -9
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +0 -6
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +0 -6
- data/utils/enveomics/build_enveomics_r.bash +0 -45
- data/utils/enveomics/enveomics.R/DESCRIPTION +0 -31
- data/utils/enveomics/enveomics.R/NAMESPACE +0 -39
- data/utils/enveomics/enveomics.R/R/autoprune.R +0 -155
- data/utils/enveomics/enveomics.R/R/barplot.R +0 -184
- data/utils/enveomics/enveomics.R/R/cliopts.R +0 -135
- data/utils/enveomics/enveomics.R/R/df2dist.R +0 -154
- data/utils/enveomics/enveomics.R/R/growthcurve.R +0 -331
- data/utils/enveomics/enveomics.R/R/prefscore.R +0 -79
- data/utils/enveomics/enveomics.R/R/recplot.R +0 -354
- data/utils/enveomics/enveomics.R/R/recplot2.R +0 -1631
- data/utils/enveomics/enveomics.R/R/tribs.R +0 -583
- data/utils/enveomics/enveomics.R/R/utils.R +0 -80
- data/utils/enveomics/enveomics.R/README.md +0 -81
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -16
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -16
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -16
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +0 -25
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +0 -46
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +0 -47
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +0 -103
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +0 -67
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +0 -24
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +0 -44
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +0 -47
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +0 -75
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +0 -50
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +0 -44
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +0 -139
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +0 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +0 -77
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +0 -25
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +0 -21
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +0 -47
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +0 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +0 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -36
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +0 -27
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +0 -52
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +0 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +0 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +0 -43
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +0 -82
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +0 -59
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +0 -27
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +0 -36
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +0 -68
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +0 -28
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +0 -27
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +0 -14
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +0 -13
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +0 -78
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +0 -46
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +0 -125
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +0 -19
- data/utils/enveomics/globals.mk +0 -8
- data/utils/enveomics/manifest.json +0 -9
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +0 -67
- data/utils/multitrim/multitrim.py +0 -1555
- data/utils/multitrim/multitrim.yml +0 -13
@@ -1,131 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
#
|
4
|
-
# @author: Luis M. Rodriguez-R
|
5
|
-
# @license: Artistic-2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
$:.push File.expand_path(File.dirname(__FILE__) + '/lib')
|
9
|
-
require 'enveomics_rb/og'
|
10
|
-
require 'optparse'
|
11
|
-
require 'json'
|
12
|
-
|
13
|
-
o = {q:false, a:false}
|
14
|
-
ARGV << '-h' if ARGV.size==0
|
15
|
-
OptionParser.new do |opts|
|
16
|
-
opts.banner = "
|
17
|
-
Estimates some descriptive statistics on a set of Orthology Groups (OGs).
|
18
|
-
|
19
|
-
Usage: #{$0} [options]"
|
20
|
-
opts.separator ""
|
21
|
-
opts.separator "Mandatory"
|
22
|
-
opts.on("-o", "--ogs FILE",
|
23
|
-
"Input file containing the precomputed OGs."){ |v| o[:ogs]=v }
|
24
|
-
opts.separator ""
|
25
|
-
opts.separator "Other Options"
|
26
|
-
opts.on("-j", "--json FILE", "Output file in JSON format."){ |v| o[:json]=v }
|
27
|
-
opts.on("-t", "--tab FILE","Output file in tabular format."){ |v| o[:tab]=v }
|
28
|
-
opts.on("-T", "--transposed-tab FILE",
|
29
|
-
"Output file in transposed tabular format."){ |v| o[:ttab]=v }
|
30
|
-
opts.on("-a", "--auto", "Run completely quietly (no STDERR or STDOUT)") do
|
31
|
-
o[:q] = true
|
32
|
-
o[:a] = true
|
33
|
-
end
|
34
|
-
opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
|
35
|
-
opts.on("-h", "--help", "Display this screen.") do
|
36
|
-
puts opts
|
37
|
-
exit
|
38
|
-
end
|
39
|
-
opts.separator ""
|
40
|
-
end.parse!
|
41
|
-
abort "-o is mandatory" if o[:ogs].nil?
|
42
|
-
|
43
|
-
##### MAIN:
|
44
|
-
begin
|
45
|
-
# Initialize the collection of OGs.
|
46
|
-
collection = OGCollection.new
|
47
|
-
|
48
|
-
# Read the pre-computed OGs
|
49
|
-
$stderr.puts "Reading pre-computed OGs in '#{o[:ogs]}'." unless o[:q]
|
50
|
-
f = File.open(o[:ogs], "r")
|
51
|
-
h = f.gets.chomp.split /\t/
|
52
|
-
while ln = f.gets
|
53
|
-
collection << OG.new(h, ln.chomp.split(/\t/))
|
54
|
-
end
|
55
|
-
f.close
|
56
|
-
$stderr.puts " Loaded OGs: #{collection.ogs.length}." unless o[:q]
|
57
|
-
|
58
|
-
# Estimate descriptive stats
|
59
|
-
stat_name = {
|
60
|
-
genomes: "Number of genomes",
|
61
|
-
pan: "Pangenome (OGs)",
|
62
|
-
core: "Core genome (OGs)",
|
63
|
-
core90pc: "OGs in 90% of the genomes",
|
64
|
-
core80pc: "OGs in 80% of the genomes",
|
65
|
-
unus: "Unus genome, core genome discarding paralogs (OGs)",
|
66
|
-
avg: "Average number of OGs in a genome",
|
67
|
-
avg_pan: "Average genome (OGs) / Pangenome (OGs)",
|
68
|
-
core_avg: "Core genome (OGs) / Average genome (OGs)",
|
69
|
-
core_pan: "Core genome (OGs) / Pangenome (OGs)",
|
70
|
-
ogs_shannon: "Entropy of the OG frequencies (bits)"
|
71
|
-
}
|
72
|
-
stats = {}
|
73
|
-
stats[:genomes] = Gene.genomes.length
|
74
|
-
stats[:pan] = collection.ogs.length
|
75
|
-
stats[:core] = collection.ogs.map do |og|
|
76
|
-
(og.genomes.length == Gene.genomes.length) ? 1 : 0
|
77
|
-
end.inject(0,:+)
|
78
|
-
stats[:core90pc] = collection.ogs.map do |og|
|
79
|
-
(og.genomes.length >= 0.9*Gene.genomes.length) ? 1 : 0
|
80
|
-
end.inject(0,:+)
|
81
|
-
stats[:core80pc] = collection.ogs.map do |og|
|
82
|
-
(og.genomes.length >= 0.8*Gene.genomes.length) ? 1 : 0
|
83
|
-
end.inject(0,:+)
|
84
|
-
stats[:unus] = collection.ogs.map do |og|
|
85
|
-
(og.genomes.length != Gene.genomes.length) ? 0 :
|
86
|
-
(og.genes.all?{ |i| i.size==1 }) ? 1 : 0
|
87
|
-
end.inject(0,:+)
|
88
|
-
og_genomes = collection.ogs.map{ |og| og.genomes.length }.inject(0,:+)
|
89
|
-
stats[:avg] = og_genomes.to_f/Gene.genomes.length
|
90
|
-
stats[:avg_pan] = stats[:avg]/stats[:pan]
|
91
|
-
stats[:core_avg] = stats[:core].to_f/stats[:avg]
|
92
|
-
stats[:core_pan] = stats[:core].to_f/stats[:pan]
|
93
|
-
stats[:ogs_shannon] = -1 * collection.ogs.map do |og|
|
94
|
-
pi = og.genomes.length.to_f/Gene.genomes.length
|
95
|
-
pi * Math.log(pi)
|
96
|
-
end.inject(0.0,:+)
|
97
|
-
|
98
|
-
# Show result
|
99
|
-
$stderr.puts "Generating reports." unless o[:q]
|
100
|
-
stats.each_pair{ |k,v| puts " #{stat_name[k]}: #{v}" } unless o[:a]
|
101
|
-
|
102
|
-
# Save results in JSON
|
103
|
-
unless o[:json].nil?
|
104
|
-
ohf = File.open(o[:json], "w")
|
105
|
-
ohf.puts JSON.pretty_generate(stats)
|
106
|
-
ohf.close
|
107
|
-
end
|
108
|
-
|
109
|
-
# Save results in tab
|
110
|
-
unless o[:tab].nil?
|
111
|
-
ohf = File.open(o[:tab], "w")
|
112
|
-
stats.each_pair{ |k,v| ohf.puts "#{k}\t#{v}" }
|
113
|
-
ohf.close
|
114
|
-
end
|
115
|
-
|
116
|
-
# Save results in T(tab)
|
117
|
-
unless o[:ttab].nil?
|
118
|
-
ohf = File.open(o[:ttab], "w")
|
119
|
-
ohf.puts stats.keys.join("\t")
|
120
|
-
ohf.puts stats.values.join("\t")
|
121
|
-
ohf.close
|
122
|
-
end
|
123
|
-
|
124
|
-
$stderr.puts "Done.\n" unless o[:q]
|
125
|
-
rescue => err
|
126
|
-
$stderr.puts "Exception: #{err}\n\n"
|
127
|
-
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
128
|
-
err
|
129
|
-
end
|
130
|
-
|
131
|
-
|
@@ -1,172 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# frozen_string_literal: true
|
4
|
-
|
5
|
-
$VERSION = 0.1
|
6
|
-
$:.push File.expand_path('../lib', __FILE__)
|
7
|
-
require 'enveomics_rb/enveomics'
|
8
|
-
require 'tmpdir'
|
9
|
-
|
10
|
-
o = {
|
11
|
-
q: false, thr: 1,
|
12
|
-
len: 0, id: 0.0, fract: 0.0, score: 0.0,
|
13
|
-
bin: '', program: :'blast+', nucl: false
|
14
|
-
}
|
15
|
-
|
16
|
-
OptionParser.new do |opts|
|
17
|
-
cmd = File.basename($0)
|
18
|
-
opts.banner = <<~BANNER
|
19
|
-
|
20
|
-
[Enveomics Collection: #{cmd} v#{$VERSION}]
|
21
|
-
|
22
|
-
[DEPRECATED: Please use rbm.rb instead]
|
23
|
-
|
24
|
-
Finds the reciprocal best matches between two sets of sequences
|
25
|
-
|
26
|
-
Usage: #{cmd} [options]
|
27
|
-
|
28
|
-
BANNER
|
29
|
-
|
30
|
-
opts.separator 'Mandatory'
|
31
|
-
opts.on(
|
32
|
-
'-1', '--seq1 FILE',
|
33
|
-
'Path to the FastA file containing the set 1'
|
34
|
-
) { |v| o[:seq1] = v }
|
35
|
-
opts.on(
|
36
|
-
'-2', '--seq2 FILE',
|
37
|
-
'Path to the FastA file containing the set 2'
|
38
|
-
) { |v| o[:seq2] = v }
|
39
|
-
opts.separator ''
|
40
|
-
opts.separator 'Search Options'
|
41
|
-
opts.on(
|
42
|
-
'-n', '--nucl',
|
43
|
-
'Sequences are assumed to be nucleotides (proteins by default)',
|
44
|
-
'Incompatible with -p diamond'
|
45
|
-
) { |v| o[:nucl] = true }
|
46
|
-
opts.on(
|
47
|
-
'-l', '--len INT', Integer,
|
48
|
-
'Minimum alignment length (in residues)',
|
49
|
-
"By default: #{o[:len]}"
|
50
|
-
) { |v| o[:len] = v }
|
51
|
-
opts.on(
|
52
|
-
'-f', '--fract FLOAT', Float,
|
53
|
-
'Minimum alignment length (as a fraction of the query)',
|
54
|
-
'If set, requires BLAST+ or Diamond (see -p)',
|
55
|
-
"By default: #{o[:fract]}"
|
56
|
-
) { |v| o[:fract] = v }
|
57
|
-
opts.on(
|
58
|
-
'-i', '--id NUM', Float,
|
59
|
-
'Minimum alignment identity (in %)',
|
60
|
-
"By default: #{o[:id]}"
|
61
|
-
){ |v| o[:id] = v }
|
62
|
-
opts.on(
|
63
|
-
'-s', '--score NUM', Float,
|
64
|
-
'Minimum alignment score (in bits)',
|
65
|
-
"By default: #{o[:score]}"
|
66
|
-
) { |v| o[:score] = v }
|
67
|
-
opts.separator ''
|
68
|
-
opts.separator 'Software Options'
|
69
|
-
opts.on(
|
70
|
-
'-b', '--bin DIR',
|
71
|
-
'Path to the directory containing the binaries of the search program'
|
72
|
-
) { |v| o[:bin] = v }
|
73
|
-
opts.on(
|
74
|
-
'-p', '--program STR',
|
75
|
-
'Search program to be used. One of: blast+ (default), blast, diamond'
|
76
|
-
) { |v| o[:program] = v.downcase.to_sym }
|
77
|
-
opts.on(
|
78
|
-
'-t', '--threads INT', Integer,
|
79
|
-
'Number of parallel threads to be used',
|
80
|
-
"By default: #{o[:thr]}"
|
81
|
-
) { |v| o[:thr] = v }
|
82
|
-
opts.separator ''
|
83
|
-
opts.separator 'Other Options'
|
84
|
-
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
85
|
-
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
86
|
-
opts.separator ''
|
87
|
-
end.parse!
|
88
|
-
|
89
|
-
abort '-1 is mandatory' if o[:seq1].nil?
|
90
|
-
abort '-2 is mandatory' if o[:seq2].nil?
|
91
|
-
if o[:program] == :diamond && o[:nucl]
|
92
|
-
abort '-p diamond is incompatible with -n'
|
93
|
-
end
|
94
|
-
if o[:fract] > 0.0 && o[:program] == :blast
|
95
|
-
abort 'Argument -f/--fract requires -p blast+ or -p diamond'
|
96
|
-
end
|
97
|
-
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
98
|
-
$quiet = o[:q]
|
99
|
-
|
100
|
-
Dir.mktmpdir do |dir|
|
101
|
-
say('Temporal directory: ', dir)
|
102
|
-
|
103
|
-
# Create databases
|
104
|
-
say 'Creating databases'
|
105
|
-
[:seq1, :seq2].each do |seq|
|
106
|
-
case o[:program]
|
107
|
-
when :blast
|
108
|
-
`"#{o[:bin]}formatdb" -i "#{o[seq]}" -n "#{dir}/#{seq}" \
|
109
|
-
-p #{o[:nucl] ? 'F' : 'T'}`
|
110
|
-
when :'blast+'
|
111
|
-
`"#{o[:bin]}makeblastdb" -in "#{o[seq]}" -out "#{dir}/#{seq}" \
|
112
|
-
-dbtype #{o[:nucl] ? 'nucl' : 'prot'}`
|
113
|
-
when :diamond
|
114
|
-
`"#{o[:bin]}diamond" makedb --in "#{o[seq]}" \
|
115
|
-
--db "#{dir}/#{seq}.dmnd" --threads "#{o[:thr]}"`
|
116
|
-
else
|
117
|
-
abort "Unsupported program: #{o[:program]}"
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
# Best-hits
|
122
|
-
rbh = {}
|
123
|
-
n2 = 0
|
124
|
-
say ' Running comparisons'
|
125
|
-
[2, 1].each do |i|
|
126
|
-
qry_seen = {}
|
127
|
-
q = o[:"seq#{i}"]
|
128
|
-
s = "#{dir}/seq#{i == 1 ? 2 : 1}"
|
129
|
-
say(' Query: ', q)
|
130
|
-
case o[:program]
|
131
|
-
when :blast
|
132
|
-
`"#{o[:bin]}blastall" -p #{o[:nucl] ? 'blastn' : 'blastp'} -d "#{s}" \
|
133
|
-
-i "#{q}" -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{dir}/#{i}.tab"`
|
134
|
-
when :'blast+'
|
135
|
-
`"#{o[:bin]}#{o[:nucl] ? 'blastn' : 'blastp'}" -db "#{s}" -query "#{q}" \
|
136
|
-
-max_target_seqs 1 -num_threads #{o[:thr]} -out "#{dir}/#{i}.tab" \
|
137
|
-
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend \
|
138
|
-
sstart send evalue bitscore qlen slen"`
|
139
|
-
when :diamond
|
140
|
-
`"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
|
141
|
-
--query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" --quiet \
|
142
|
-
&& "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt \
|
143
|
-
6 qseqid sseqid pident length mismatch gapopen qstart qend sstart \
|
144
|
-
send evalue bitscore qlen slen --out "#{dir}/#{i}.tab" --quiet`
|
145
|
-
else
|
146
|
-
abort "Unsupported program: #{o[:program]}"
|
147
|
-
end
|
148
|
-
|
149
|
-
n = 0
|
150
|
-
File.open("#{dir}/#{i}.tab", 'r') do |fh|
|
151
|
-
fh.each do |ln|
|
152
|
-
ln.chomp!
|
153
|
-
row = ln.split(/\t/)
|
154
|
-
row[12] = '1' unless [:'blast+', :diamond].include? o[:program]
|
155
|
-
next unless qry_seen[row[0]].nil? &&
|
156
|
-
row[3].to_i >= o[:len] && row[2].to_f >= o[:id] &&
|
157
|
-
row[11].to_f >= o[:score] && row[3].to_f / row[12].to_i >= o[:fract]
|
158
|
-
|
159
|
-
qry_seen[row[0]] = 1
|
160
|
-
n += 1
|
161
|
-
if i == 2
|
162
|
-
rbh[row[0]] = row[1]
|
163
|
-
elsif !rbh[row[1]].nil? && rbh[row[1]] == row[0]
|
164
|
-
puts ln
|
165
|
-
n2 += 1
|
166
|
-
end
|
167
|
-
end
|
168
|
-
end
|
169
|
-
say " #{n} sequences with hit"
|
170
|
-
end
|
171
|
-
say " #{n2} RBMs"
|
172
|
-
end
|
@@ -1,108 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# frozen_string_literal: true
|
4
|
-
|
5
|
-
$VERSION = 1.01
|
6
|
-
$:.push File.expand_path('../lib', __FILE__)
|
7
|
-
require 'enveomics_rb/rbm'
|
8
|
-
require 'tmpdir'
|
9
|
-
|
10
|
-
bms_dummy = Enveomics::RBM.new('1', '2').bms1
|
11
|
-
o = { q: false, out: '-' }
|
12
|
-
%i[thr len id fract score bin program nucl].each do |k|
|
13
|
-
o[k] = bms_dummy.opt(k)
|
14
|
-
end
|
15
|
-
|
16
|
-
OptionParser.new do |opts|
|
17
|
-
opts.version = $VERSION
|
18
|
-
cmd = File.basename($0)
|
19
|
-
opts.banner = <<~BANNER
|
20
|
-
|
21
|
-
[Enveomics Collection: #{cmd} v#{$VERSION}]
|
22
|
-
|
23
|
-
Finds the reciprocal best matches between two sets of sequences
|
24
|
-
|
25
|
-
Usage: #{cmd} [options]
|
26
|
-
|
27
|
-
BANNER
|
28
|
-
|
29
|
-
opts.separator 'Mandatory'
|
30
|
-
opts.on(
|
31
|
-
'-1', '--seq1 FILE',
|
32
|
-
'Path to the FastA file containing the set 1'
|
33
|
-
) { |v| o[:seq1] = v }
|
34
|
-
opts.on(
|
35
|
-
'-2', '--seq2 FILE',
|
36
|
-
'Path to the FastA file containing the set 2'
|
37
|
-
) { |v| o[:seq2] = v }
|
38
|
-
opts.on(
|
39
|
-
'-o', '--out FILE',
|
40
|
-
'Reciprocal Best Matches in BLAST tabular format.',
|
41
|
-
'Supports compression with .gz extension, use - for STDOUT (default)'
|
42
|
-
) { |v| o[:out] = v }
|
43
|
-
opts.separator ''
|
44
|
-
opts.separator 'Search Options'
|
45
|
-
opts.on(
|
46
|
-
'-n', '--nucl',
|
47
|
-
'Sequences are assumed to be nucleotides (proteins by default)',
|
48
|
-
'Incompatible with -p diamond'
|
49
|
-
) { |v| o[:nucl] = true }
|
50
|
-
opts.on(
|
51
|
-
'-l', '--len INT', Integer,
|
52
|
-
'Minimum alignment length (in residues)',
|
53
|
-
"By default: #{o[:len]}"
|
54
|
-
) { |v| o[:len] = v }
|
55
|
-
opts.on(
|
56
|
-
'-f', '--fract FLOAT', Float,
|
57
|
-
'Minimum alignment length (as a fraction of the query)',
|
58
|
-
'If set, requires BLAST+ or Diamond (see -p)',
|
59
|
-
"By default: #{o[:fract]}"
|
60
|
-
) { |v| o[:fract] = v }
|
61
|
-
opts.on(
|
62
|
-
'-i', '--id NUM', Float,
|
63
|
-
'Minimum alignment identity (in %)',
|
64
|
-
"By default: #{o[:id]}"
|
65
|
-
){ |v| o[:id] = v }
|
66
|
-
opts.on(
|
67
|
-
'-s', '--score NUM', Float,
|
68
|
-
'Minimum alignment score (in bits)',
|
69
|
-
"By default: #{o[:score]}"
|
70
|
-
) { |v| o[:score] = v }
|
71
|
-
opts.separator ''
|
72
|
-
opts.separator 'Software Options'
|
73
|
-
opts.on(
|
74
|
-
'-b', '--bin DIR',
|
75
|
-
'Path to the directory containing the binaries of the search program'
|
76
|
-
) { |v| o[:bin] = v }
|
77
|
-
opts.on(
|
78
|
-
'-p', '--program STR',
|
79
|
-
'Search program to be used',
|
80
|
-
'One of: blast+ (default), blast, diamond, blat'
|
81
|
-
) { |v| o[:program] = v.downcase.to_sym }
|
82
|
-
opts.on(
|
83
|
-
'-t', '--threads INT', Integer,
|
84
|
-
'Number of parallel threads to be used',
|
85
|
-
"By default: #{o[:thr]}"
|
86
|
-
) { |v| o[:thr] = v }
|
87
|
-
opts.separator ''
|
88
|
-
opts.separator 'Other Options'
|
89
|
-
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { $QUIET = true }
|
90
|
-
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
91
|
-
opts.separator ''
|
92
|
-
end.parse!
|
93
|
-
|
94
|
-
raise Enveomics::OptionError.new('-1 is mandatory') if o[:seq1].nil?
|
95
|
-
raise Enveomics::OptionError.new('-2 is mandatory') if o[:seq2].nil?
|
96
|
-
raise Enveomics::OptionError.new(
|
97
|
-
'Argument -f/--fract requires -p blast+ or -p diamond'
|
98
|
-
) if o[:fract] > 0.0 && !%i[blast+ diamond].include?(o[:program])
|
99
|
-
|
100
|
-
rbm = Enveomics::RBM.new(o[:seq1], o[:seq2], o)
|
101
|
-
ofh = writer(o[:out])
|
102
|
-
rbm.each { |bm| ofh.puts bm.to_s }
|
103
|
-
ofh.close
|
104
|
-
|
105
|
-
say('Forward Best Matches: ', rbm.bms1.count)
|
106
|
-
say('Reverse Best Matches: ', rbm.bms2.count)
|
107
|
-
say('Reciprocal Best Matches: ', rbm.count)
|
108
|
-
|
@@ -1,148 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# frozen_string_literal: true
|
4
|
-
|
5
|
-
$VERSION = 1.0
|
6
|
-
$:.push File.expand_path('../lib', __FILE__)
|
7
|
-
require 'enveomics_rb/enveomics'
|
8
|
-
use 'shellwords'
|
9
|
-
|
10
|
-
o = {
|
11
|
-
q: false, threads: 2, m_format: :sam, g_format: :fasta, identity: 95.0,
|
12
|
-
o: '-', header: true
|
13
|
-
}
|
14
|
-
|
15
|
-
OptionParser.new do |opt|
|
16
|
-
Enveomics.opt_banner(
|
17
|
-
opt, 'Filters a SAM or BAM file by target sequences and/or identity',
|
18
|
-
"#{File.basename($0)} -m map.sam -o filtered_map.sam [options]"
|
19
|
-
)
|
20
|
-
|
21
|
-
opt.separator 'Input/Output'
|
22
|
-
opt.on(
|
23
|
-
'-g', '--genome PATH',
|
24
|
-
'Genome assembly',
|
25
|
-
'Supports compression with .gz extension, use - for STDIN'
|
26
|
-
) { |v| o[:g] = v }
|
27
|
-
opt.on(
|
28
|
-
'-m', '--mapping PATH',
|
29
|
-
'Mapping file',
|
30
|
-
'Supports compression with .gz extension, use - for STDIN'
|
31
|
-
) { |v| o[:m] = v }
|
32
|
-
opt.on(
|
33
|
-
'-o', '--out-sam PATH',
|
34
|
-
'Output filtered file in SAM format',
|
35
|
-
'Supports compression with .gz extension, use - for STDOUT (default)'
|
36
|
-
) { |v| o[:o] = v }
|
37
|
-
opt.separator ''
|
38
|
-
|
39
|
-
opt.separator 'Formats'
|
40
|
-
opt.on(
|
41
|
-
'--g-format STRING',
|
42
|
-
'Genome assembly format: fasta (default) or list'
|
43
|
-
) { |v| o[:g_format] = v.downcase.to_sym }
|
44
|
-
opt.on(
|
45
|
-
'--m-format STRING',
|
46
|
-
'Mapping file format: sam (default) or bam',
|
47
|
-
'sam supports compression with .gz file extension'
|
48
|
-
) { |v| o[:m_format] = v.downcase.to_sym }
|
49
|
-
opt.separator ''
|
50
|
-
|
51
|
-
opt.separator 'General'
|
52
|
-
opt.on(
|
53
|
-
'-i', '--identity FLOAT', Float,
|
54
|
-
"Set a fixed threshold of percent identity (default: #{o[:identity]})"
|
55
|
-
) { |v| o[:identity] = v }
|
56
|
-
opt.on('--no-header', 'Do not include the headers') { |v| o[:header] = v }
|
57
|
-
opt.separator ''
|
58
|
-
opt.on(
|
59
|
-
'-t', '--threads INT', Integer, "Threads to use (default: #{o[:threads]})"
|
60
|
-
) { |v| o[:threads] = v }
|
61
|
-
opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
|
62
|
-
opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
|
63
|
-
opt.on('-h', '--help', 'Display this screen') do
|
64
|
-
puts opt
|
65
|
-
exit
|
66
|
-
end
|
67
|
-
opt.separator ''
|
68
|
-
end.parse!
|
69
|
-
|
70
|
-
$QUIET = o[:q]
|
71
|
-
|
72
|
-
# Functions
|
73
|
-
|
74
|
-
##
|
75
|
-
# Parses one line +ln+ in SAM format and outputs filtered lines to +ofh+
|
76
|
-
# Filters by minimum +identity+ and +target+ sequences, and prints
|
77
|
-
# the headers if +header+
|
78
|
-
def parse_sam_line(ln, identity, target, header, ofh)
|
79
|
-
if ln =~ /^@/ || ln =~ /^\s*$/
|
80
|
-
ofh.puts ln if header
|
81
|
-
return
|
82
|
-
end
|
83
|
-
|
84
|
-
# No match
|
85
|
-
row = ln.chomp.split("\t")
|
86
|
-
return if row[2] == '*'
|
87
|
-
|
88
|
-
# Filter by target
|
89
|
-
return if !target.nil? && !target.include?(row[2])
|
90
|
-
|
91
|
-
# Exclude unless concordant or unaligned
|
92
|
-
length = row[9].size
|
93
|
-
row.shift(11) # Discard non-flag columns
|
94
|
-
flags = Hash[row.map { |i| i.sub(/:.:/, ':').split(':', 2) }]
|
95
|
-
return if flags['YT'] && !%w[CP UU].include?(flags['YT'])
|
96
|
-
|
97
|
-
# Filter by identity
|
98
|
-
unless flags['MD']
|
99
|
-
raise Enveomics::ParseError.new(
|
100
|
-
"SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
|
101
|
-
)
|
102
|
-
end
|
103
|
-
mismatches = flags['MD'].scan(/[^\d]/).count
|
104
|
-
id = 100.0 * (length - mismatches) / length
|
105
|
-
ofh.puts ln if id >= identity
|
106
|
-
end
|
107
|
-
|
108
|
-
# Reading targets
|
109
|
-
if o[:g]
|
110
|
-
say 'Loading target sequences to filter'
|
111
|
-
reader = reader(o[:g])
|
112
|
-
target =
|
113
|
-
case o[:g_format]
|
114
|
-
when :fasta
|
115
|
-
reader.each.map { |ln| $1 if ln =~ /^>(\S+)/ }.compact
|
116
|
-
when :list
|
117
|
-
reader.each.map(&:chomp)
|
118
|
-
else
|
119
|
-
raise Enveomics::OptionError.new(
|
120
|
-
"Unsupported target sequences format: #{o[:g_format]}"
|
121
|
-
)
|
122
|
-
end
|
123
|
-
reader.close
|
124
|
-
else
|
125
|
-
target = nil
|
126
|
-
end
|
127
|
-
|
128
|
-
# Reading and filtering mapping
|
129
|
-
say 'Reading mapping file'
|
130
|
-
ofh = writer(o[:o])
|
131
|
-
case o[:m_format]
|
132
|
-
when :sam
|
133
|
-
reader = reader(o[:m])
|
134
|
-
reader.each { |ln| parse_sam_line(ln, o[:identity], target, o[:header], ofh) }
|
135
|
-
reader.close
|
136
|
-
when :bam
|
137
|
-
cmd = ['samtools', 'view', o[:m], '-@', o[:threads]]
|
138
|
-
cmd << '-h' if o[:header]
|
139
|
-
IO.popen(cmd.shelljoin) do |fh|
|
140
|
-
fh.each { |ln| parse_sam_line(ln, o[:identity], target, o[:header], ofh) }
|
141
|
-
end
|
142
|
-
else
|
143
|
-
raise Enveomics::OptionError.new(
|
144
|
-
"Unsupported mapping format: #{o[:m_format]}"
|
145
|
-
)
|
146
|
-
end
|
147
|
-
ofh.close
|
148
|
-
|