miga-base 1.2.15.2 → 1.2.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/download/gtdb.rb +4 -1
- data/lib/miga/cli/action/gtdb_get.rb +4 -0
- data/lib/miga/daemon.rb +4 -1
- data/lib/miga/lair.rb +6 -4
- data/lib/miga/remote_dataset/download.rb +3 -2
- data/lib/miga/remote_dataset.rb +25 -7
- data/lib/miga/taxonomy.rb +6 -0
- data/lib/miga/version.rb +2 -2
- metadata +6 -302
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +0 -41964
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +0 -32439
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -62056
- data/utils/FastAAI/FastAAI +0 -3659
- data/utils/FastAAI/FastAAI-legacy/FastAAI +0 -1336
- data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +0 -1296
- data/utils/FastAAI/README.md +0 -84
- data/utils/enveomics/Docs/recplot2.md +0 -244
- data/utils/enveomics/Examples/aai-matrix.bash +0 -66
- data/utils/enveomics/Examples/ani-matrix.bash +0 -66
- data/utils/enveomics/Examples/essential-phylogeny.bash +0 -105
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +0 -100
- data/utils/enveomics/LICENSE.txt +0 -73
- data/utils/enveomics/Makefile +0 -52
- data/utils/enveomics/Manifest/Tasks/aasubs.json +0 -103
- data/utils/enveomics/Manifest/Tasks/blasttab.json +0 -790
- data/utils/enveomics/Manifest/Tasks/distances.json +0 -161
- data/utils/enveomics/Manifest/Tasks/fasta.json +0 -802
- data/utils/enveomics/Manifest/Tasks/fastq.json +0 -291
- data/utils/enveomics/Manifest/Tasks/graphics.json +0 -126
- data/utils/enveomics/Manifest/Tasks/mapping.json +0 -137
- data/utils/enveomics/Manifest/Tasks/ogs.json +0 -382
- data/utils/enveomics/Manifest/Tasks/other.json +0 -906
- data/utils/enveomics/Manifest/Tasks/remote.json +0 -355
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +0 -650
- data/utils/enveomics/Manifest/Tasks/tables.json +0 -308
- data/utils/enveomics/Manifest/Tasks/trees.json +0 -68
- data/utils/enveomics/Manifest/Tasks/variants.json +0 -111
- data/utils/enveomics/Manifest/categories.json +0 -165
- data/utils/enveomics/Manifest/examples.json +0 -162
- data/utils/enveomics/Manifest/tasks.json +0 -4
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
- data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
- data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
- data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
- data/utils/enveomics/README.md +0 -42
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +0 -171
- data/utils/enveomics/Scripts/Aln.cat.rb +0 -221
- data/utils/enveomics/Scripts/Aln.convert.pl +0 -35
- data/utils/enveomics/Scripts/AlphaDiversity.pl +0 -152
- data/utils/enveomics/Scripts/BedGraph.tad.rb +0 -93
- data/utils/enveomics/Scripts/BedGraph.window.rb +0 -71
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +0 -102
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +0 -63
- data/utils/enveomics/Scripts/BlastTab.advance.bash +0 -48
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +0 -55
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +0 -104
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +0 -76
- data/utils/enveomics/Scripts/BlastTab.filter.pl +0 -47
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +0 -194
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +0 -104
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +0 -157
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +0 -48
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +0 -86
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +0 -119
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +0 -86
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +0 -47
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +0 -114
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +0 -90
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +0 -123
- data/utils/enveomics/Scripts/Chao1.pl +0 -97
- data/utils/enveomics/Scripts/CharTable.classify.rb +0 -234
- data/utils/enveomics/Scripts/EBIseq2tax.rb +0 -83
- data/utils/enveomics/Scripts/FastA.N50.pl +0 -60
- data/utils/enveomics/Scripts/FastA.extract.rb +0 -152
- data/utils/enveomics/Scripts/FastA.filter.pl +0 -52
- data/utils/enveomics/Scripts/FastA.filterLen.pl +0 -28
- data/utils/enveomics/Scripts/FastA.filterN.pl +0 -60
- data/utils/enveomics/Scripts/FastA.fragment.rb +0 -100
- data/utils/enveomics/Scripts/FastA.gc.pl +0 -42
- data/utils/enveomics/Scripts/FastA.interpose.pl +0 -93
- data/utils/enveomics/Scripts/FastA.length.pl +0 -38
- data/utils/enveomics/Scripts/FastA.mask.rb +0 -89
- data/utils/enveomics/Scripts/FastA.per_file.pl +0 -36
- data/utils/enveomics/Scripts/FastA.qlen.pl +0 -57
- data/utils/enveomics/Scripts/FastA.rename.pl +0 -65
- data/utils/enveomics/Scripts/FastA.revcom.pl +0 -23
- data/utils/enveomics/Scripts/FastA.sample.rb +0 -98
- data/utils/enveomics/Scripts/FastA.slider.pl +0 -85
- data/utils/enveomics/Scripts/FastA.split.pl +0 -55
- data/utils/enveomics/Scripts/FastA.split.rb +0 -79
- data/utils/enveomics/Scripts/FastA.subsample.pl +0 -131
- data/utils/enveomics/Scripts/FastA.tag.rb +0 -65
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +0 -69
- data/utils/enveomics/Scripts/FastA.wrap.rb +0 -48
- data/utils/enveomics/Scripts/FastQ.filter.pl +0 -54
- data/utils/enveomics/Scripts/FastQ.interpose.pl +0 -90
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +0 -89
- data/utils/enveomics/Scripts/FastQ.offset.pl +0 -90
- data/utils/enveomics/Scripts/FastQ.split.pl +0 -53
- data/utils/enveomics/Scripts/FastQ.tag.rb +0 -70
- data/utils/enveomics/Scripts/FastQ.test-error.rb +0 -81
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +0 -24
- data/utils/enveomics/Scripts/GFF.catsbj.pl +0 -127
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +0 -84
- data/utils/enveomics/Scripts/HMM.essential.rb +0 -351
- data/utils/enveomics/Scripts/HMM.haai.rb +0 -168
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +0 -83
- data/utils/enveomics/Scripts/JPlace.distances.rb +0 -88
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +0 -320
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +0 -81
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +0 -198
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +0 -35
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +0 -49
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +0 -92
- data/utils/enveomics/Scripts/Newick.autoprune.R +0 -27
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +0 -228
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +0 -32
- data/utils/enveomics/Scripts/RefSeq.download.bash +0 -48
- data/utils/enveomics/Scripts/SRA.download.bash +0 -55
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +0 -36
- data/utils/enveomics/Scripts/TRIBS.test.R +0 -39
- data/utils/enveomics/Scripts/Table.barplot.R +0 -31
- data/utils/enveomics/Scripts/Table.df2dist.R +0 -30
- data/utils/enveomics/Scripts/Table.filter.pl +0 -61
- data/utils/enveomics/Scripts/Table.merge.pl +0 -77
- data/utils/enveomics/Scripts/Table.prefScore.R +0 -60
- data/utils/enveomics/Scripts/Table.replace.rb +0 -69
- data/utils/enveomics/Scripts/Table.round.rb +0 -63
- data/utils/enveomics/Scripts/Table.split.pl +0 -57
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +0 -227
- data/utils/enveomics/Scripts/VCF.KaKs.rb +0 -147
- data/utils/enveomics/Scripts/VCF.SNPs.rb +0 -88
- data/utils/enveomics/Scripts/aai.rb +0 -421
- data/utils/enveomics/Scripts/ani.rb +0 -362
- data/utils/enveomics/Scripts/anir.rb +0 -137
- data/utils/enveomics/Scripts/clust.rand.rb +0 -102
- data/utils/enveomics/Scripts/gi2tax.rb +0 -103
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +0 -96
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +0 -293
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +0 -175
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +0 -24
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +0 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +0 -30
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +0 -253
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +0 -88
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +0 -182
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +0 -49
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +0 -74
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +0 -237
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +0 -31
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +0 -152
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +0 -3
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +0 -74
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +0 -135
- data/utils/enveomics/Scripts/ogs.annotate.rb +0 -88
- data/utils/enveomics/Scripts/ogs.core-pan.rb +0 -160
- data/utils/enveomics/Scripts/ogs.extract.rb +0 -125
- data/utils/enveomics/Scripts/ogs.mcl.rb +0 -186
- data/utils/enveomics/Scripts/ogs.rb +0 -104
- data/utils/enveomics/Scripts/ogs.stats.rb +0 -131
- data/utils/enveomics/Scripts/rbm-legacy.rb +0 -172
- data/utils/enveomics/Scripts/rbm.rb +0 -108
- data/utils/enveomics/Scripts/sam.filter.rb +0 -148
- data/utils/enveomics/Tests/Makefile +0 -10
- data/utils/enveomics/Tests/Mgen_M2288.faa +0 -3189
- data/utils/enveomics/Tests/Mgen_M2288.fna +0 -8282
- data/utils/enveomics/Tests/Mgen_M2321.fna +0 -8288
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +0 -2970
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +0 -7
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +0 -17
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +0 -137
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +0 -123
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +0 -200
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +0 -55
- data/utils/enveomics/Tests/alkB.nwk +0 -1
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +0 -13
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +0 -17
- data/utils/enveomics/Tests/hiv1.faa +0 -59
- data/utils/enveomics/Tests/hiv1.fna +0 -134
- data/utils/enveomics/Tests/hiv2.faa +0 -70
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +0 -233
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +0 -1
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +0 -233
- data/utils/enveomics/Tests/phyla_counts.tsv +0 -10
- data/utils/enveomics/Tests/primate_lentivirus.ogs +0 -11
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +0 -9
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +0 -8
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +0 -6
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +0 -9
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +0 -6
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +0 -6
- data/utils/enveomics/build_enveomics_r.bash +0 -45
- data/utils/enveomics/enveomics.R/DESCRIPTION +0 -31
- data/utils/enveomics/enveomics.R/NAMESPACE +0 -39
- data/utils/enveomics/enveomics.R/R/autoprune.R +0 -155
- data/utils/enveomics/enveomics.R/R/barplot.R +0 -184
- data/utils/enveomics/enveomics.R/R/cliopts.R +0 -135
- data/utils/enveomics/enveomics.R/R/df2dist.R +0 -154
- data/utils/enveomics/enveomics.R/R/growthcurve.R +0 -331
- data/utils/enveomics/enveomics.R/R/prefscore.R +0 -79
- data/utils/enveomics/enveomics.R/R/recplot.R +0 -354
- data/utils/enveomics/enveomics.R/R/recplot2.R +0 -1631
- data/utils/enveomics/enveomics.R/R/tribs.R +0 -583
- data/utils/enveomics/enveomics.R/R/utils.R +0 -80
- data/utils/enveomics/enveomics.R/README.md +0 -81
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -16
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -16
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -16
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +0 -25
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +0 -46
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +0 -47
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +0 -103
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +0 -67
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +0 -24
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +0 -44
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +0 -47
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +0 -75
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +0 -50
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +0 -44
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +0 -139
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +0 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +0 -77
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +0 -25
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +0 -21
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +0 -47
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +0 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +0 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -36
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +0 -27
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +0 -52
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +0 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +0 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +0 -43
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +0 -82
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +0 -59
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +0 -27
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +0 -36
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +0 -23
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +0 -68
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +0 -28
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +0 -27
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +0 -14
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +0 -13
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +0 -78
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +0 -46
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +0 -45
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +0 -125
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +0 -19
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +0 -19
- data/utils/enveomics/globals.mk +0 -8
- data/utils/enveomics/manifest.json +0 -9
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +0 -67
- data/utils/multitrim/multitrim.py +0 -1555
- data/utils/multitrim/multitrim.yml +0 -13
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
|
5
|
-
# @license Artistic-2.0
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
require 'optparse'
|
|
9
|
-
|
|
10
|
-
o = {q: false}
|
|
11
|
-
ARGV << '-h' if ARGV.size==0
|
|
12
|
-
|
|
13
|
-
OptionParser.new do |opt|
|
|
14
|
-
opt.banner = "
|
|
15
|
-
Extracts a list of sequences and/or coordinates from multi-FastA files.
|
|
16
|
-
|
|
17
|
-
Usage: #{$0} [options]"
|
|
18
|
-
opt.separator ''
|
|
19
|
-
opt.separator 'Mandatory'
|
|
20
|
-
opt.on('-i', '--in PATH', 'Input FastA file.'){ |v| o[:i] = v }
|
|
21
|
-
opt.on('-o', '--out PATH', 'Output FastA file.'){ |v| o[:o] = v }
|
|
22
|
-
opt.on('-c', '--coords STRING',
|
|
23
|
-
'Comma-delimited list of coordinates (mandatory unless -C is passed).',
|
|
24
|
-
'The format of the coordinates is "SEQ:FROM..TO" or "SEQ:FROM~LEN":',
|
|
25
|
-
'SEQ: Sequence ID, or * (asterisk) to extract range from all sequences',
|
|
26
|
-
'FROM: Integer, position of the first base to include (can be negative)',
|
|
27
|
-
'TO: Integer, last base to include (can be negative)',
|
|
28
|
-
'LEN: Length of the range to extract'
|
|
29
|
-
){ |v| o[:c] = v }
|
|
30
|
-
opt.separator ''
|
|
31
|
-
opt.separator 'Options'
|
|
32
|
-
opt.on('-C', '--coords-file PATH',
|
|
33
|
-
'File containing the coordinates, one per line.',
|
|
34
|
-
'Each line must follow the format described for -c.'){ |v| o[:C] = v }
|
|
35
|
-
opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
|
|
36
|
-
opt.on('-h', '--help', 'Display this screen.') do
|
|
37
|
-
puts opt
|
|
38
|
-
exit
|
|
39
|
-
end
|
|
40
|
-
opt.separator ''
|
|
41
|
-
end.parse!
|
|
42
|
-
abort '-i is mandatory.' if o[:i].nil?
|
|
43
|
-
abort '-o is mandatory.' if o[:o].nil?
|
|
44
|
-
abort '-c is mandatory.' if o[:c].nil? and o[:C].nil?
|
|
45
|
-
|
|
46
|
-
# Classses to parse coordinates
|
|
47
|
-
class SeqCoords
|
|
48
|
-
attr :id, :from, :to, :length, :str
|
|
49
|
-
def initialize(str)
|
|
50
|
-
@str = str
|
|
51
|
-
m = /(\S+):(-?\d+)(~|\.\.)(-?\d+)/.match str
|
|
52
|
-
raise "Cannot parse coordinates: #{str}" if m.nil?
|
|
53
|
-
@id = m[1]
|
|
54
|
-
@from = m[2].to_i
|
|
55
|
-
if m[3] == '~'
|
|
56
|
-
@length = m[4].to_i
|
|
57
|
-
else
|
|
58
|
-
@to = m[4].to_i
|
|
59
|
-
end
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
def extract(id, seq)
|
|
63
|
-
return nil unless concerns? id
|
|
64
|
-
from_i = from > 0 ? from : seq.length + 1 + from
|
|
65
|
-
if to.nil?
|
|
66
|
-
seq[from_i, length]
|
|
67
|
-
else
|
|
68
|
-
to_i = to > 0 ? to : seq.length + 1 + to
|
|
69
|
-
seq[from_i .. to_i]
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
def concerns?(seq_id)
|
|
74
|
-
return true if id == '*'
|
|
75
|
-
return id == seq_id
|
|
76
|
-
end
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
class SeqCoordsCollection
|
|
80
|
-
class << self
|
|
81
|
-
def from_str(str)
|
|
82
|
-
c = new
|
|
83
|
-
str.split(',').each { |i| c << SeqCoords.new(i) }
|
|
84
|
-
c
|
|
85
|
-
end
|
|
86
|
-
def from_file(path)
|
|
87
|
-
c = new
|
|
88
|
-
File.open(path, 'r') do |fh|
|
|
89
|
-
fh.each{ |i| c << SeqCoords.new(i.chomp) }
|
|
90
|
-
end
|
|
91
|
-
c
|
|
92
|
-
end
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
attr :collection
|
|
96
|
-
|
|
97
|
-
def initialize
|
|
98
|
-
@collection = []
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
def <<(coords)
|
|
102
|
-
@collection << coords
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
def extract(id, seq)
|
|
106
|
-
@collection.map{ |c| c.extract(id, seq) }.compact
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
# Functions to parse sequences
|
|
111
|
-
def do_stuff(id, sq)
|
|
112
|
-
return if id.nil? or sq.empty?
|
|
113
|
-
@n_in += 1
|
|
114
|
-
sq.gsub!(/[^A-Za-z]/, '')
|
|
115
|
-
i = 0
|
|
116
|
-
@coll.extract(id, sq).each do |new_sq|
|
|
117
|
-
@ofh.puts ">#{id}:#{i += 1}"
|
|
118
|
-
@ofh.puts new_sq
|
|
119
|
-
@n_out += 1
|
|
120
|
-
end
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
# Parse coordinates
|
|
124
|
-
$stderr.puts 'Parsing coordinates' unless o[:q]
|
|
125
|
-
@coll = o[:c].nil? ? SeqCoordsCollection.from_file(o[:C]) :
|
|
126
|
-
SeqCoordsCollection.from_str(o[:c])
|
|
127
|
-
$stderr.puts " Coordinates found: #{@coll.collection.size}"
|
|
128
|
-
|
|
129
|
-
# Parse sequences
|
|
130
|
-
$stderr.puts 'Parsing sequences' unless o[:q]
|
|
131
|
-
@n_in = 0
|
|
132
|
-
@n_out = 0
|
|
133
|
-
@ofh = File.open(o[:o], 'w')
|
|
134
|
-
File.open(o[:i], 'r') do |fh|
|
|
135
|
-
id = nil
|
|
136
|
-
sq = ''
|
|
137
|
-
fh.each do |ln|
|
|
138
|
-
next if ln =~ /^;/
|
|
139
|
-
if ln =~ /^>(\S+)/
|
|
140
|
-
id = $1
|
|
141
|
-
do_stuff(id, sq)
|
|
142
|
-
sq = ''
|
|
143
|
-
else
|
|
144
|
-
sq << ln
|
|
145
|
-
end
|
|
146
|
-
end
|
|
147
|
-
do_stuff(id, sq)
|
|
148
|
-
end
|
|
149
|
-
@ofh.close
|
|
150
|
-
$stderr.puts " Input sequences: #{@n_in}"
|
|
151
|
-
$stderr.puts " Output fragments: #{@n_out}"
|
|
152
|
-
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env perl
|
|
2
|
-
#
|
|
3
|
-
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
|
4
|
-
# @update Oct-07-2015
|
|
5
|
-
# @license artistic license 2.0
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
use warnings;
|
|
9
|
-
use strict;
|
|
10
|
-
use Getopt::Std;
|
|
11
|
-
|
|
12
|
-
sub HELP_MESSAGE { die "
|
|
13
|
-
.Description:
|
|
14
|
-
Extracts a subset of sequences from a FastA file.
|
|
15
|
-
|
|
16
|
-
.Usage: $0 [options] list.txt seqs.fa > subset.fa
|
|
17
|
-
|
|
18
|
-
[options]
|
|
19
|
-
-r Reverse list. Extracts sequences NOT present in the list.
|
|
20
|
-
-q Runs quietly.
|
|
21
|
-
-h Prints this message and exits.
|
|
22
|
-
|
|
23
|
-
[mandatory]
|
|
24
|
-
list.txt List of sequences to extract.
|
|
25
|
-
seqs.fa FastA file containing the superset of sequences.
|
|
26
|
-
subset.fa FastA file to be created.
|
|
27
|
-
|
|
28
|
-
" }
|
|
29
|
-
|
|
30
|
-
my %o=();
|
|
31
|
-
getopts('rhq', \%o);
|
|
32
|
-
my($list, $fa) = @ARGV;
|
|
33
|
-
($list and $fa) or &HELP_MESSAGE;
|
|
34
|
-
$o{h} and &HELP_MESSAGE;
|
|
35
|
-
|
|
36
|
-
print STDERR "Reading list.\n" unless $o{q};
|
|
37
|
-
open LI, "<", $list or die "Cannot read file: $list: $!\n";
|
|
38
|
-
my %li = map { chomp; $_ => 1 } <LI>;
|
|
39
|
-
close LI;
|
|
40
|
-
|
|
41
|
-
print STDERR "Filtering FastA.\n" unless $o{q};
|
|
42
|
-
open FA, "<", $fa or die "Cannot read file: $fa: $!\n";
|
|
43
|
-
my $good = 0;
|
|
44
|
-
while(my $ln = <FA>){
|
|
45
|
-
next if $ln =~ /^;/;
|
|
46
|
-
chomp $ln;
|
|
47
|
-
if($ln =~ m/^>((\S+).*)/){ $good = (exists $li{$1} or exists $li{">$1"} or exists $li{$2} or exists $li{$ln}) }
|
|
48
|
-
elsif($ln =~ m/^>/){ $good=$o{r}; print STDERR "Warning: Non-cannonical defline, line $.: $ln\n" }
|
|
49
|
-
print "$ln\n" if (($good and not $o{r}) or ($o{r} and not $good));
|
|
50
|
-
}
|
|
51
|
-
close FA;
|
|
52
|
-
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env perl
|
|
2
|
-
|
|
3
|
-
use warnings;
|
|
4
|
-
use strict;
|
|
5
|
-
use Bio::SeqIO;
|
|
6
|
-
|
|
7
|
-
my $file = $ARGV[0];
|
|
8
|
-
my $min = $ARGV[1];
|
|
9
|
-
($file and $min) or die <<HELP
|
|
10
|
-
|
|
11
|
-
This script will filter a multi fastA file by length
|
|
12
|
-
|
|
13
|
-
Usage "perl $0 fastafile minlenght "
|
|
14
|
-
HELP
|
|
15
|
-
;
|
|
16
|
-
my $seq_in = Bio::SeqIO->new( -format => 'fasta',-file => $file);
|
|
17
|
-
|
|
18
|
-
while( my $seq1 = $seq_in->next_seq() ) {
|
|
19
|
-
|
|
20
|
-
my $id = $seq1->primary_id;
|
|
21
|
-
chomp $id;
|
|
22
|
-
my $seq = $seq1->seq;
|
|
23
|
-
chomp $seq;
|
|
24
|
-
my $lseq = length($seq);
|
|
25
|
-
if($lseq>=$min){
|
|
26
|
-
print ">$id","\n",$seq,"\n";
|
|
27
|
-
}
|
|
28
|
-
}
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env perl
|
|
2
|
-
#
|
|
3
|
-
# @author Luis M. Rodriguez-R
|
|
4
|
-
# @update Oct-07-2015
|
|
5
|
-
# @license artistic license 2.0
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
use warnings;
|
|
9
|
-
use strict;
|
|
10
|
-
|
|
11
|
-
my($file, $content, $stretch) = @ARGV;
|
|
12
|
-
$file or die <<HELP
|
|
13
|
-
|
|
14
|
-
Description:
|
|
15
|
-
Filter sequences by N-content and presence of long homopolymers.
|
|
16
|
-
Usage:
|
|
17
|
-
$0 sequences.fa [content [stretch]] > filtered.fa
|
|
18
|
-
Where:
|
|
19
|
-
sequences.fa Input file in FastA format
|
|
20
|
-
content A number between 0 and 1 indicating the maximum proportion of Ns
|
|
21
|
-
(1 to turn off, 0.5 by default)
|
|
22
|
-
stretch A number indicating the maximum number of consecutive identical
|
|
23
|
-
nucleotides allowed (0 to turn off, 100 by default)
|
|
24
|
-
filtered.fa Filtered set of sequences.
|
|
25
|
-
|
|
26
|
-
HELP
|
|
27
|
-
;
|
|
28
|
-
($content ||= 0.5)+=0;
|
|
29
|
-
($stretch ||= 100)+=0;
|
|
30
|
-
|
|
31
|
-
my $good = 0;
|
|
32
|
-
my $N = 0;
|
|
33
|
-
|
|
34
|
-
FASTA: {
|
|
35
|
-
local $/ = "\n>";
|
|
36
|
-
open FILE, "<", $file or die "I can not open the file: $file: $!\n";
|
|
37
|
-
SEQ: while(<FILE>){
|
|
38
|
-
$N++;
|
|
39
|
-
s/^;.*//gm;
|
|
40
|
-
s/>//g;
|
|
41
|
-
my($n,$s) = split /\n/, $_, 2;
|
|
42
|
-
(my $clean = $s) =~ s/[^ACTGN]//g;
|
|
43
|
-
if($content < 1){
|
|
44
|
-
(my $Ns = $clean) =~ s/[^N]//g;
|
|
45
|
-
next SEQ if length($Ns)>length($clean)*$content;
|
|
46
|
-
}
|
|
47
|
-
if($stretch > 0){
|
|
48
|
-
for my $nuc (qw(A C T G N)){
|
|
49
|
-
next SEQ if $clean =~ m/[$nuc]{$stretch}/;
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
print ">$n\n$s\n";
|
|
53
|
-
$good++;
|
|
54
|
-
}
|
|
55
|
-
close FILE;
|
|
56
|
-
print STDERR "Total sequences: $N\nAfter filtering: $good\n";
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
# frozen_string_literal: true
|
|
4
|
-
|
|
5
|
-
$:.push File.expand_path('../lib', __FILE__)
|
|
6
|
-
require 'enveomics_rb/enveomics'
|
|
7
|
-
require 'enveomics_rb/stats'
|
|
8
|
-
$VERSION = 1.0
|
|
9
|
-
|
|
10
|
-
o = { q: false, completeness: nil, minlen: 500, shuffle: true }
|
|
11
|
-
OptionParser.new do |opts|
|
|
12
|
-
opts.version = $VERSION
|
|
13
|
-
Enveomics.opt_banner(
|
|
14
|
-
opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
|
|
15
|
-
"#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
opts.separator 'Mandatory'
|
|
19
|
-
opts.on(
|
|
20
|
-
'-i', '--in FILE',
|
|
21
|
-
'Path to the FastA file containing the complete sequences',
|
|
22
|
-
'Supports compression with .gz extension, use - for STDIN'
|
|
23
|
-
) { |v| o[:in] = v }
|
|
24
|
-
opts.on(
|
|
25
|
-
'-o', '--out FILE', 'Path to the FastA to create',
|
|
26
|
-
'Supports compression with .gz extension, use - for STDOUT'
|
|
27
|
-
) { |v| o[:out] = v }
|
|
28
|
-
opts.on(
|
|
29
|
-
'-c', '--completeness FLOAT',
|
|
30
|
-
'Fraction of genome completeness to simulate from 0 to 1'
|
|
31
|
-
) { |v| o[:completeness] = v.to_f }
|
|
32
|
-
|
|
33
|
-
opts.separator ''
|
|
34
|
-
opts.separator 'Options'
|
|
35
|
-
opts.on(
|
|
36
|
-
'-m', '--minlen INT',
|
|
37
|
-
"Minimum fragment length to report. By default: #{o[:minlen]}"
|
|
38
|
-
) { |v| o[:minlen] = v.to_i }
|
|
39
|
-
opts.on(
|
|
40
|
-
'-s', '--sorted', 'Keep fragments sorted as in the input file',
|
|
41
|
-
'By default, fragments are shuffled'
|
|
42
|
-
) { |v| o[:shuffle] = !v }
|
|
43
|
-
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
|
44
|
-
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
|
45
|
-
opts.separator ''
|
|
46
|
-
end.parse!
|
|
47
|
-
|
|
48
|
-
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
|
49
|
-
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
|
50
|
-
raise Enveomics::OptionError.new('-c is mandatory') if o[:completeness].nil?
|
|
51
|
-
|
|
52
|
-
begin
|
|
53
|
-
# Read input sequences
|
|
54
|
-
g_id = []
|
|
55
|
-
g_seq = []
|
|
56
|
-
ifh = reader(o[:in])
|
|
57
|
-
id = ''
|
|
58
|
-
ifh.each_line do |ln|
|
|
59
|
-
if ln =~ /^>(\S*)/
|
|
60
|
-
g_id << $1
|
|
61
|
-
g_seq << ''
|
|
62
|
-
else
|
|
63
|
-
g_seq[g_seq.size - 1] += ln.gsub(/[^A-Za-z]/, '')
|
|
64
|
-
end
|
|
65
|
-
end
|
|
66
|
-
ifh.close
|
|
67
|
-
|
|
68
|
-
# Fragment genomes
|
|
69
|
-
f = {}
|
|
70
|
-
binlen = [1, (o[:minlen].to_f/(1.5**2)).ceil].max
|
|
71
|
-
p = [0.001, [1.0, 1.0 - (o[:completeness]/1.25 + 0.1)].min].max
|
|
72
|
-
while !g_seq.empty?
|
|
73
|
-
id = g_id.shift
|
|
74
|
-
seq = g_seq.shift
|
|
75
|
-
gL = seq.length
|
|
76
|
-
while !seq.empty?
|
|
77
|
-
rand_x =
|
|
78
|
-
Enveomics::Stats.r_geom(p).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
|
|
79
|
-
fL = [0, (rand_x * binlen).round].max
|
|
80
|
-
f["#{f.size+1}_#{id}"] = seq[0, fL] if fL >= o[:minlen]
|
|
81
|
-
seq = seq[(fL + 1) .. -1]
|
|
82
|
-
seq = '' if seq.nil?
|
|
83
|
-
end
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
# Save output
|
|
87
|
-
k = f.keys
|
|
88
|
-
k.shuffle! if o[:shuffle]
|
|
89
|
-
ofh = writer(o[:out])
|
|
90
|
-
k.each do |id|
|
|
91
|
-
ofh.puts ">#{id}"
|
|
92
|
-
ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
|
|
93
|
-
end
|
|
94
|
-
ofh.close
|
|
95
|
-
rescue => err
|
|
96
|
-
$stderr.puts "Exception: #{err}\n\n"
|
|
97
|
-
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
|
98
|
-
err
|
|
99
|
-
end
|
|
100
|
-
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env perl
|
|
2
|
-
#
|
|
3
|
-
# @author Luis M Rodriguez-R
|
|
4
|
-
# @update Mar-23-2016
|
|
5
|
-
# @license artistic license 2.0
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
use warnings;
|
|
9
|
-
use strict;
|
|
10
|
-
|
|
11
|
-
# Require at least one input file; otherwise print the usage text and stop.
$#ARGV>=0 or die "
Usage:
$0 seqs.fa... > gc.txt

seqs.fa One or more FastA files.
gc.txt A table with the G+C content of the sequences.

";

# Print one "<id>\t<G+C fraction>" line per sequence of every input file.
# Only A, C, G and T (either case) count towards the length, and sequences
# without any such residue are not reported (which also avoids dividing by
# zero). Lines starting with ';' are treated as comments and skipped.
for my $fa (@ARGV){
    open my $fa_fh, "<", $fa or die "Cannot open file: $fa: $!\n";
    my $def = "";
    my $len = 0;
    my $gc  = 0;
    while(my $ln = <$fa_fh>){
        next if $ln =~ /^;/;
        if($ln =~ m/^>(\S*)/){
            # A new defline closes the previous record.
            my $next_def = $1;
            print "$def\t".($gc/$len)."\n" if $len;
            $def = $next_def;
            $len = 0;
            $gc  = 0;
        }else{
            # tr/// with an empty replacement counts without modifying $ln.
            # Both cases are counted so that soft-masked (lower-case) G and C
            # contribute to the G+C total as well as to the length.
            $len += ($ln =~ tr/ACGTacgt//);
            $gc  += ($ln =~ tr/GCgc//);
        }
    }
    # Report the last record of the file.
    print "$def\t".($gc/$len)."\n" if $len;
    close $fa_fh;
}
|
|
42
|
-
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env perl
|
|
2
|
-
|
|
3
|
-
# @author Luis M. Rodriguez-R
|
|
4
|
-
# @license artistic license 2.0
|
|
5
|
-
|
|
6
|
-
use strict;
|
|
7
|
-
use warnings;
|
|
8
|
-
use Symbol;
|
|
9
|
-
|
|
10
|
-
# Help text shown when the mandatory arguments are missing.
my $HELP = <<"HELP";

Description:
Interposes sequences in FastA format from two files into one output file.
If more than two files are provided, the script will interpose all the input
files.
Note that this script will check for the consistency of the names (assuming
a pair of related reads contains the same name varying only in a trailing
slash (/) followed by a digit). If you want to turn this feature off just
set the -T option to zero. If you want to increase the sampling period (to
speed the script up) or decrease it (to make it more sensitive to errors)
just change the -T option accordingly.

Usage:
$0 [-T <int> ]<output_fasta> <input_fasta_1> <input_fasta_2> [additional input files...]

Where,
-T <int> : Optional. Integer indicating the sampling period for
names evaluation (see Description above).
By default: 1000.
output_fasta : Output file
input_fasta_1 : First FastA file
input_fasta_2 : Second FastA file
... : Any additional FastA files (or none)

HELP

# Names are compared once every $eval_T entries; zero disables the check.
my $eval_T = 1000;
if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
    $eval_T = $ARGV[1]+0;
    shift @ARGV;
    shift @ARGV;
}
my $out = shift @ARGV;
my @in = @ARGV;
# Read whole FastA records: each readline() returns the text up to and
# including the "\n>" that opens the next record.
$/ = "\n>";

die $HELP unless $out and $#in >= 1;
open my $out_fh, ">", $out or die "Unable to write on $out: $!\n";
print "Output file: $out\n";

# One read handle per input file, in command-line order.
my @in_fh = ();
for my $k (0 .. $#in) {
    open my $fh, "<", $in[$k] or die "Unable to read $in[$k]: $!\n";
    push @in_fh, $fh;
    print "Input file: $in[$k]\n";
}

my $i = 0;    # Number of entries written per input file so far.
my $frl = 0;  # Residue count of the first non-empty entry seen.
LINE: while(1){
    my $name;  # Base name shared by the current set of entries.
    print STDERR "\rEntry: $i " unless $i % 1000;
    FILE: for my $k (0 .. $#in_fh){
        my $ln = readline($in_fh[$k]);
        # The first file dictates when to stop; any other file running out
        # of entries earlier is an error.
        last LINE if $k==0 and not defined $ln;
        defined $ln or die "Impossible to read next entry ($.) from $in[$k]: $!\n";
        $ln =~ s/^\>?/>/;   # Restore the '>' consumed as record separator.
        $ln =~ s/\>$//;     # Drop the '>' that belongs to the next record.
        $ln =~ s/^;.*//gm;  # Blank out comment lines.
        if($eval_T and not $i % $eval_T){
            unless(defined $name){
                $ln =~ m/^>(.*?)[\/ \\_]\d+/ or die "Impossible to evaluate names!\n offending entry:\n$ln\n";
                $name = $1;
            }
            # \Q...\E makes the base name match literally, so that names
            # containing characters such as | ( ) [ + . are compared as text
            # and not interpreted as a pattern.
            die "Inconsistent name!\n base name is $name\n offending entry is:\n$ln\n" unless $ln =~ /^>\Q$name\E/;
        }
        unless($frl){
            $ln =~ m/^>.*?\n(.*?)\n/ or die "Unexpected format!\n offending entry:\n$ln\n";
            my $residues = $ln;
            $residues =~ s/^>.*?\n//;
            $residues =~ s/\n//g;
            $frl = length $residues;
        }
        print {$out_fh} $ln;
    }
    $i++;
}
print "\rNumber of entries: $i \nFirst read length: $frl\n";
# Buffered write errors only surface when the handle is closed.
close $out_fh or die "Unable to write on $out: $!\n";

# Warn about inputs holding more entries than the first file.
for my $k (0 .. $#in_fh){
    print "ALERT: The file $in[$k] contains trailing entries\n" if defined readline($in_fh[$k]);
    close $in_fh[$k];
}
|
|
93
|
-
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env perl
|
|
2
|
-
#
|
|
3
|
-
# @author Luis M Rodriguez-R
|
|
4
|
-
# @update Oct-07-2015
|
|
5
|
-
# @license artistic license 2.0
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
use warnings;
|
|
9
|
-
use strict;
|
|
10
|
-
|
|
11
|
-
# Require at least one input file; otherwise print the usage text and stop.
$#ARGV>=0 or die "
Usage:
$0 seqs.fa... > length.txt

seqs.fa One or more FastA files.
length.txt A table with the lengths of the sequences.

";

# Print one "<id>\t<length>" line per sequence of every input file. Only
# letters count towards the length. Lines starting with ';' are treated as
# comments and skipped, and residues found before the first defline of a
# file are ignored.
for my $fa (@ARGV){
    open my $fa_fh, "<", $fa or die "Cannot open file: $fa: $!\n";
    my $def;      # Undefined until the first defline of the file is seen.
    my $len = 0;
    while(my $ln = <$fa_fh>){
        next if $ln =~ /^;/;
        # Every line starting with '>' is a defline, even without an
        # identifier, so its text is never counted as residues of the
        # previous sequence.
        if($ln =~ m/^>(\S*)/){
            my $next_def = $1;
            # Test definedness, not truth: '0' is a valid identifier.
            print "$def\t$len\n" if defined $def;
            $def = $next_def;
            $len = 0;
        }else{
            # tr/// with an empty replacement counts without modifying $ln.
            $len += ($ln =~ tr/A-Za-z//);
        }
    }
    # Report the last record of the file.
    print "$def\t$len\n" if defined $def;
    close $fa_fh;
}
|
|
38
|
-
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
require 'optparse'
|
|
4
|
-
# Command-line options and their defaults.
o = {x: 'N', trim: false, wrap: 70}
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Mask sequence region(s) in a FastA file.

Usage: #{$0} [options]"
  opts.separator ''
  opts.separator 'Mandatory'
  opts.on('-i', '--in FILE', 'Input FastA file.'){ |v| o[:in] = v }
  opts.on('-o', '--out FILE', 'Output FastA file.'){ |v| o[:out] = v }
  opts.on('-r', '--regions REG1,REG2,...', Array,
    'Regions to mask separated by commas.',
    'Each region must be in the format "sequence_id:from..to"'
  ){ |v| o[:reg] = v }
  opts.separator ''
  opts.separator 'Options'
  opts.on('-x', '--symbol CHAR',
    'Character used to mask the region(s)',
    "By default: #{o[:x]}."){ |v| o[:x] = v }
  opts.on('-t', '--trim',
    'Trim masked regions extending to the edge of a sequence'
  ){ |v| o[:trim] = v }
  opts.on('-w', '--wrap INT',
    'Line length to wrap sequences. Use 0 to generate 1-line sequences.',
    "By default: #{o[:wrap]}."){ |v| o[:wrap] = v.to_i }
  opts.on('-h', '--help', 'Display this screen.') do
    puts opts
    exit
  end
  opts.separator ''
end.parse!
abort '-i is mandatory' if o[:in].nil?
abort '-o is mandatory' if o[:out].nil?
abort '-r is mandatory' if o[:reg].nil?
# A longer symbol would change the sequence length and shift the coordinates
# of any later region on the same sequence.
abort '-x must be a single character' unless o[:x].size == 1

# Returns +txt+ split into lines of at most +len+ characters, each ending in
# a newline. A +len+ of 0 returns the text on a single line, and an empty
# +txt+ returns an empty string.
def wrap_width(txt, len)
  return "" if txt.empty?
  return "#{txt}\n" if len==0
  txt.gsub(/(.{1,#{len}})/,"\\1\n")
end

# Read input sequences as { id => [defline, sequence] }, where id is the
# defline up to the first whitespace.
sq = {}
File.open(o[:in], 'r') do |ifh|
  ifh.each('>') do |i|
    (dln, seq) = i.split(/[\n\r]+/, 2)
    next if seq.nil?
    id = dln.gsub(/\s.*/, '')
    # Drop line breaks and the '>' that opens the next entry.
    seq.gsub!(/[\s>]/, '')
    sq[id] = [dln, seq]
  end
end

# Parse coordinates and mask regions
last_id = nil
o[:reg].each do |i|
  m = i.match(/^(?:(.+):)?(\d+)\.\.(\d+)$/) or
    abort "Unexpected region format: #{i}"
  # Coordinates are given 1-based and inclusive; store them 0-based.
  r = [m[1], m[2].to_i-1, m[3].to_i-1]
  if r[0].nil?
    # A region without sequence ID reuses the ID of the previous region.
    abort "Region missing sequence ID: #{i}" if last_id.nil?
    r[0] = last_id
  end
  last_id = r[0]
  sq[r[0]] or abort "Cannot find sequence #{r[0]}"
  r[1] <= r[2] or abort "Malformed range: #{i}"
  # The last valid 0-based index is size - 1; accepting an end equal to the
  # size would append one extra symbol to the sequence.
  if r[1] < 0 or r[2] >= sq[r[0]][1].size
    abort "Range extends beyond the edge of the sequence: #{i}"
  end
  sq[r[0]][1][r[1] .. r[2]] = o[:x]*(1+r[2]-r[1])
end

# Trim sequences and generate output
# The symbol is escaped so that characters such as . * + are matched
# literally when trimming.
mask_re = Regexp.escape(o[:x])
File.open(o[:out], 'w') do |ofh|
  sq.each do |_k,v|
    ofh.puts ">#{v[0]}"
    if o[:trim]
      v[1].sub!(/\A(?:#{mask_re})+/, '')
      v[1].sub!(/(?:#{mask_re})+\z/, '')
    end
    ofh.print wrap_width(v[1], o[:wrap])
  end
end
|
|
89
|
-
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env perl
|
|
2
|
-
#
|
|
3
|
-
# @author Luis M Rodriguez-R
|
|
4
|
-
# @update Mar-17-2016
|
|
5
|
-
# @license artistic license 2.0
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
use warnings;
|
|
9
|
-
use strict;
|
|
10
|
-
|
|
11
|
-
# Require the output directory and at least one input file; otherwise print
# the usage text and stop.
$#ARGV>=1 or die "
Usage:
$0 outdir seqs.fa...

outdir Output directory for the individual files.
seqs.fa One or more FastA files.

";

my $dir = shift @ARGV;

# Write every sequence of every input file to "<outdir>/<id>.fasta", where
# <id> is the defline up to the first whitespace. Lines starting with ';' are
# treated as comments and skipped, and lines found before the first defline
# of a file are dropped.
for my $fa (@ARGV){
    open my $fa_fh, "<", $fa or die "Cannot open file: $fa: $!\n";
    my $file = '';
    my $one_fh;  # Handle of the sequence file being written, if any.
    while(my $ln = <$fa_fh>){
        next if $ln =~ /^;/;
        if($ln =~ m/^>(\S+)\s?/){
            my $id = $1;
            # Buffered write errors only surface when the handle is closed.
            if(defined $one_fh){
                close $one_fh or die "Cannot write file: $file: $!\n";
            }
            # NOTE(review): the identifier is used verbatim as file name, so
            # an identifier containing '/' points outside of $dir or into a
            # missing subdirectory; consider validating it.
            $file = $dir."/".$id.".fasta";
            open my $new_fh, ">", $file or die "Cannot open file: $file: $!\n";
            $one_fh = $new_fh;
        }
        if(defined $one_fh){
            print {$one_fh} $ln or die "Cannot write file: $file: $!\n";
        }
    }
    if(defined $one_fh){
        close $one_fh or die "Cannot write file: $file: $!\n";
    }
    close $fa_fh;
}
|
|
36
|
-
|