miga-base 0.4.3.0 → 0.5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
data/lib/miga/daemon.rb
CHANGED
@@ -35,13 +35,16 @@ class MiGA::Daemon < MiGA::MiGA
|
|
35
35
|
|
36
36
|
##
|
37
37
|
# Initialize an inactive daemon for the MiGA::Project +project+. See #daemon
|
38
|
-
# to wake the daemon.
|
39
|
-
|
38
|
+
# to wake the daemon. If passed, +json+ must be the path to a daemon
|
39
|
+
# definition in json format. Otherwise, the project-stored daemon definition
|
40
|
+
# is used. In either case, missing variables are used as defined in
|
41
|
+
# ~/.miga_daemon.json.
|
42
|
+
def initialize(project, json = nil)
|
40
43
|
$_MIGA_DAEMON_LAIR << self
|
41
44
|
@project = project
|
45
|
+
json ||= File.expand_path('daemon/daemon.json', project.path)
|
42
46
|
@runopts = MiGA::Json.parse(
|
43
|
-
File.expand_path('
|
44
|
-
default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
|
47
|
+
json, default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
|
45
48
|
@jobs_to_run = []
|
46
49
|
@jobs_running = []
|
47
50
|
@loop_i = -1
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -35,14 +35,14 @@ module MiGA::Dataset::Base
|
|
35
35
|
##
|
36
36
|
# Supported dataset types.
|
37
37
|
@@KNOWN_TYPES = {
|
38
|
-
genome: {description: 'The genome from an isolate
|
39
|
-
scgenome: {description: 'A Single-cell
|
38
|
+
genome: {description: 'The genome from an isolate', multi: false},
|
39
|
+
scgenome: {description: 'A Single-cell Amplified Genome (SAG)',
|
40
40
|
multi: false},
|
41
|
-
popgenome: {description: 'A Metagenome-Assembled Genome (MAG)
|
41
|
+
popgenome: {description: 'A Metagenome-Assembled Genome (MAG)',
|
42
42
|
:multi=>false},
|
43
|
-
metagenome: {description: 'A metagenome (excluding viromes)
|
43
|
+
metagenome: {description: 'A metagenome (excluding viromes)',
|
44
44
|
multi: true},
|
45
|
-
virome: {description: 'A viral metagenome
|
45
|
+
virome: {description: 'A viral metagenome', multi: true}
|
46
46
|
}
|
47
47
|
|
48
48
|
##
|
data/lib/miga/project/base.rb
CHANGED
@@ -76,13 +76,13 @@ module MiGA::Project::Base
|
|
76
76
|
# Supported types of projects.
|
77
77
|
@@KNOWN_TYPES = {
|
78
78
|
mixed: {
|
79
|
-
description: "Mixed collection of genomes, metagenomes, and viromes
|
79
|
+
description: "Mixed collection of genomes, metagenomes, and viromes",
|
80
80
|
single: true, multi: true},
|
81
|
-
genomes: {description: "Collection of genomes
|
81
|
+
genomes: {description: "Collection of genomes",
|
82
82
|
single: true, multi: false},
|
83
|
-
clade: {description: "Collection of closely-related genomes (ANI >= 90%)
|
83
|
+
clade: {description: "Collection of closely-related genomes (ANI >= 90%)",
|
84
84
|
single: true, multi: false},
|
85
|
-
metagenomes: {description: "Collection of metagenomes and/or viromes
|
85
|
+
metagenomes: {description: "Collection of metagenomes and/or viromes",
|
86
86
|
single: false, multi: true}
|
87
87
|
}
|
88
88
|
|
data/lib/miga/project/result.rb
CHANGED
@@ -110,7 +110,8 @@ module MiGA::Project::Result
|
|
110
110
|
r.add_file(:proposal, 'miga-project.proposed-clades')
|
111
111
|
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
|
112
112
|
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
|
113
|
-
r.add_file(:
|
113
|
+
r.add_file(:clades_gsp, 'miga-project.gsp-clades')
|
114
|
+
r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
|
114
115
|
r
|
115
116
|
end
|
116
117
|
|
@@ -35,9 +35,9 @@ module MiGA::RemoteDataset::Base
|
|
35
35
|
@@UNIVERSE = {
|
36
36
|
web: {
|
37
37
|
dbs: {
|
38
|
-
assembly: {stage: :assembly, format: :fasta},
|
39
|
-
assembly_gz: {stage: :assembly, format: :fasta_gz},
|
40
|
-
text: {stage: :metadata, format: :text}
|
38
|
+
assembly: { stage: :assembly, format: :fasta },
|
39
|
+
assembly_gz: { stage: :assembly, format: :fasta_gz },
|
40
|
+
text: { stage: :metadata, format: :text }
|
41
41
|
},
|
42
42
|
url: '%2$s',
|
43
43
|
method: :net
|
@@ -59,8 +59,8 @@ module MiGA::RemoteDataset::Base
|
|
59
59
|
},
|
60
60
|
ncbi_map: {
|
61
61
|
dbs: {
|
62
|
-
nuccore: {stage: :metadata, map_to: [:biosample, :assembly],
|
63
|
-
format: :json},
|
62
|
+
nuccore: { stage: :metadata, map_to: [:biosample, :assembly],
|
63
|
+
format: :json },
|
64
64
|
biosample: {stage: :metadata, map_to: [:assembly], format: :json}
|
65
65
|
},
|
66
66
|
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
|
data/lib/miga/version.rb
CHANGED
@@ -10,15 +10,15 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.
|
13
|
+
VERSION = [0.5, 0, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
17
|
-
VERSION_NAME = '
|
17
|
+
VERSION_NAME = 'collotype'
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
21
|
-
VERSION_DATE = Date.new(2019,
|
21
|
+
VERSION_DATE = Date.new(2019, 11, 25)
|
22
22
|
|
23
23
|
##
|
24
24
|
# Reference of MiGA.
|
data/scripts/cds.bash
CHANGED
@@ -36,7 +36,9 @@ perl -pe 's/ID=([0-9]+_[0-9]+);/ID=gene_$1;/' "$DATASET.gff3" \
|
|
36
36
|
mv "$DATASET.gff3.t" "$DATASET.gff3"
|
37
37
|
|
38
38
|
# Gzip
|
39
|
-
|
39
|
+
for ext in gff3 faa fna ; do
|
40
|
+
[[ -e "$DATASET.$ext" ]] && gzip -9 -f "$DATASET.$ext"
|
41
|
+
done
|
40
42
|
|
41
43
|
# Finalize
|
42
44
|
miga date > "$DATASET.done"
|
data/scripts/stats.bash
CHANGED
@@ -12,7 +12,7 @@ cd "$DIR"
|
|
12
12
|
miga date > "$DATASET.start"
|
13
13
|
|
14
14
|
# Calculate statistics
|
15
|
-
for i in raw_reads trimmed_fasta assembly cds essential_genes distances ; do
|
15
|
+
for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
|
16
16
|
echo "# $i"
|
17
17
|
miga result_stats --compute-and-save -P "$PROJECT" -D "$DATASET" -r $i
|
18
18
|
done
|
data/scripts/trimmed_fasta.bash
CHANGED
@@ -13,9 +13,11 @@ miga date > "$DATASET.start"
|
|
13
13
|
|
14
14
|
# Gunzip (if necessary)
|
15
15
|
for sis in 1 2 ; do
|
16
|
-
|
17
|
-
|
18
|
-
&&
|
16
|
+
for ext in clipped clipped.single ; do
|
17
|
+
[[ -e "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz" \
|
18
|
+
&& ! -e "../02.trimmed_reads/$b.$sis.${ext}.fastq" ]] \
|
19
|
+
&& gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
|
20
|
+
done
|
19
21
|
done
|
20
22
|
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
21
23
|
|
data/utils/distance/runner.rb
CHANGED
@@ -23,6 +23,9 @@ class MiGA::DistanceRunner
|
|
23
23
|
@dataset = project.dataset(dataset_name)
|
24
24
|
@home = File.expand_path('data/09.distances', project.path)
|
25
25
|
# Default opts
|
26
|
+
if project.metadata[:aai_save_rbm] == false
|
27
|
+
@opts[:aai_save_rbm] ||= 'no-save-rbm'
|
28
|
+
end
|
26
29
|
@opts[:aai_save_rbm] ||= ENV.fetch('MIGA_AAI_SAVE_RBM') do
|
27
30
|
project.is_clade? ? 'save-rbm' : 'no-save-rbm'
|
28
31
|
end
|
data/utils/distance/temporal.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
|
2
2
|
require 'tmpdir'
|
3
|
+
require 'zlib'
|
3
4
|
|
4
5
|
module MiGA::DistanceRunner::Temporal
|
5
6
|
|
@@ -9,7 +10,15 @@ module MiGA::DistanceRunner::Temporal
|
|
9
10
|
rf.each do |res, file|
|
10
11
|
r = dataset.result(res)
|
11
12
|
f = r.nil? ? nil : r.file_path(file)
|
12
|
-
|
13
|
+
unless f.nil?
|
14
|
+
if f =~ /\.gz/
|
15
|
+
File.open(tmp_file("#{file}.fa"), 'w') do |ofh|
|
16
|
+
Zlib::GzipReader.open(f) { |ifh| ofh.print ifh.read }
|
17
|
+
end
|
18
|
+
else
|
19
|
+
FileUtils.cp(f, tmp_file("#{file}.fa"))
|
20
|
+
end
|
21
|
+
end
|
13
22
|
end
|
14
23
|
end
|
15
24
|
|
@@ -610,6 +610,11 @@
|
|
610
610
|
"opt": "--defline",
|
611
611
|
"description": "Keep the original defline after a space."
|
612
612
|
},
|
613
|
+
{
|
614
|
+
"opt": "--list",
|
615
|
+
"arg": "in_file",
|
616
|
+
"description": "Reads a list of IDS."
|
617
|
+
},
|
613
618
|
{
|
614
619
|
"opt": "--quiet",
|
615
620
|
"description": "Run quietly (no STDERR output)."
|
@@ -388,6 +388,13 @@
|
|
388
388
|
"arg": "out_file",
|
389
389
|
"description": "Output file containing the aligned proteins."
|
390
390
|
},
|
391
|
+
{
|
392
|
+
"opt": "--components",
|
393
|
+
"arg": "out_file",
|
394
|
+
"description": ["Output file containing the components of the",
|
395
|
+
"estimation. Tab-delimited file with model name, matches, and",
|
396
|
+
"columns."]
|
397
|
+
},
|
391
398
|
{
|
392
399
|
"opt": "--quiet",
|
393
400
|
"description": "Run quietly (no STDERR output)."
|
@@ -2,46 +2,46 @@
|
|
2
2
|
|
3
3
|
#
|
4
4
|
# @author: Luis M. Rodriguez-R
|
5
|
-
# @update: Feb-06-2015
|
6
5
|
# @license: artistic license 2.0
|
7
6
|
#
|
8
7
|
|
9
8
|
require 'optparse'
|
10
9
|
|
11
|
-
o = {:
|
12
|
-
ARGV << '-h' if ARGV.size==0
|
10
|
+
o = { sbj: false, q: false }
|
11
|
+
ARGV << '-h' if ARGV.size == 0
|
13
12
|
OptionParser.new do |opts|
|
14
|
-
|
15
|
-
Appends an extra column to a BLAST with the length of the query or the subject
|
16
|
-
You can pipe two instances to add both:
|
17
|
-
|
13
|
+
opts.banner = "
|
14
|
+
Appends an extra column to a BLAST with the length of the query or the subject
|
15
|
+
sequence. You can pipe two instances to add both:
|
16
|
+
cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
|
18
17
|
|
19
18
|
Usage: #{$0} [options] < input.blast > output.blast"
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
19
|
+
opts.separator ''
|
20
|
+
opts.separator 'Mandatory'
|
21
|
+
opts.on('-f', '--fasta FILE', 'Path to the FastA file'){ |v| o[:fasta] = v }
|
22
|
+
opts.separator ''
|
23
|
+
opts.separator 'Options'
|
24
|
+
opts.on('-s', '--subject',
|
25
|
+
'Use the subject column of the BLAST, by default the query column is used'
|
26
|
+
){ o[:sbj] = true }
|
27
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
|
28
|
+
opts.on('-h', '--help', 'Display this screen') do
|
29
|
+
puts opts
|
30
|
+
exit
|
31
|
+
end
|
32
|
+
opts.separator ''
|
33
33
|
end.parse!
|
34
|
-
abort
|
34
|
+
abort '-f is mandatory' if o[:fasta].nil?
|
35
35
|
|
36
36
|
len = {}
|
37
|
-
id =
|
38
|
-
$stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:
|
39
|
-
fh = File.open(o[:fasta],
|
37
|
+
id = ''
|
38
|
+
$stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:q]
|
39
|
+
fh = File.open(o[:fasta], 'r')
|
40
40
|
fh.each_line do |ln|
|
41
41
|
defline = /^>(\S+)/.match(ln)
|
42
42
|
if defline.nil?
|
43
43
|
ln.gsub! /[^A-Za-z]/, ''
|
44
|
-
abort
|
44
|
+
abort 'Error: Unsupported format, expecting FastA' if len[id].nil?
|
45
45
|
len[id] = len[id] + ln.size
|
46
46
|
else
|
47
47
|
id = defline[1]
|
@@ -50,12 +50,14 @@ fh.each_line do |ln|
|
|
50
50
|
end
|
51
51
|
fh.close
|
52
52
|
|
53
|
-
|
53
|
+
unless o[:q]
|
54
|
+
$stderr.puts 'Appending %s length column' % (o[:sbj] ? 'subject' : 'query')
|
55
|
+
end
|
54
56
|
ARGF.each_line do |ln|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
57
|
+
ln.chomp!
|
58
|
+
row = ln.split /\t/
|
59
|
+
id = o[:sbj] ? row[1] : row[0]
|
60
|
+
abort "Impossible to find sequence of #{id}" if len[id].nil?
|
61
|
+
puts "#{ln}\t#{len[id]}"
|
60
62
|
end
|
61
63
|
|
@@ -1,64 +1,65 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
3
|
# @author Luis M. Rodriguez-R
|
5
|
-
# @update Oct-07-2015
|
6
4
|
# @license artistic license 2.0
|
7
|
-
#
|
8
5
|
|
9
6
|
require 'optparse'
|
10
7
|
|
11
|
-
o = {:
|
8
|
+
o = {q: false, p: '', s: '', d: false}
|
12
9
|
ARGV << '-h' if ARGV.size==0
|
13
10
|
OptionParser.new do |opts|
|
14
|
-
|
11
|
+
opts.banner = "
|
15
12
|
Generates easy-to-parse tagged reads from FastA files.
|
16
13
|
|
17
14
|
Usage: #{$0} [options]"
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
15
|
+
opts.separator ''
|
16
|
+
opts.separator 'Mandatory'
|
17
|
+
opts.on('-i', '--in FILE',
|
18
|
+
'Path to the FastA file containing the sequences.'){ |v| o[:in] = v }
|
19
|
+
opts.on('-o', '--out FILE',
|
20
|
+
'Path to the FastA to create.'){ |v| o[:out] = v }
|
21
|
+
opts.separator ''
|
22
|
+
opts.separator 'ID options'
|
23
|
+
opts.on('-p', '--prefix STR', 'Prefix to use in all IDs.'){ |v| o[:p] = v }
|
24
|
+
opts.on('-s', '--suffix STR', 'Suffix to use in all IDs.'){ |v| o[:s] = v }
|
25
|
+
opts.on('-d', '--defline',
|
26
|
+
'Keep the original defline after a space.'){ o[:d] = true }
|
27
|
+
opts.on('-l', '--list FILE',
|
28
|
+
'Reads a list of IDS.'){ |v| o[:l] = v }
|
29
|
+
opts.separator ''
|
30
|
+
opts.separator 'Other Options'
|
31
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
|
32
|
+
opts.on('-h', '--help', 'Display this screen') do
|
33
|
+
puts opts
|
34
|
+
exit
|
35
|
+
end
|
36
|
+
opts.separator ''
|
38
37
|
end.parse!
|
39
|
-
abort
|
40
|
-
abort
|
38
|
+
abort '-i is mandatory' if o[:in].nil?
|
39
|
+
abort '-o is mandatory' if o[:out].nil?
|
41
40
|
|
42
41
|
begin
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
42
|
+
list = o[:l].nil? ? nil :
|
43
|
+
File.readlines(o[:l]).map{ |i| i.chomp.gsub(/^>/, '') }
|
44
|
+
ofh = File.open(o[:out], 'w')
|
45
|
+
i = 0
|
46
|
+
File.open(o[:in], 'r') do |ifh|
|
47
|
+
ifh.each do |ln|
|
47
48
|
ln.chomp!
|
48
49
|
next if ln =~ /^;/
|
49
50
|
unless /^>/.match(ln).nil?
|
50
|
-
|
51
|
-
|
51
|
+
i += 1
|
52
|
+
new_id = o[:l].nil? ? i : list.shift
|
53
|
+
ofh.puts ">#{o[:p]}#{new_id}#{o[:s]}#{o[:d]?" #{ln[1, ln.size-1]}":''}"
|
52
54
|
else
|
53
|
-
|
55
|
+
ofh.puts ln
|
54
56
|
end
|
55
|
-
|
56
|
-
|
57
|
-
|
57
|
+
end
|
58
|
+
end
|
59
|
+
ofh.close
|
58
60
|
rescue => err
|
59
|
-
|
60
|
-
|
61
|
-
|
61
|
+
$stderr.puts "Exception: #{err}\n\n"
|
62
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
63
|
+
err
|
62
64
|
end
|
63
65
|
|
64
|
-
|
@@ -8,8 +8,10 @@ require 'enveomics_rb/enveomics'
|
|
8
8
|
use 'tmpdir'
|
9
9
|
use 'zlib'
|
10
10
|
|
11
|
-
o = {
|
12
|
-
|
11
|
+
o = {
|
12
|
+
bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
|
13
|
+
archaea: false, genomeeq: false, metagenome: false, list: false
|
14
|
+
}
|
13
15
|
OptionParser.new do |opts|
|
14
16
|
opts.banner = "
|
15
17
|
Finds and extracts a collection of essential proteins suitable for genome
|
@@ -26,65 +28,86 @@ Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
|
|
26
28
|
Usage: #{$0} [options]"
|
27
29
|
opts.separator ''
|
28
30
|
opts.separator 'Mandatory'
|
29
|
-
opts.on(
|
30
|
-
'
|
31
|
-
)
|
31
|
+
opts.on(
|
32
|
+
'-i', '--in FILE',
|
33
|
+
'Path to the FastA file (.gz allowed) with all the proteins in a genome'
|
34
|
+
) { |v| o[:in] = v }
|
32
35
|
opts.separator ''
|
33
36
|
opts.separator 'Report Options'
|
34
|
-
opts.on(
|
35
|
-
'
|
36
|
-
'
|
37
|
-
|
37
|
+
opts.on(
|
38
|
+
'-o', '--out FILE',
|
39
|
+
'Path to the output FastA file with the translated essential genes',
|
40
|
+
'By default the file is not produced'
|
41
|
+
) { |v| o[:out] = v }
|
42
|
+
opts.on(
|
43
|
+
'-m', '--per-model STR',
|
38
44
|
'Prefix of translated genes in independent files with the name of the',
|
39
|
-
'model appended. By default files are not produced
|
40
|
-
|
41
|
-
opts.on(
|
42
|
-
'
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
45
|
+
'model appended. By default files are not produced'
|
46
|
+
) { |v| o[:permodel] = v }
|
47
|
+
opts.on(
|
48
|
+
'-R', '--report FILE',
|
49
|
+
'Path to the report file. By default, the report is sent to the STDOUT'
|
50
|
+
) { |v| o[:report] = v }
|
51
|
+
opts.on(
|
52
|
+
'--hmm-out FILE',
|
53
|
+
'Save HMMsearch output in this file. By default, not saved'
|
54
|
+
) { |v| o[:hmmout] = v }
|
55
|
+
opts.on(
|
56
|
+
'--alignments FILE',
|
48
57
|
'Save the aligned proteins in this file. By default, not saved'
|
49
|
-
|
50
|
-
opts.on(
|
51
|
-
'
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
58
|
+
) { |v| o[:alignments] = v }
|
59
|
+
opts.on(
|
60
|
+
'-B', '--bacteria',
|
61
|
+
'If set, ignores models typically missing in Bacteria'
|
62
|
+
) { |v| o[:bacteria] = v }
|
63
|
+
opts.on(
|
64
|
+
'-A', '--archaea',
|
65
|
+
'If set, ignores models typically missing in Archaea'
|
66
|
+
) { |v| o[:archaea] = v }
|
67
|
+
opts.on(
|
68
|
+
'-G', '--genome-eq',
|
69
|
+
'If set, ignores models not suitable for genome-equivalents estimations',
|
70
|
+
'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
|
71
|
+
) { |v| o[:genomeeq] = v }
|
72
|
+
opts.on(
|
73
|
+
'-r', '--rename STR',
|
61
74
|
'If set, renames the sequences with the string provided and appends it',
|
62
|
-
'with pipe and the gene name (except in --per-model files)
|
63
|
-
|
64
|
-
opts.on(
|
65
|
-
'
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
75
|
+
'with pipe and the gene name (except in --per-model files)'
|
76
|
+
) { |v| o[:rename] = v }
|
77
|
+
opts.on(
|
78
|
+
'-n', '--no-stats',
|
79
|
+
'If set, no statistics are reported on genome evaluation'
|
80
|
+
) { |v| o[:stats] = v }
|
81
|
+
opts.on(
|
82
|
+
'-s', '--no-genes',
|
83
|
+
'If set, statistics won\'t include the lists of missing/multi-copy genes'
|
84
|
+
) { |v| o[:genes] = v }
|
85
|
+
opts.on(
|
86
|
+
'-M', '--metagenome',
|
71
87
|
'If set, it allows for multiple copies of each gene and turns on',
|
72
|
-
'metagenomic report mode
|
88
|
+
'metagenomic report mode'
|
89
|
+
) { |v| o[:metagenome] = v }
|
73
90
|
opts.separator ''
|
74
91
|
opts.separator 'Other Options'
|
75
|
-
opts.on(
|
92
|
+
opts.on(
|
93
|
+
'-L', '--list-models',
|
76
94
|
'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
|
77
|
-
'and -q; ignores all other parameters
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
opts.on(
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
opts.on(
|
95
|
+
'and -q; ignores all other parameters'
|
96
|
+
) { |v| o[:list] = v }
|
97
|
+
opts.on(
|
98
|
+
'-b', '--bin DIR',
|
99
|
+
'Path to the directory containing the binaries of HMMer 3.0+'
|
100
|
+
) { |v| o[:bin] = v }
|
101
|
+
opts.on(
|
102
|
+
'--model-file',
|
103
|
+
'External file containing models to search'
|
104
|
+
) { |v| o[:model_file] = v }
|
105
|
+
opts.on(
|
106
|
+
'-t', '--threads INT', Integer,
|
107
|
+
"Number of parallel threads to be used. By default: #{o[:thr]}"
|
108
|
+
) { |v| o[:thr] = v }
|
109
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
|
110
|
+
opts.on('-h', '--help', 'Display this screen') do
|
88
111
|
puts opts
|
89
112
|
exit
|
90
113
|
end
|
@@ -112,6 +135,13 @@ TIGR00389 TIGR00436 tRNA-synth_1d}
|
|
112
135
|
begin
|
113
136
|
Dir.mktmpdir do |dir|
|
114
137
|
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
|
138
|
+
if o[:in] =~ /\.gz/
|
139
|
+
tmp_in = File.expand_path('sequences.fa', dir)
|
140
|
+
Zlib::GzipReader.open(o[:in]) do |ifh|
|
141
|
+
File.open(tmp_in, 'w') { |ofh| ofh.print ifh.read }
|
142
|
+
end
|
143
|
+
o[:in] = tmp_in
|
144
|
+
end
|
115
145
|
|
116
146
|
# Create database.
|
117
147
|
$stderr.puts 'Searching models.' unless o[:q]
|
@@ -144,9 +174,9 @@ begin
|
|
144
174
|
'This script requires HMMER 3.0+.'
|
145
175
|
end
|
146
176
|
o[:hmmout] ||= "#{dir}/hmmsearch"
|
147
|
-
`
|
148
|
-
-A
|
149
|
-
> #{dir}/hmmsearch.log`
|
177
|
+
`'#{o[:bin]}hmmsearch' --cpu #{o[:thr]} --tblout '#{o[:hmmout]}' \
|
178
|
+
-A '#{dir}/a.sto' --cut_tc --notextw '#{dir}/essential.hmm' '#{o[:in]}' \
|
179
|
+
> '#{dir}/hmmsearch.log'`
|
150
180
|
|
151
181
|
# Parse output
|
152
182
|
$stderr.puts 'Parsing results.' unless o[:q]
|