miga-base 0.4.3.0 → 0.5.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
data/lib/miga/daemon.rb
CHANGED
@@ -35,13 +35,16 @@ class MiGA::Daemon < MiGA::MiGA
|
|
35
35
|
|
36
36
|
##
|
37
37
|
# Initialize an unactive daemon for the MiGA::Project +project+. See #daemon
|
38
|
-
# to wake the daemon.
|
39
|
-
|
38
|
+
# to wake the daemon. If passed, +json+ must be the path to a daemon
|
39
|
+
# definition in json format. Otherwise, the project-stored daemon definition
|
40
|
+
# is used. In either case, missing variables are used as defined in
|
41
|
+
# ~/.miga_daemon.json.
|
42
|
+
def initialize(project, json = nil)
|
40
43
|
$_MIGA_DAEMON_LAIR << self
|
41
44
|
@project = project
|
45
|
+
json ||= File.expand_path('daemon/daemon.json', project.path)
|
42
46
|
@runopts = MiGA::Json.parse(
|
43
|
-
File.expand_path('
|
44
|
-
default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
|
47
|
+
json, default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
|
45
48
|
@jobs_to_run = []
|
46
49
|
@jobs_running = []
|
47
50
|
@loop_i = -1
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -35,14 +35,14 @@ module MiGA::Dataset::Base
|
|
35
35
|
##
|
36
36
|
# Supported dataset types.
|
37
37
|
@@KNOWN_TYPES = {
|
38
|
-
genome: {description: 'The genome from an isolate
|
39
|
-
scgenome: {description: 'A Single-cell
|
38
|
+
genome: {description: 'The genome from an isolate', multi: false},
|
39
|
+
scgenome: {description: 'A Single-cell Amplified Genome (SAG)',
|
40
40
|
multi: false},
|
41
|
-
popgenome: {description: 'A Metagenome-Assembled Genome (MAG)
|
41
|
+
popgenome: {description: 'A Metagenome-Assembled Genome (MAG)',
|
42
42
|
:multi=>false},
|
43
|
-
metagenome: {description: 'A metagenome (excluding viromes)
|
43
|
+
metagenome: {description: 'A metagenome (excluding viromes)',
|
44
44
|
multi: true},
|
45
|
-
virome: {description: 'A viral metagenome
|
45
|
+
virome: {description: 'A viral metagenome', multi: true}
|
46
46
|
}
|
47
47
|
|
48
48
|
##
|
data/lib/miga/project/base.rb
CHANGED
@@ -76,13 +76,13 @@ module MiGA::Project::Base
|
|
76
76
|
# Supported types of projects.
|
77
77
|
@@KNOWN_TYPES = {
|
78
78
|
mixed: {
|
79
|
-
description: "Mixed collection of genomes, metagenomes, and viromes
|
79
|
+
description: "Mixed collection of genomes, metagenomes, and viromes",
|
80
80
|
single: true, multi: true},
|
81
|
-
genomes: {description: "Collection of genomes
|
81
|
+
genomes: {description: "Collection of genomes",
|
82
82
|
single: true, multi: false},
|
83
|
-
clade: {description: "Collection of closely-related genomes (ANI >= 90%)
|
83
|
+
clade: {description: "Collection of closely-related genomes (ANI >= 90%)",
|
84
84
|
single: true, multi: false},
|
85
|
-
metagenomes: {description: "Collection of metagenomes and/or viromes
|
85
|
+
metagenomes: {description: "Collection of metagenomes and/or viromes",
|
86
86
|
single: false, multi: true}
|
87
87
|
}
|
88
88
|
|
data/lib/miga/project/result.rb
CHANGED
@@ -110,7 +110,8 @@ module MiGA::Project::Result
|
|
110
110
|
r.add_file(:proposal, 'miga-project.proposed-clades')
|
111
111
|
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
|
112
112
|
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
|
113
|
-
r.add_file(:
|
113
|
+
r.add_file(:clades_gsp, 'miga-project.gsp-clades')
|
114
|
+
r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
|
114
115
|
r
|
115
116
|
end
|
116
117
|
|
@@ -35,9 +35,9 @@ module MiGA::RemoteDataset::Base
|
|
35
35
|
@@UNIVERSE = {
|
36
36
|
web: {
|
37
37
|
dbs: {
|
38
|
-
assembly: {stage: :assembly, format: :fasta},
|
39
|
-
assembly_gz: {stage: :assembly, format: :fasta_gz},
|
40
|
-
text: {stage: :metadata, format: :text}
|
38
|
+
assembly: { stage: :assembly, format: :fasta },
|
39
|
+
assembly_gz: { stage: :assembly, format: :fasta_gz },
|
40
|
+
text: { stage: :metadata, format: :text }
|
41
41
|
},
|
42
42
|
url: '%2$s',
|
43
43
|
method: :net
|
@@ -59,8 +59,8 @@ module MiGA::RemoteDataset::Base
|
|
59
59
|
},
|
60
60
|
ncbi_map: {
|
61
61
|
dbs: {
|
62
|
-
nuccore: {stage: :metadata, map_to: [:biosample, :assembly],
|
63
|
-
format: :json},
|
62
|
+
nuccore: { stage: :metadata, map_to: [:biosample, :assembly],
|
63
|
+
format: :json },
|
64
64
|
biosample: {stage: :metadata, map_to: [:assembly], format: :json}
|
65
65
|
},
|
66
66
|
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
|
data/lib/miga/version.rb
CHANGED
@@ -10,15 +10,15 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.
|
13
|
+
VERSION = [0.5, 0, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
17
|
-
VERSION_NAME = '
|
17
|
+
VERSION_NAME = 'collotype'
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
21
|
-
VERSION_DATE = Date.new(2019,
|
21
|
+
VERSION_DATE = Date.new(2019, 11, 25)
|
22
22
|
|
23
23
|
##
|
24
24
|
# Reference of MiGA.
|
data/scripts/cds.bash
CHANGED
@@ -36,7 +36,9 @@ perl -pe 's/ID=([0-9]+_[0-9]+);/ID=gene_$1;/' "$DATASET.gff3" \
|
|
36
36
|
mv "$DATASET.gff3.t" "$DATASET.gff3"
|
37
37
|
|
38
38
|
# Gzip
|
39
|
-
|
39
|
+
for ext in gff3 faa fna ; do
|
40
|
+
[[ -e "$DATASET.$ext" ]] && gzip -9 -f "$DATASET.$ext"
|
41
|
+
done
|
40
42
|
|
41
43
|
# Finalize
|
42
44
|
miga date > "$DATASET.done"
|
data/scripts/stats.bash
CHANGED
@@ -12,7 +12,7 @@ cd "$DIR"
|
|
12
12
|
miga date > "$DATASET.start"
|
13
13
|
|
14
14
|
# Calculate statistics
|
15
|
-
for i in raw_reads trimmed_fasta assembly cds essential_genes distances ; do
|
15
|
+
for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
|
16
16
|
echo "# $i"
|
17
17
|
miga result_stats --compute-and-save -P "$PROJECT" -D "$DATASET" -r $i
|
18
18
|
done
|
data/scripts/trimmed_fasta.bash
CHANGED
@@ -13,9 +13,11 @@ miga date > "$DATASET.start"
|
|
13
13
|
|
14
14
|
# Gunzip (if necessary)
|
15
15
|
for sis in 1 2 ; do
|
16
|
-
|
17
|
-
|
18
|
-
&&
|
16
|
+
for ext in clipped clipped.single ; do
|
17
|
+
[[ -e "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz" \
|
18
|
+
&& ! -e "../02.trimmed_reads/$b.$sis.${ext}.fastq" ]] \
|
19
|
+
&& gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
|
20
|
+
done
|
19
21
|
done
|
20
22
|
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
21
23
|
|
data/utils/distance/runner.rb
CHANGED
@@ -23,6 +23,9 @@ class MiGA::DistanceRunner
|
|
23
23
|
@dataset = project.dataset(dataset_name)
|
24
24
|
@home = File.expand_path('data/09.distances', project.path)
|
25
25
|
# Default opts
|
26
|
+
if project.metadata[:aai_save_rbm] == false
|
27
|
+
@opts[:aai_save_rbm] ||= 'no-save-rbm'
|
28
|
+
end
|
26
29
|
@opts[:aai_save_rbm] ||= ENV.fetch('MIGA_AAI_SAVE_RBM') do
|
27
30
|
project.is_clade? ? 'save-rbm' : 'no-save-rbm'
|
28
31
|
end
|
data/utils/distance/temporal.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
|
2
2
|
require 'tmpdir'
|
3
|
+
require 'zlib'
|
3
4
|
|
4
5
|
module MiGA::DistanceRunner::Temporal
|
5
6
|
|
@@ -9,7 +10,15 @@ module MiGA::DistanceRunner::Temporal
|
|
9
10
|
rf.each do |res, file|
|
10
11
|
r = dataset.result(res)
|
11
12
|
f = r.nil? ? nil : r.file_path(file)
|
12
|
-
|
13
|
+
unless f.nil?
|
14
|
+
if f =~ /\.gz/
|
15
|
+
File.open(tmp_file("#{file}.fa"), 'w') do |ofh|
|
16
|
+
Zlib::GzipReader.open(f) { |ifh| ofh.print ifh.read }
|
17
|
+
end
|
18
|
+
else
|
19
|
+
FileUtils.cp(f, tmp_file("#{file}.fa"))
|
20
|
+
end
|
21
|
+
end
|
13
22
|
end
|
14
23
|
end
|
15
24
|
|
@@ -610,6 +610,11 @@
|
|
610
610
|
"opt": "--defline",
|
611
611
|
"description": "Keep the original defline after a space."
|
612
612
|
},
|
613
|
+
{
|
614
|
+
"opt": "--list",
|
615
|
+
"arg": "in_file",
|
616
|
+
"description": "Reads a list of IDS."
|
617
|
+
},
|
613
618
|
{
|
614
619
|
"opt": "--quiet",
|
615
620
|
"description": "Run quietly (no STDERR output)."
|
@@ -388,6 +388,13 @@
|
|
388
388
|
"arg": "out_file",
|
389
389
|
"description": "Output file containing the aligned proteins."
|
390
390
|
},
|
391
|
+
{
|
392
|
+
"opt": "--components",
|
393
|
+
"arg": "out_file",
|
394
|
+
"description": ["Output file containing the components of the",
|
395
|
+
"estimation. Tab-delimited file with model name, matches, and",
|
396
|
+
"columns."]
|
397
|
+
},
|
391
398
|
{
|
392
399
|
"opt": "--quiet",
|
393
400
|
"description": "Run quietly (no STDERR output)."
|
@@ -2,46 +2,46 @@
|
|
2
2
|
|
3
3
|
#
|
4
4
|
# @author: Luis M. Rodriguez-R
|
5
|
-
# @update: Feb-06-2015
|
6
5
|
# @license: artistic license 2.0
|
7
6
|
#
|
8
7
|
|
9
8
|
require 'optparse'
|
10
9
|
|
11
|
-
o = {:
|
12
|
-
ARGV << '-h' if ARGV.size==0
|
10
|
+
o = { sbj: false, q: false }
|
11
|
+
ARGV << '-h' if ARGV.size == 0
|
13
12
|
OptionParser.new do |opts|
|
14
|
-
|
15
|
-
Appends an extra column to a BLAST with the length of the query or the subject
|
16
|
-
You can pipe two instances to add both:
|
17
|
-
|
13
|
+
opts.banner = "
|
14
|
+
Appends an extra column to a BLAST with the length of the query or the subject
|
15
|
+
sequence. You can pipe two instances to add both:
|
16
|
+
cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
|
18
17
|
|
19
18
|
Usage: #{$0} [options] < input.blast > output.blast"
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
19
|
+
opts.separator ''
|
20
|
+
opts.separator 'Mandatory'
|
21
|
+
opts.on('-f', '--fasta FILE', 'Path to the FastA file'){ |v| o[:fasta] = v }
|
22
|
+
opts.separator ''
|
23
|
+
opts.separator 'Options'
|
24
|
+
opts.on('-s', '--subject',
|
25
|
+
'Use the subject column of the BLAST, by default the query column is used'
|
26
|
+
){ o[:sbj] = true }
|
27
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
|
28
|
+
opts.on('-h', '--help', 'Display this screen') do
|
29
|
+
puts opts
|
30
|
+
exit
|
31
|
+
end
|
32
|
+
opts.separator ''
|
33
33
|
end.parse!
|
34
|
-
abort
|
34
|
+
abort '-f is mandatory' if o[:fasta].nil?
|
35
35
|
|
36
36
|
len = {}
|
37
|
-
id =
|
38
|
-
$stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:
|
39
|
-
fh = File.open(o[:fasta],
|
37
|
+
id = ''
|
38
|
+
$stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:q]
|
39
|
+
fh = File.open(o[:fasta], 'r')
|
40
40
|
fh.each_line do |ln|
|
41
41
|
defline = /^>(\S+)/.match(ln)
|
42
42
|
if defline.nil?
|
43
43
|
ln.gsub! /[^A-Za-z]/, ''
|
44
|
-
abort
|
44
|
+
abort 'Error: Unsupported format, expecting FastA' if len[id].nil?
|
45
45
|
len[id] = len[id] + ln.size
|
46
46
|
else
|
47
47
|
id = defline[1]
|
@@ -50,12 +50,14 @@ fh.each_line do |ln|
|
|
50
50
|
end
|
51
51
|
fh.close
|
52
52
|
|
53
|
-
|
53
|
+
unless o[:q]
|
54
|
+
$stderr.puts 'Appending %s length column' % (o[:sbj] ? 'subject' : 'query')
|
55
|
+
end
|
54
56
|
ARGF.each_line do |ln|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
57
|
+
ln.chomp!
|
58
|
+
row = ln.split /\t/
|
59
|
+
id = o[:sbj] ? row[1] : row[0]
|
60
|
+
abort "Impossible to find sequence of #{id}" if len[id].nil?
|
61
|
+
puts "#{ln}\t#{len[id]}"
|
60
62
|
end
|
61
63
|
|
@@ -1,64 +1,65 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
3
|
# @author Luis M. Rodriguez-R
|
5
|
-
# @update Oct-07-2015
|
6
4
|
# @license artistic license 2.0
|
7
|
-
#
|
8
5
|
|
9
6
|
require 'optparse'
|
10
7
|
|
11
|
-
o = {:
|
8
|
+
o = {q: false, p: '', s: '', d: false}
|
12
9
|
ARGV << '-h' if ARGV.size==0
|
13
10
|
OptionParser.new do |opts|
|
14
|
-
|
11
|
+
opts.banner = "
|
15
12
|
Generates easy-to-parse tagged reads from FastA files.
|
16
13
|
|
17
14
|
Usage: #{$0} [options]"
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
15
|
+
opts.separator ''
|
16
|
+
opts.separator 'Mandatory'
|
17
|
+
opts.on('-i', '--in FILE',
|
18
|
+
'Path to the FastA file containing the sequences.'){ |v| o[:in] = v }
|
19
|
+
opts.on('-o', '--out FILE',
|
20
|
+
'Path to the FastA to create.'){ |v| o[:out] = v }
|
21
|
+
opts.separator ''
|
22
|
+
opts.separator 'ID options'
|
23
|
+
opts.on('-p', '--prefix STR', 'Prefix to use in all IDs.'){ |v| o[:p] = v }
|
24
|
+
opts.on('-s', '--suffix STR', 'Suffix to use in all IDs.'){ |v| o[:s] = v }
|
25
|
+
opts.on('-d', '--defline',
|
26
|
+
'Keep the original defline after a space.'){ o[:d] = true }
|
27
|
+
opts.on('-l', '--list FILE',
|
28
|
+
'Reads a list of IDS.'){ |v| o[:l] = v }
|
29
|
+
opts.separator ''
|
30
|
+
opts.separator 'Other Options'
|
31
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
|
32
|
+
opts.on('-h', '--help', 'Display this screen') do
|
33
|
+
puts opts
|
34
|
+
exit
|
35
|
+
end
|
36
|
+
opts.separator ''
|
38
37
|
end.parse!
|
39
|
-
abort
|
40
|
-
abort
|
38
|
+
abort '-i is mandatory' if o[:in].nil?
|
39
|
+
abort '-o is mandatory' if o[:out].nil?
|
41
40
|
|
42
41
|
begin
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
42
|
+
list = o[:l].nil? ? nil :
|
43
|
+
File.readlines(o[:l]).map{ |i| i.chomp.gsub(/^>/, '') }
|
44
|
+
ofh = File.open(o[:out], 'w')
|
45
|
+
i = 0
|
46
|
+
File.open(o[:in], 'r') do |ifh|
|
47
|
+
ifh.each do |ln|
|
47
48
|
ln.chomp!
|
48
49
|
next if ln =~ /^;/
|
49
50
|
unless /^>/.match(ln).nil?
|
50
|
-
|
51
|
-
|
51
|
+
i += 1
|
52
|
+
new_id = o[:l].nil? ? i : list.shift
|
53
|
+
ofh.puts ">#{o[:p]}#{new_id}#{o[:s]}#{o[:d]?" #{ln[1, ln.size-1]}":''}"
|
52
54
|
else
|
53
|
-
|
55
|
+
ofh.puts ln
|
54
56
|
end
|
55
|
-
|
56
|
-
|
57
|
-
|
57
|
+
end
|
58
|
+
end
|
59
|
+
ofh.close
|
58
60
|
rescue => err
|
59
|
-
|
60
|
-
|
61
|
-
|
61
|
+
$stderr.puts "Exception: #{err}\n\n"
|
62
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
63
|
+
err
|
62
64
|
end
|
63
65
|
|
64
|
-
|
@@ -8,8 +8,10 @@ require 'enveomics_rb/enveomics'
|
|
8
8
|
use 'tmpdir'
|
9
9
|
use 'zlib'
|
10
10
|
|
11
|
-
o = {
|
12
|
-
|
11
|
+
o = {
|
12
|
+
bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
|
13
|
+
archaea: false, genomeeq: false, metagenome: false, list: false
|
14
|
+
}
|
13
15
|
OptionParser.new do |opts|
|
14
16
|
opts.banner = "
|
15
17
|
Finds and extracts a collection of essential proteins suitable for genome
|
@@ -26,65 +28,86 @@ Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
|
|
26
28
|
Usage: #{$0} [options]"
|
27
29
|
opts.separator ''
|
28
30
|
opts.separator 'Mandatory'
|
29
|
-
opts.on(
|
30
|
-
'
|
31
|
-
)
|
31
|
+
opts.on(
|
32
|
+
'-i', '--in FILE',
|
33
|
+
'Path to the FastA file (.gz allowed) with all the proteins in a genome'
|
34
|
+
) { |v| o[:in] = v }
|
32
35
|
opts.separator ''
|
33
36
|
opts.separator 'Report Options'
|
34
|
-
opts.on(
|
35
|
-
'
|
36
|
-
'
|
37
|
-
|
37
|
+
opts.on(
|
38
|
+
'-o', '--out FILE',
|
39
|
+
'Path to the output FastA file with the translated essential genes',
|
40
|
+
'By default the file is not produced'
|
41
|
+
) { |v| o[:out] = v }
|
42
|
+
opts.on(
|
43
|
+
'-m', '--per-model STR',
|
38
44
|
'Prefix of translated genes in independent files with the name of the',
|
39
|
-
'model appended. By default files are not produced
|
40
|
-
|
41
|
-
opts.on(
|
42
|
-
'
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
45
|
+
'model appended. By default files are not produced'
|
46
|
+
) { |v| o[:permodel] = v }
|
47
|
+
opts.on(
|
48
|
+
'-R', '--report FILE',
|
49
|
+
'Path to the report file. By default, the report is sent to the STDOUT'
|
50
|
+
) { |v| o[:report] = v }
|
51
|
+
opts.on(
|
52
|
+
'--hmm-out FILE',
|
53
|
+
'Save HMMsearch output in this file. By default, not saved'
|
54
|
+
) { |v| o[:hmmout] = v }
|
55
|
+
opts.on(
|
56
|
+
'--alignments FILE',
|
48
57
|
'Save the aligned proteins in this file. By default, not saved'
|
49
|
-
|
50
|
-
opts.on(
|
51
|
-
'
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
58
|
+
) { |v| o[:alignments] = v }
|
59
|
+
opts.on(
|
60
|
+
'-B', '--bacteria',
|
61
|
+
'If set, ignores models typically missing in Bacteria'
|
62
|
+
) { |v| o[:bacteria] = v }
|
63
|
+
opts.on(
|
64
|
+
'-A', '--archaea',
|
65
|
+
'If set, ignores models typically missing in Archaea'
|
66
|
+
) { |v| o[:archaea] = v }
|
67
|
+
opts.on(
|
68
|
+
'-G', '--genome-eq',
|
69
|
+
'If set, ignores models not suitable for genome-equivalents estimations',
|
70
|
+
'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
|
71
|
+
) { |v| o[:genomeeq] = v }
|
72
|
+
opts.on(
|
73
|
+
'-r', '--rename STR',
|
61
74
|
'If set, renames the sequences with the string provided and appends it',
|
62
|
-
'with pipe and the gene name (except in --per-model files)
|
63
|
-
|
64
|
-
opts.on(
|
65
|
-
'
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
75
|
+
'with pipe and the gene name (except in --per-model files)'
|
76
|
+
) { |v| o[:rename] = v }
|
77
|
+
opts.on(
|
78
|
+
'-n', '--no-stats',
|
79
|
+
'If set, no statistics are reported on genome evaluation'
|
80
|
+
) { |v| o[:stats] = v }
|
81
|
+
opts.on(
|
82
|
+
'-s', '--no-genes',
|
83
|
+
'If set, statistics won\'t include the lists of missing/multi-copy genes'
|
84
|
+
) { |v| o[:genes] = v }
|
85
|
+
opts.on(
|
86
|
+
'-M', '--metagenome',
|
71
87
|
'If set, it allows for multiple copies of each gene and turns on',
|
72
|
-
'metagenomic report mode
|
88
|
+
'metagenomic report mode'
|
89
|
+
) { |v| o[:metagenome] = v }
|
73
90
|
opts.separator ''
|
74
91
|
opts.separator 'Other Options'
|
75
|
-
opts.on(
|
92
|
+
opts.on(
|
93
|
+
'-L', '--list-models',
|
76
94
|
'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
|
77
|
-
'and -q; ignores all other parameters
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
opts.on(
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
opts.on(
|
95
|
+
'and -q; ignores all other parameters'
|
96
|
+
) { |v| o[:list] = v }
|
97
|
+
opts.on(
|
98
|
+
'-b', '--bin DIR',
|
99
|
+
'Path to the directory containing the binaries of HMMer 3.0+'
|
100
|
+
) { |v| o[:bin] = v }
|
101
|
+
opts.on(
|
102
|
+
'--model-file',
|
103
|
+
'External file containing models to search'
|
104
|
+
) { |v| o[:model_file] = v }
|
105
|
+
opts.on(
|
106
|
+
'-t', '--threads INT', Integer,
|
107
|
+
"Number of parallel threads to be used. By default: #{o[:thr]}"
|
108
|
+
) { |v| o[:thr] = v }
|
109
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
|
110
|
+
opts.on('-h', '--help', 'Display this screen') do
|
88
111
|
puts opts
|
89
112
|
exit
|
90
113
|
end
|
@@ -112,6 +135,13 @@ TIGR00389 TIGR00436 tRNA-synth_1d}
|
|
112
135
|
begin
|
113
136
|
Dir.mktmpdir do |dir|
|
114
137
|
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
|
138
|
+
if o[:in] =~ /\.gz/
|
139
|
+
tmp_in = File.expand_path('sequences.fa', dir)
|
140
|
+
Zlib::GzipReader.open(o[:in]) do |ifh|
|
141
|
+
File.open(tmp_in, 'w') { |ofh| ofh.print ifh.read }
|
142
|
+
end
|
143
|
+
o[:in] = tmp_in
|
144
|
+
end
|
115
145
|
|
116
146
|
# Create database.
|
117
147
|
$stderr.puts 'Searching models.' unless o[:q]
|
@@ -144,9 +174,9 @@ begin
|
|
144
174
|
'This script requires HMMER 3.0+.'
|
145
175
|
end
|
146
176
|
o[:hmmout] ||= "#{dir}/hmmsearch"
|
147
|
-
`
|
148
|
-
-A
|
149
|
-
> #{dir}/hmmsearch.log`
|
177
|
+
`'#{o[:bin]}hmmsearch' --cpu #{o[:thr]} --tblout '#{o[:hmmout]}' \
|
178
|
+
-A '#{dir}/a.sto' --cut_tc --notextw '#{dir}/essential.hmm' '#{o[:in]}' \
|
179
|
+
> '#{dir}/hmmsearch.log'`
|
150
180
|
|
151
181
|
# Parse output
|
152
182
|
$stderr.puts 'Parsing results.' unless o[:q]
|