miga-base 0.3.11.2 → 0.3.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/dataset/result.rb +2 -2
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +14 -14
- data/test/daemon_test.rb +1 -0
- data/utils/enveomics/Manifest/Tasks/other.json +13 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +33 -1
- data/utils/enveomics/Manifest/categories.json +2 -0
- data/utils/enveomics/Manifest/examples.json +4 -4
- data/utils/enveomics/Scripts/HMM.essential.rb +235 -205
- data/utils/enveomics/Scripts/HMM.haai.rb +159 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 48d903a383d237f7b236d8ad1706a5fb017b31d320768353a1bc33846ea0d471
|
4
|
+
data.tar.gz: 9b448f00992aa4152df34ded6a48105afe5c1daf1e994737f253b519f81c998f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1579eecdab3c38bda21678baa4c903c85dfbb07e19f993212d89f6b62e1561d4044b41cfc98d80e43780d1fd172ef607a92d5320ff570de87dafda03520f2d59
|
7
|
+
data.tar.gz: d083fc8ae10735f647681d3924c8990951fc2a6ba7d501c10b89d32b0f00088a5128dfca2b24df118b2f8a56e3a5853d210933e300d7ab7bd3bd14c4bab90b8a
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -226,8 +226,8 @@ module MiGA::Dataset::Result
|
|
226
226
|
def add_result_essential_genes(base, _opts)
|
227
227
|
return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
|
228
228
|
r = MiGA::Result.new("#{base}.json")
|
229
|
-
add_files_to_ds_result(r, name, ess_genes:
|
230
|
-
collection:
|
229
|
+
add_files_to_ds_result(r, name, ess_genes: '.ess.faa',
|
230
|
+
collection: '.ess', report: '.ess/log', alignments: '.ess/proteins.aln')
|
231
231
|
end
|
232
232
|
|
233
233
|
##
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3,
|
13
|
+
VERSION = [0.3, 12, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -18,7 +18,7 @@ module MiGA
|
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
21
|
-
VERSION_DATE = Date.new(2019, 04,
|
21
|
+
VERSION_DATE = Date.new(2019, 04, 26)
|
22
22
|
|
23
23
|
##
|
24
24
|
# Reference of MiGA.
|
@@ -7,31 +7,31 @@ SCRIPT="essential_genes"
|
|
7
7
|
cd "$PROJECT/data/07.annotation/01.function/01.essential"
|
8
8
|
|
9
9
|
# Initialize
|
10
|
-
miga date > "$DATASET.start"
|
11
|
-
FAA="../../../06.cds/$DATASET.faa"
|
10
|
+
miga date > "${DATASET}.start"
|
11
|
+
FAA="../../../06.cds/${DATASET}.faa"
|
12
12
|
|
13
13
|
# Check if there are any proteins
|
14
14
|
if [[ ! -s $FAA ]] ; then
|
15
15
|
echo Empty protein set, bypassing essential genes
|
16
|
-
rm "$DATASET.start"
|
17
|
-
miga
|
18
|
-
-m run_essential_genes=false --update
|
16
|
+
rm "${DATASET}.start"
|
17
|
+
miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
|
19
18
|
exit 0
|
20
19
|
fi
|
21
20
|
|
22
21
|
# Find and extract essential genes
|
23
|
-
[[ -d "$DATASET.ess" ]] && rm -R "$DATASET.ess"
|
24
|
-
mkdir "$DATASET.ess"
|
22
|
+
[[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
|
23
|
+
mkdir "${DATASET}.ess"
|
25
24
|
TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
|
26
25
|
--metadata "type" | awk '{print $2}')
|
27
26
|
if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
|
28
|
-
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
29
|
-
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
30
|
-
> "$DATASET.ess/log"
|
27
|
+
HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
|
28
|
+
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
29
|
+
> "${DATASET}.ess/log"
|
31
30
|
else
|
32
|
-
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
33
|
-
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" \
|
34
|
-
|
31
|
+
HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
|
32
|
+
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
|
33
|
+
--alignments "${DATASET}.ess/proteins.aln" \
|
34
|
+
> "${DATASET}.ess/log"
|
35
35
|
fi
|
36
36
|
|
37
37
|
# Reduce files
|
@@ -42,5 +42,5 @@ if exists "$DATASET".ess/*.faa ; then
|
|
42
42
|
fi
|
43
43
|
|
44
44
|
# Finalize
|
45
|
-
miga date > "$DATASET.done"
|
45
|
+
miga date > "${DATASET}.done"
|
46
46
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
data/test/daemon_test.rb
CHANGED
@@ -37,6 +37,7 @@ class DaemonTest < Test::Unit::TestCase
|
|
37
37
|
File.expand_path("data/01.raw_reads/ds1.1.fastq", p.path))
|
38
38
|
FileUtils.cp(File.expand_path("daemon/daemon.json", p.path),
|
39
39
|
File.expand_path("data/01.raw_reads/ds1.done", p.path))
|
40
|
+
ds.first_preprocessing(true)
|
40
41
|
out = capture_stdout do
|
41
42
|
d.check_datasets
|
42
43
|
end
|
@@ -401,6 +401,19 @@
|
|
401
401
|
"description": ["Path to the report file. By default, the report is",
|
402
402
|
"sent to the STDOUT."]
|
403
403
|
},
|
404
|
+
{
|
405
|
+
"name": "HMMsearch output",
|
406
|
+
"opt": "--hmm-out",
|
407
|
+
"arg": "out_file",
|
408
|
+
"description": ["Save HMMsearch output in this file. By default,",
|
409
|
+
"not saved."]
|
410
|
+
},
|
411
|
+
{
|
412
|
+
"opt": "--alignments",
|
413
|
+
"arg": "out_file",
|
414
|
+
"description": ["Save the aligned proteins in this file. By default,",
|
415
|
+
"not saved."]
|
416
|
+
},
|
404
417
|
{
|
405
418
|
"opt": "--bacteria",
|
406
419
|
"description": "If set, ignores models typically missing in Bacteria."
|
@@ -189,7 +189,7 @@
|
|
189
189
|
"description": ["Calculates the Average Nucleotide Identity between two",
|
190
190
|
"genomes."],
|
191
191
|
"help_arg": "--help",
|
192
|
-
"see_also": ["aai.rb","rbm.rb"],
|
192
|
+
"see_also": ["aai.rb","rbm.rb","HMM.essential.rb"],
|
193
193
|
"cite": [
|
194
194
|
["Konstantinidis & Tiedje, 2005, PNAS",
|
195
195
|
"http://dx.doi.org/10.1073%2Fpnas.0409727102"],
|
@@ -362,6 +362,38 @@
|
|
362
362
|
}
|
363
363
|
]
|
364
364
|
},
|
365
|
+
{
|
366
|
+
"task": "HMM.haai.rb",
|
367
|
+
"description": ["Estimates Average Amino Acid Identity (AAI) from the",
|
368
|
+
"essential genes extracted and aligned by HMM.essential.rb (see",
|
369
|
+
"Alignments)."],
|
370
|
+
"help_arg": "--help",
|
371
|
+
"see_also": ["HMM.essential.rb","aai.rb"],
|
372
|
+
"options": [
|
373
|
+
{
|
374
|
+
"name": "Alignments 1",
|
375
|
+
"opt": "-1",
|
376
|
+
"arg": "in_file",
|
377
|
+
"description": "Input alignments file for genome 1."
|
378
|
+
},
|
379
|
+
{
|
380
|
+
"name": "Alignments 2",
|
381
|
+
"opt": "-2",
|
382
|
+
"arg": "in_file",
|
383
|
+
"description": "Input alignments file for genome 2."
|
384
|
+
},
|
385
|
+
{
|
386
|
+
"name": "Alignment output",
|
387
|
+
"opt": "--aln-out",
|
388
|
+
"arg": "out_file",
|
389
|
+
"description": "Output file containing the aligned proteins."
|
390
|
+
},
|
391
|
+
{
|
392
|
+
"opt": "--quiet",
|
393
|
+
"description": "Run quietly (no STDERR output)."
|
394
|
+
}
|
395
|
+
]
|
396
|
+
},
|
365
397
|
{
|
366
398
|
"task": "rbm.rb",
|
367
399
|
"description": ["Finds the reciprocal best matches between two sets of",
|
@@ -29,6 +29,7 @@
|
|
29
29
|
"Execution": [
|
30
30
|
"aai.rb",
|
31
31
|
"ani.rb",
|
32
|
+
"HMM.haai.rb",
|
32
33
|
"rbm.rb"
|
33
34
|
]
|
34
35
|
},
|
@@ -101,6 +102,7 @@
|
|
101
102
|
],
|
102
103
|
"Search": [
|
103
104
|
"HMM.essential.rb",
|
105
|
+
"HMM.haai.rb",
|
104
106
|
"HMMsearch.extractIds.rb",
|
105
107
|
"ogs.annotate.rb",
|
106
108
|
"ogs.core-pan.rb",
|
@@ -64,15 +64,15 @@
|
|
64
64
|
"task": "HMM.essential.rb",
|
65
65
|
"description": ["Typical single-copy bacterial genes present in",
|
66
66
|
"Mycoplasma genitalium."],
|
67
|
-
"values": ["Mgen_M2288.faa",null,null,null,
|
68
|
-
null,null,null,null,null,null]
|
67
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
|
68
|
+
null,null,null,null,null,null,null,null]
|
69
69
|
},
|
70
70
|
{
|
71
71
|
"task": "HMM.essential.rb",
|
72
72
|
"description": ["Typical single-copy archaeal genes present in",
|
73
73
|
"Nanoarchaeum equitans."],
|
74
|
-
"values": ["Mgen_M2288.faa",null,null,null,null,
|
75
|
-
null,null,null,null,null,null]
|
74
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
|
75
|
+
null,null,null,null,null,null,null,null]
|
76
76
|
},
|
77
77
|
{
|
78
78
|
"task": "Newick.autoprune.R",
|
@@ -1,20 +1,17 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
3
|
# @author Luis M. Rodriguez-R
|
5
4
|
# @license artistic license 2.0
|
6
|
-
# @update Mar-23-2016
|
7
|
-
#
|
8
5
|
|
9
|
-
$:.push File.expand_path(
|
10
|
-
require
|
11
|
-
use
|
12
|
-
use
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
7
|
+
require 'enveomics_rb/enveomics'
|
8
|
+
use 'tmpdir'
|
9
|
+
use 'zlib'
|
13
10
|
|
14
|
-
o = {bin:
|
15
|
-
|
11
|
+
o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
|
12
|
+
archaea: false, genomeeq: false, metagenome: false, list: false}
|
16
13
|
OptionParser.new do |opts|
|
17
|
-
|
14
|
+
opts.banner = "
|
18
15
|
Finds and extracts a collection of essential proteins suitable for genome
|
19
16
|
completeness evaluation and phylogenetic analyses. Important note: most complete
|
20
17
|
bacterial genomes contain only 106/111 genes in this collection, therefore
|
@@ -27,68 +24,74 @@ completeness (e.g., Nanoarchaeum equitans returns 88.5%).
|
|
27
24
|
Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
|
28
25
|
|
29
26
|
Usage: #{$0} [options]"
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
27
|
+
opts.separator ''
|
28
|
+
opts.separator 'Mandatory'
|
29
|
+
opts.on('-i', '--in FILE',
|
30
|
+
'Path to the FastA file containing all the proteins in a genome.'
|
31
|
+
){ |v| o[:in] = v }
|
32
|
+
opts.separator ''
|
33
|
+
opts.separator 'Report Options'
|
34
|
+
opts.on('-o', '--out FILE',
|
35
|
+
'Path to the output FastA file with the translated essential genes.',
|
36
|
+
'By default the file is not produced.'){ |v| o[:out] = v }
|
37
|
+
opts.on('-m', '--per-model STR',
|
38
|
+
'Prefix of translated genes in independent files with the name of the',
|
39
|
+
'model appended. By default files are not produced.'
|
40
|
+
){ |v| o[:permodel] = v }
|
41
|
+
opts.on('-R', '--report FILE',
|
42
|
+
'Path to the report file. By default, the report is sent to the STDOUT.'
|
43
|
+
){ |v| o[:report] = v }
|
44
|
+
opts.on('--hmm-out FILE',
|
45
|
+
'Save HMMsearch output in this file. By default, not saved.'
|
46
|
+
){ |v| o[:hmmout] = v }
|
47
|
+
opts.on('--alignments FILE',
|
48
|
+
'Save the aligned proteins in this file. By default, not saved'
|
49
|
+
){ |v| o[:alignments] = v }
|
50
|
+
opts.on('-B', '--bacteria',
|
51
|
+
'If set, ignores models typically missing in Bacteria.'
|
52
|
+
){ |v| o[:bacteria] = v }
|
53
|
+
opts.on('-A', '--archaea',
|
54
|
+
'If set, ignores models typically missing in Archaea.'
|
55
|
+
){ |v| o[:archaea] = v }
|
56
|
+
opts.on('-G', '--genome-eq',
|
57
|
+
'If set, ignores models not suitable for genome-equivalents estimations.',
|
58
|
+
'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
|
59
|
+
){ |v| o[:genomeeq] = v }
|
60
|
+
opts.on('-r', '--rename STR',
|
61
|
+
'If set, renames the sequences with the string provided and appends it',
|
62
|
+
'with pipe and the gene name (except in --per-model files).'
|
63
|
+
){ |v| o[:rename]=v }
|
64
|
+
opts.on('-n', '--no-stats',
|
65
|
+
'If set, no statistics are reported on genome evaluation.'
|
66
|
+
){ |v| o[:stats] = v }
|
67
|
+
opts.on('-s', '--no-genes',
|
68
|
+
'If set, statistics won\'t include the lists of missing/multi-copy genes.'
|
69
|
+
){ |v| o[:genes] = v }
|
70
|
+
opts.on('-M', '--metagenome',
|
71
|
+
'If set, it allows for multiple copies of each gene and turns on',
|
72
|
+
'metagenomic report mode.'){ |v| o[:metagenome] = v }
|
73
|
+
opts.separator ''
|
74
|
+
opts.separator 'Other Options'
|
75
|
+
opts.on('-L', '--list-models',
|
76
|
+
'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
|
77
|
+
'and -q; ignores all other parameters.'){ |v| o[:list] = v }
|
78
|
+
opts.on('-b', '--bin DIR',
|
79
|
+
'Path to the directory containing the binaries of HMMer 3.0+.'
|
80
|
+
){ |v| o[:bin] = v }
|
81
|
+
# The FILE placeholder makes OptionParser require a value for this switch and
# pass it to the block; without it the switch is a plain flag and the block
# receives +true+ instead of the path to the models file.
opts.on('--model-file FILE',
'External file containing models to search.'){ |v| o[:model_file] = v }
|
83
|
+
opts.on('-t', '--threads INT',
|
84
|
+
"Number of parallel threads to be used. By default: #{o[:thr]}."
|
85
|
+
){ |v| o[:thr] = v.to_i }
|
86
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
|
87
|
+
opts.on('-h', '--help', 'Display this screen.') do
|
88
|
+
puts opts
|
89
|
+
exit
|
90
|
+
end
|
91
|
+
opts.separator ''
|
89
92
|
end.parse!
|
90
|
-
abort
|
91
|
-
o[:bin] = o[:bin]+
|
93
|
+
abort '-i is mandatory' if o[:in].nil? and not o[:list]
|
94
|
+
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
92
95
|
o[:rename] = nil if o[:metagenome]
|
93
96
|
|
94
97
|
not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
|
@@ -107,148 +110,175 @@ not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
|
|
107
110
|
TIGR00389 TIGR00436 tRNA-synth_1d}
|
108
111
|
|
109
112
|
begin
|
110
|
-
|
111
|
-
|
113
|
+
Dir.mktmpdir do |dir|
|
114
|
+
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
|
115
|
+
|
116
|
+
# Create database.
|
117
|
+
$stderr.puts 'Searching models.' unless o[:q]
|
118
|
+
models = {}
|
119
|
+
model_id = nil
|
120
|
+
dbh = File.open("#{dir}/essential.hmm", 'w')
|
121
|
+
o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
|
122
|
+
mfh = (File.extname(o[:model_file]) == '.gz') ?
|
123
|
+
Zlib::GzipReader.open(o[:model_file]) :
|
124
|
+
File.open(o[:model_file], 'r')
|
125
|
+
while ln = mfh.gets
|
126
|
+
dbh.print ln
|
127
|
+
ln.chomp!
|
128
|
+
model_id = $1 if ln =~ /^NAME\s+(.+)/
|
129
|
+
models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
|
130
|
+
end
|
131
|
+
dbh.close
|
132
|
+
mfh.close
|
133
|
+
models.delete_if { |m| not_in_archaea.include? m } if o[:archaea]
|
134
|
+
models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
|
135
|
+
models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
|
136
|
+
if o[:list]
|
137
|
+
models.each_pair{ |id,desc| puts [id,desc].join("\t") }
|
138
|
+
exit
|
139
|
+
end
|
140
|
+
|
141
|
+
# Check HMMer version and run HMMsearch.
|
142
|
+
if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
|
143
|
+
raise 'You have provided an unsupported version of HMMER. ' +
|
144
|
+
'This script requires HMMER 3.0+.'
|
145
|
+
end
|
146
|
+
o[:hmmout] ||= "#{dir}/hmmsearch"
|
147
|
+
`"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
|
148
|
+
-A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
|
149
|
+
> #{dir}/hmmsearch.log`
|
112
150
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
151
|
+
# Parse output
|
152
|
+
$stderr.puts 'Parsing results.' unless o[:q]
|
153
|
+
trash = []
|
154
|
+
genes = {}
|
155
|
+
File.open(o[:hmmout], 'r') do |resh|
|
156
|
+
while ln = resh.gets
|
157
|
+
next if ln =~ /^#/
|
158
|
+
r = ln.split /\s+/
|
159
|
+
next unless models.include? r[2]
|
160
|
+
if o[:metagenome]
|
161
|
+
genes[ r[2] ] = [] if genes[ r[2] ].nil?
|
162
|
+
genes[ r[2] ] << r[0]
|
163
|
+
elsif genes[ r[2] ].nil?
|
164
|
+
genes[ r[2] ] = r[0]
|
165
|
+
else
|
166
|
+
trash << r[2]
|
167
|
+
end
|
128
168
|
end
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
if o[:
|
135
|
-
|
136
|
-
|
169
|
+
end
|
170
|
+
|
171
|
+
# Report statistics
|
172
|
+
if o[:stats]
|
173
|
+
reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
|
174
|
+
if o[:metagenome]
|
175
|
+
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
176
|
+
gc = [0] * (models.size - genes.size) +
|
177
|
+
genes.values.map{ |g| g.length }.sort
|
178
|
+
reph.printf "! Mean number of copies per model: %.3f.\n",
|
179
|
+
gc.inject(:+).to_f / models.size
|
180
|
+
reph.printf "! Median number of copies per model: %.1f.\n",
|
181
|
+
gc.size.even? ? gc[gc.size/2, 2].inject(:+).to_f / 2 : gc[gc.size/2]
|
182
|
+
if o[:genes] and genes.size != models.size
|
183
|
+
reph.printf "! Missing genes: %s\n",
|
184
|
+
([''] + models.keys.select{ |m| not genes.keys.include? m }.
|
185
|
+
map{|m| "#{m}: #{models[m]}."}).join("\n! ")
|
186
|
+
end
|
187
|
+
else
|
188
|
+
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
189
|
+
reph.printf "! Completeness: %.1f%%.\n",
|
190
|
+
100.0 * genes.size / models.size
|
191
|
+
reph.printf "! Contamination: %.1f%%.\n",
|
192
|
+
100.0 * trash.size / models.size
|
193
|
+
if o[:genes]
|
194
|
+
reph.printf "! Multiple copies: %s\n",
|
195
|
+
([''] + trash.uniq.
|
196
|
+
map{ |m| "#{trash.count(m)+1} #{m}: #{models[m]}." }).
|
197
|
+
join("\n! ") unless trash.empty?
|
198
|
+
reph.printf "! Missing genes: %s\n",
|
199
|
+
([''] + models.keys.select{ |m| not genes.keys.include? m }.
|
200
|
+
map{ |m| "#{m}: #{models[m]}." }).
|
201
|
+
join("\n! ") unless genes.size == models.size
|
202
|
+
end
|
137
203
|
end
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
204
|
+
reph.close unless o[:report].nil?
|
205
|
+
end
|
206
|
+
|
207
|
+
# Extract sequences
|
208
|
+
unless o[:out].nil? and o[:permodel].nil?
|
209
|
+
$stderr.puts 'Extracting sequences.' unless o[:q]
|
210
|
+
faah = File.open(o[:in], 'r')
|
211
|
+
outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
|
212
|
+
geneh = nil
|
213
|
+
in_gene = nil
|
214
|
+
unless o[:permodel].nil?
|
215
|
+
genes.keys.each do |m|
|
216
|
+
File.open("#{o[:permodel]}#{m}.faa", 'w').close
|
217
|
+
end
|
143
218
|
end
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
219
|
+
while ln = faah.gets
|
220
|
+
if ln =~ /^>(\S+)/
|
221
|
+
if o[:metagenome]
|
222
|
+
in_gene = genes.keys.
|
223
|
+
map{ |k| genes[k].include?($1) ? k : nil }.compact.first
|
224
|
+
in_gene = [in_gene, $1] unless in_gene.nil?
|
225
|
+
else
|
226
|
+
in_gene = genes.rassoc($1)
|
227
|
+
end
|
228
|
+
next if in_gene.nil?
|
229
|
+
geneh.close unless geneh.nil?
|
230
|
+
geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa", 'a+') unless
|
231
|
+
o[:permodel].nil?
|
232
|
+
outh.print(o[:rename].nil? ?
|
233
|
+
ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
|
234
|
+
geneh.print(o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless
|
235
|
+
geneh.nil?
|
236
|
+
else
|
237
|
+
next if in_gene.nil?
|
238
|
+
outh.print ln unless outh.nil?
|
239
|
+
geneh.print ln unless geneh.nil?
|
240
|
+
end
|
165
241
|
end
|
242
|
+
geneh.close unless geneh.nil?
|
243
|
+
outh.close unless outh.nil?
|
244
|
+
faah.close
|
245
|
+
end
|
166
246
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
if o[:genes]
|
193
|
-
reph.printf "! Multiple copies: %s\n",
|
194
|
-
([""] +
|
195
|
-
trash.uniq.map{|m|
|
196
|
-
"#{trash.count(m)+1} #{m}: #{models[m]}."}
|
197
|
-
).join("\n! ") unless trash.empty?
|
198
|
-
reph.printf "! Missing genes: %s\n",
|
199
|
-
([""] +
|
200
|
-
models.keys.select{|m| not genes.keys.include? m
|
201
|
-
}.map{|m| "#{m}: #{models[m]}."}
|
202
|
-
).join("\n! ") unless genes.size==models.size
|
203
|
-
end
|
204
|
-
end
|
205
|
-
reph.close unless o[:report].nil?
|
247
|
+
unless o[:alignments].nil?
|
248
|
+
aln = {}
|
249
|
+
File.open("#{dir}/a.sto", 'r') do |fh|
|
250
|
+
cur_model = nil
|
251
|
+
mask = []
|
252
|
+
fh.each_line do |ln|
|
253
|
+
case ln.chomp
|
254
|
+
when /^# STOCKHOLM/
|
255
|
+
cur_model = nil
|
256
|
+
mask = []
|
257
|
+
when /^#=GS (\S+)\/([\d\-]+)\s+DE/
|
258
|
+
cur_model ||= genes.rassoc($1).first
|
259
|
+
aln[ cur_model ] ||= [ "# #{cur_model} : #{$1} : #{$2}" ]
|
260
|
+
when /^#=GC RF\s+(\S+)/
|
261
|
+
aln[ cur_model ][ 1 ] ||= $1.upcase.tap do |i|
|
262
|
+
mask.each{ |d| i[d] = '' }
|
263
|
+
end
|
264
|
+
when /^[^#]\S*\s+(\S+)/
|
265
|
+
next if aln[ cur_model ][ 2 ]
|
266
|
+
aln[ cur_model ][ 2 ] = $1.upcase
|
267
|
+
mask = aln[ cur_model ][ 2 ].split('').each_with_index.
|
268
|
+
map{ |v, k| v == '.' ? k : nil }.compact.reverse
|
269
|
+
aln[ cur_model ][ 2 ].delete!('.') unless mask.empty?
|
270
|
+
end
|
271
|
+
end
|
206
272
|
end
|
207
|
-
|
208
|
-
|
209
|
-
unless o[:out].nil? and o[:permodel].nil?
|
210
|
-
$stderr.puts "Extracting sequences." unless o[:q]
|
211
|
-
faah = File.open(o[:in], "r")
|
212
|
-
outh = o[:out].nil? ? nil : File.open(o[:out], "w")
|
213
|
-
geneh = nil
|
214
|
-
in_gene = nil
|
215
|
-
unless o[:permodel].nil?
|
216
|
-
genes.keys.each do |m|
|
217
|
-
File.open("#{o[:permodel]}#{m}.faa", "w").close
|
218
|
-
end
|
219
|
-
end
|
220
|
-
while ln = faah.gets
|
221
|
-
if ln =~ /^>(\S+)/
|
222
|
-
if o[:metagenome]
|
223
|
-
in_gene = genes.keys.map{|k| genes[k].include?($1) ? k : nil
|
224
|
-
}.compact.first
|
225
|
-
in_gene = [in_gene, $1] unless in_gene.nil?
|
226
|
-
else
|
227
|
-
in_gene = genes.rassoc($1)
|
228
|
-
end
|
229
|
-
next if in_gene.nil?
|
230
|
-
geneh.close unless geneh.nil?
|
231
|
-
geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa",
|
232
|
-
"a+") unless o[:permodel].nil?
|
233
|
-
outh.print(o[:rename].nil? ?
|
234
|
-
ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
|
235
|
-
geneh.print(
|
236
|
-
o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless geneh.nil?
|
237
|
-
else
|
238
|
-
next if in_gene.nil?
|
239
|
-
outh.print ln unless outh.nil?
|
240
|
-
geneh.print ln unless geneh.nil?
|
241
|
-
end
|
242
|
-
end
|
243
|
-
geneh.close unless geneh.nil?
|
244
|
-
outh.close unless outh.nil?
|
245
|
-
faah.close
|
273
|
+
File.open(o[:alignments], 'w') do |fh|
|
274
|
+
aln.each { |k, v| v.each{ |i| fh.puts i } }
|
246
275
|
end
|
276
|
+
end
|
247
277
|
|
248
|
-
|
249
|
-
|
278
|
+
$stderr.puts 'Done.' unless o[:q]
|
279
|
+
end # |dir|
|
250
280
|
rescue => err
|
251
|
-
|
252
|
-
|
253
|
-
|
281
|
+
$stderr.puts "Exception: #{err}\n\n"
|
282
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
283
|
+
err
|
254
284
|
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
require 'optparse'
|
7
|
+
|
8
|
+
# Command-line options collected by the parser below; only :q has a default.
o = { q: false }
# When called without any argument, behave as if -h had been passed.
ARGV.push('-h') if ARGV.empty?

cli = OptionParser.new do |parser|
  parser.banner = "
Estimates Average Amino Acid Identity (AAI) from the essential genes extracted
and aligned by HMM.essential.rb (see --alignments).

Usage: #{$0} [options]"
  parser.separator ''
  parser.separator 'Mandatory'
  parser.on('-1 PATH', 'Input alignments file for genome 1.') do |path|
    o[:a] = path
  end
  parser.on('-2 PATH', 'Input alignments file for genome 2.') do |path|
    o[:b] = path
  end
  parser.separator ''
  parser.separator 'Options'
  parser.on('-a', '--aln-out FILE',
    'Output file containing the aligned proteins') do |path|
    o[:alnout] = path
  end
  parser.on('-q', '--quiet', 'Run quietly (no STDERR output).') do
    o[:q] = true
  end
  parser.on('-h', '--help', 'Display this screen.') do
    puts parser
    exit
  end
  parser.separator ''
end
cli.parse!

# Both input files are required.
abort '-1 is mandatory.' if o[:a].nil?
abort '-2 is mandatory.' if o[:b].nil?
|
34
|
+
|
35
|
+
##
# Set of HElement records read from an alignments file, indexed by model ID.
class HList
  attr_accessor :list

  ##
  # Loads +file+, which must consist of consecutive three-line records
  # (defline, model line, protein line). When a model ID appears more than
  # once, the last record replaces the earlier ones.
  #
  # Raises ArgumentError if the final record has fewer than three lines.
  def initialize(file)
    @list = {}
    File.readlines(file).each_slice(3) do |record|
      unless record.size == 3
        raise ArgumentError,
          "Incomplete record in #{file}: expected 3 lines, found #{record.size}"
      end
      element = HElement.new(*record)
      @list[element.model_id] = element
    end
  end

  ##
  # Returns the record stored for +model_id+, or nil if there is none.
  def [](model_id)
    list[model_id]
  end

  ##
  # Returns an array of HAln objects, one per model present in both this list
  # and +other+.
  def align(other)
    list.map do |model_id, element|
      partner = other[model_id]
      element.align(partner) unless partner.nil?
    end.compact
  end

  ##
  # Returns the IDs of all the models in the list.
  def models
    list.keys
  end
end
|
62
|
+
|
63
|
+
##
# One three-line record of an alignments file: a defline followed by the model
# line and the protein line of a single model alignment.
class HElement
  attr_accessor :defline, :model_id, :protein_id, :protein_coords
  attr_accessor :model_aln, :protein_aln

  ##
  # Builds the record from its three raw lines, dropping trailing newlines.
  # +defline+ is expected to look like "# model : protein : coords"; when it
  # does not, model_id, protein_id and protein_coords are left as nil.
  def initialize(defline, model_aln, protein_aln)
    @defline = defline.chomp
    @model_aln = model_aln.chomp
    @protein_aln = protein_aln.chomp
    return unless defline =~ /^# (.+) : (.+) : (.+)/

    @model_id = $1
    @protein_id = $2
    @protein_coords = $3
  end

  ##
  # Returns an independent copy built from the current defline and lines, so
  # that #mask! on the copy does not alter this record.
  def dup
    HElement.new(defline, model_aln, protein_aln)
  end

  ##
  # Returns an HAln object
  def align(other)
    HAln.new(self, other)
  end

  ##
  # Returns the positions of the '.' characters in the model line, in
  # descending order. The result is cached until #mask! modifies the record.
  def mask
    @mask ||= model_aln.each_char.with_index.
      map { |chr, pos| pos if chr == '.' }.compact.reverse
  end

  ##
  # Inserts a gap ('-') into both the model and the protein lines at every
  # position of +template+ that is not already part of #mask, in the order
  # given. A position equal to the current length appends the gap; a position
  # beyond it raises IndexError. Returns the positions that were inserted.
  def mask!(template)
    missing = template - mask
    missing.each do |pos|
      # String#insert also handles pos == length, where indexing returns nil.
      @model_aln.insert(pos, '-')
      @protein_aln.insert(pos, '-')
    end
    # The cached positions no longer describe the modified model line.
    @mask = nil
    missing
  end
end
|
100
|
+
|
101
|
+
##
# Pairwise comparison of the protein lines of two records (+a+ and +b+) that
# belong to the same model.
class HAln
  attr_reader :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id

  ##
  # Builds the pair from two records responding to +dup+, +mask+, +mask!+,
  # +protein_aln+, +model_id+, +protein_id+ and +protein_coords+. The inputs
  # are not modified; masking is applied to copies.
  def initialize(a, b)
    a_masked = a.dup
    a_masked.mask! b.mask.reverse
    b_masked = b.dup
    # NOTE(review): masking a record with its own mask presumably inserts
    # nothing; this may have been meant to use a's mask -- confirm.
    b_masked.mask! b_masked.mask
    @protein_1 = a_masked.protein_aln
    @protein_2 = b_masked.protein_aln
    @model_id = a.model_id
    @protein_1_id = a.protein_id + '/' + a.protein_coords
    @protein_2_id = b.protein_id + '/' + b.protein_coords
  end

  ##
  # Returns a hash with the number of compared columns (:len), columns with a
  # gap in exactly one protein (:gaps), identical columns (:matches), and the
  # percent identity (:id = 100 * matches / len). Columns that are gaps in both
  # proteins are skipped. The hash is computed once and then reused.
  def stats
    return @stats if @stats

    counts = { len: 0, gaps: 0, matches: 0 }
    protein_1.each_char.with_index do |res_1, pos|
      res_2 = protein_2[pos]
      next if res_1 == '-' && res_2 == '-'

      counts[:len] += 1
      if res_1 == res_2
        counts[:matches] += 1
      elsif res_1 == '-' || res_2 == '-'
        counts[:gaps] += 1
      end
    end
    counts[:id] = 100.0 * counts[:matches] / counts[:len]
    @stats = counts
  end

  ##
  # Returns the statistics as space-separated "key:value" pairs.
  def stats_to_s
    stats.map { |k, v| "#{k}:#{v}" }.join ' '
  end

  ##
  # Returns a three-line text block: a header with the model, both protein
  # identifiers and the statistics, followed by the two protein lines.
  def to_s
    "# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}\n" +
      protein_1 + "\n" + protein_2 + "\n"
  end
end
|
139
|
+
|
140
|
+
# Load both alignment sets and pair up the models present in both.
hlist1 = HList.new(o[:a])
hlist2 = HList.new(o[:b])
haln_arr = hlist1.align(hlist2)
# With no shared models there is nothing to average (the sums would be nil).
abort 'No common models found between the two alignment files.' if
  haln_arr.empty?

# Mean and standard deviation of the per-model identities, the latter from
# the mean of squares minus the squared mean.
identities = haln_arr.map { |aln| aln.stats[:id] }
avg_identity = identities.inject(:+) / identities.size
avg2_identity = identities.map { |id| id ** 2 }.inject(:+) / identities.size
variance = avg2_identity - avg_identity ** 2
# Floating-point rounding can leave the variance marginally below zero, which
# would make Math.sqrt raise Math::DomainError.
variance = 0.0 if variance < 0
sd_identity = Math.sqrt(variance)
puts "Common models: #{haln_arr.size}"
# Union of the models seen in either input.
puts "All models: #{(hlist1.models | hlist2.models).size}"
puts "Average identity: #{avg_identity.round(2)}%"
puts "SD identity: #{sd_identity.round(2)}"

# Optionally save every pairwise alignment (header plus both protein lines).
if o[:alnout]
  File.open(o[:alnout], 'w') do |fh|
    haln_arr.each { |aln| fh.puts aln }
  end
end
|
159
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -321,6 +321,7 @@ files:
|
|
321
321
|
- utils/enveomics/Scripts/GFF.catsbj.pl
|
322
322
|
- utils/enveomics/Scripts/GenBank.add_fields.rb
|
323
323
|
- utils/enveomics/Scripts/HMM.essential.rb
|
324
|
+
- utils/enveomics/Scripts/HMM.haai.rb
|
324
325
|
- utils/enveomics/Scripts/HMMsearch.extractIds.rb
|
325
326
|
- utils/enveomics/Scripts/JPlace.distances.rb
|
326
327
|
- utils/enveomics/Scripts/JPlace.to_iToL.rb
|