miga-base 0.3.11.2 → 0.3.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/dataset/result.rb +2 -2
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +14 -14
- data/test/daemon_test.rb +1 -0
- data/utils/enveomics/Manifest/Tasks/other.json +13 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +33 -1
- data/utils/enveomics/Manifest/categories.json +2 -0
- data/utils/enveomics/Manifest/examples.json +4 -4
- data/utils/enveomics/Scripts/HMM.essential.rb +235 -205
- data/utils/enveomics/Scripts/HMM.haai.rb +159 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 48d903a383d237f7b236d8ad1706a5fb017b31d320768353a1bc33846ea0d471
|
|
4
|
+
data.tar.gz: 9b448f00992aa4152df34ded6a48105afe5c1daf1e994737f253b519f81c998f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1579eecdab3c38bda21678baa4c903c85dfbb07e19f993212d89f6b62e1561d4044b41cfc98d80e43780d1fd172ef607a92d5320ff570de87dafda03520f2d59
|
|
7
|
+
data.tar.gz: d083fc8ae10735f647681d3924c8990951fc2a6ba7d501c10b89d32b0f00088a5128dfca2b24df118b2f8a56e3a5853d210933e300d7ab7bd3bd14c4bab90b8a
|
data/lib/miga/dataset/result.rb
CHANGED
|
@@ -226,8 +226,8 @@ module MiGA::Dataset::Result
|
|
|
226
226
|
def add_result_essential_genes(base, _opts)
|
|
227
227
|
return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
|
|
228
228
|
r = MiGA::Result.new("#{base}.json")
|
|
229
|
-
add_files_to_ds_result(r, name, ess_genes:
|
|
230
|
-
collection:
|
|
229
|
+
add_files_to_ds_result(r, name, ess_genes: '.ess.faa',
|
|
230
|
+
collection: '.ess', report: '.ess/log', alignments: '.ess/proteins.aln')
|
|
231
231
|
end
|
|
232
232
|
|
|
233
233
|
##
|
data/lib/miga/version.rb
CHANGED
|
@@ -10,7 +10,7 @@ module MiGA
|
|
|
10
10
|
# - Float representing the major.minor version.
|
|
11
11
|
# - Integer representing gem releases of the current version.
|
|
12
12
|
# - Integer representing minor changes that require new version number.
|
|
13
|
-
VERSION = [0.3, 11, 2]
|
|
13
|
+
VERSION = [0.3, 12, 0]
|
|
14
14
|
|
|
15
15
|
##
|
|
16
16
|
# Nickname for the current major.minor version.
|
|
@@ -18,7 +18,7 @@ module MiGA
|
|
|
18
18
|
|
|
19
19
|
##
|
|
20
20
|
# Date of the current gem release.
|
|
21
|
-
VERSION_DATE = Date.new(2019, 04,
|
|
21
|
+
VERSION_DATE = Date.new(2019, 04, 26)
|
|
22
22
|
|
|
23
23
|
##
|
|
24
24
|
# Reference of MiGA.
|
data/scripts/essential_genes.bash
CHANGED
|
@@ -7,31 +7,31 @@ SCRIPT="essential_genes"
|
|
|
7
7
|
cd "$PROJECT/data/07.annotation/01.function/01.essential"
|
|
8
8
|
|
|
9
9
|
# Initialize
|
|
10
|
-
miga date > "$DATASET.start"
|
|
11
|
-
FAA="../../../06.cds/$DATASET.faa"
|
|
10
|
+
miga date > "${DATASET}.start"
|
|
11
|
+
FAA="../../../06.cds/${DATASET}.faa"
|
|
12
12
|
|
|
13
13
|
# Check if there are any proteins
|
|
14
14
|
if [[ ! -s $FAA ]] ; then
|
|
15
15
|
echo Empty protein set, bypassing essential genes
|
|
16
|
-
rm "$DATASET.start"
|
|
17
|
-
miga
|
|
18
|
-
-m run_essential_genes=false --update
|
|
16
|
+
rm "${DATASET}.start"
|
|
17
|
+
miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
|
|
19
18
|
exit 0
|
|
20
19
|
fi
|
|
21
20
|
|
|
22
21
|
# Find and extract essential genes
|
|
23
|
-
[[ -d "$DATASET.ess" ]] && rm -R "$DATASET.ess"
|
|
24
|
-
mkdir "$DATASET.ess"
|
|
22
|
+
[[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
|
|
23
|
+
mkdir "${DATASET}.ess"
|
|
25
24
|
TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
|
|
26
25
|
--metadata "type" | awk '{print $2}')
|
|
27
26
|
if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
|
|
28
|
-
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
|
29
|
-
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
|
30
|
-
> "$DATASET.ess/log"
|
|
27
|
+
HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
|
|
28
|
+
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
|
29
|
+
> "${DATASET}.ess/log"
|
|
31
30
|
else
|
|
32
|
-
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
|
33
|
-
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" \
|
|
34
|
-
|
|
31
|
+
HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
|
|
32
|
+
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
|
|
33
|
+
--alignments "${DATASET}.ess/proteins.aln" \
|
|
34
|
+
> "${DATASET}.ess/log"
|
|
35
35
|
fi
|
|
36
36
|
|
|
37
37
|
# Reduce files
|
|
@@ -42,5 +42,5 @@ if exists "$DATASET".ess/*.faa ; then
|
|
|
42
42
|
fi
|
|
43
43
|
|
|
44
44
|
# Finalize
|
|
45
|
-
miga date > "$DATASET.done"
|
|
45
|
+
miga date > "${DATASET}.done"
|
|
46
46
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
data/test/daemon_test.rb
CHANGED
|
@@ -37,6 +37,7 @@ class DaemonTest < Test::Unit::TestCase
|
|
|
37
37
|
File.expand_path("data/01.raw_reads/ds1.1.fastq", p.path))
|
|
38
38
|
FileUtils.cp(File.expand_path("daemon/daemon.json", p.path),
|
|
39
39
|
File.expand_path("data/01.raw_reads/ds1.done", p.path))
|
|
40
|
+
ds.first_preprocessing(true)
|
|
40
41
|
out = capture_stdout do
|
|
41
42
|
d.check_datasets
|
|
42
43
|
end
|
data/utils/enveomics/Manifest/Tasks/other.json
CHANGED
|
@@ -401,6 +401,19 @@
|
|
|
401
401
|
"description": ["Path to the report file. By default, the report is",
|
|
402
402
|
"sent to the STDOUT."]
|
|
403
403
|
},
|
|
404
|
+
{
|
|
405
|
+
"name": "HMMsearch output",
|
|
406
|
+
"opt": "--hmm-out",
|
|
407
|
+
"arg": "out_file",
|
|
408
|
+
"description": ["Save HMMsearch output in this file. By default,",
|
|
409
|
+
"not saved."]
|
|
410
|
+
},
|
|
411
|
+
{
|
|
412
|
+
"opt": "--alignments",
|
|
413
|
+
"opt": "out_file",
|
|
414
|
+
"description": ["Save the aligned proteins in this file. By default,",
|
|
415
|
+
"not saved."]
|
|
416
|
+
},
|
|
404
417
|
{
|
|
405
418
|
"opt": "--bacteria",
|
|
406
419
|
"description": "If set, ignores models typically missing in Bacteria."
|
data/utils/enveomics/Manifest/Tasks/sequence-identity.json
CHANGED
|
@@ -189,7 +189,7 @@
|
|
|
189
189
|
"description": ["Calculates the Average Nucleotide Identity between two",
|
|
190
190
|
"genomes."],
|
|
191
191
|
"help_arg": "--help",
|
|
192
|
-
"see_also": ["aai.rb","rbm.rb"],
|
|
192
|
+
"see_also": ["aai.rb","rbm.rb","HMM.essential.rb"],
|
|
193
193
|
"cite": [
|
|
194
194
|
["Konstantinidis & Tiedje, 2005, PNAS",
|
|
195
195
|
"http://dx.doi.org/10.1073%2Fpnas.0409727102"],
|
|
@@ -362,6 +362,38 @@
|
|
|
362
362
|
}
|
|
363
363
|
]
|
|
364
364
|
},
|
|
365
|
+
{
|
|
366
|
+
"task": "HMM.haai.rb",
|
|
367
|
+
"description": ["Estimates Average Amino Acid Identity (AAI) from the",
|
|
368
|
+
"essential genes extracted and aligned by HMM.essential.rb (see",
|
|
369
|
+
"Alignments)."],
|
|
370
|
+
"help_arg": "--help",
|
|
371
|
+
"see_also": ["HMM.essential.rb","aai.rb"],
|
|
372
|
+
"options": [
|
|
373
|
+
{
|
|
374
|
+
"name": "Alignments 1",
|
|
375
|
+
"opt": "-1",
|
|
376
|
+
"arg": "in_file",
|
|
377
|
+
"description": "Input alignments file for genome 1."
|
|
378
|
+
},
|
|
379
|
+
{
|
|
380
|
+
"name": "Alignments 2",
|
|
381
|
+
"opt": "-2",
|
|
382
|
+
"arg": "in_file",
|
|
383
|
+
"description": "Input alignments file for genome 2."
|
|
384
|
+
},
|
|
385
|
+
{
|
|
386
|
+
"name": "Alignment output",
|
|
387
|
+
"opt": "--aln-out",
|
|
388
|
+
"arg": "out_file",
|
|
389
|
+
"description": "Output file containing the aligned proteins."
|
|
390
|
+
},
|
|
391
|
+
{
|
|
392
|
+
"opt": "--quiet",
|
|
393
|
+
"description": "Run quietly (no STDERR output)."
|
|
394
|
+
}
|
|
395
|
+
]
|
|
396
|
+
},
|
|
365
397
|
{
|
|
366
398
|
"task": "rbm.rb",
|
|
367
399
|
"description": ["Finds the reciprocal best matches between two sets of",
|
data/utils/enveomics/Manifest/categories.json
CHANGED
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
"Execution": [
|
|
30
30
|
"aai.rb",
|
|
31
31
|
"ani.rb",
|
|
32
|
+
"HMM.haai.rb",
|
|
32
33
|
"rbm.rb"
|
|
33
34
|
]
|
|
34
35
|
},
|
|
@@ -101,6 +102,7 @@
|
|
|
101
102
|
],
|
|
102
103
|
"Search": [
|
|
103
104
|
"HMM.essential.rb",
|
|
105
|
+
"HMM.haai.rb",
|
|
104
106
|
"HMMsearch.extractIds.rb",
|
|
105
107
|
"ogs.annotate.rb",
|
|
106
108
|
"ogs.core-pan.rb",
|
data/utils/enveomics/Manifest/examples.json
CHANGED
|
@@ -64,15 +64,15 @@
|
|
|
64
64
|
"task": "HMM.essential.rb",
|
|
65
65
|
"description": ["Typical single-copy bacterial genes present in",
|
|
66
66
|
"Mycoplasma genitalium."],
|
|
67
|
-
"values": ["Mgen_M2288.faa",null,null,null,
|
|
68
|
-
null,null,null,null,null,null]
|
|
67
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
|
|
68
|
+
null,null,null,null,null,null,null,null]
|
|
69
69
|
},
|
|
70
70
|
{
|
|
71
71
|
"task": "HMM.essential.rb",
|
|
72
72
|
"description": ["Typical single-copy archaeal genes present in",
|
|
73
73
|
"Nanoarchaeum equitans."],
|
|
74
|
-
"values": ["Mgen_M2288.faa",null,null,null,null,
|
|
75
|
-
null,null,null,null,null,null]
|
|
74
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
|
|
75
|
+
null,null,null,null,null,null,null,null]
|
|
76
76
|
},
|
|
77
77
|
{
|
|
78
78
|
"task": "Newick.autoprune.R",
|
data/utils/enveomics/Scripts/HMM.essential.rb
CHANGED
|
@@ -1,20 +1,17 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
2
|
|
|
3
|
-
#
|
|
4
3
|
# @author Luis M. Rodriguez-R
|
|
5
4
|
# @license artistic license 2.0
|
|
6
|
-
# @update Mar-23-2016
|
|
7
|
-
#
|
|
8
5
|
|
|
9
|
-
$:.push File.expand_path(
|
|
10
|
-
require
|
|
11
|
-
use
|
|
12
|
-
use
|
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
|
7
|
+
require 'enveomics_rb/enveomics'
|
|
8
|
+
use 'tmpdir'
|
|
9
|
+
use 'zlib'
|
|
13
10
|
|
|
14
|
-
o = {bin:
|
|
15
|
-
|
|
11
|
+
o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
|
|
12
|
+
archaea: false, genomeeq: false, metagenome: false, list: false}
|
|
16
13
|
OptionParser.new do |opts|
|
|
17
|
-
|
|
14
|
+
opts.banner = "
|
|
18
15
|
Finds and extracts a collection of essential proteins suitable for genome
|
|
19
16
|
completeness evaluation and phylogenetic analyses. Important note: most complete
|
|
20
17
|
bacterial genomes contain only 106/111 genes in this collection, therefore
|
|
@@ -27,68 +24,74 @@ completeness (e.g., Nanoarchaeum equitans returns 88.5%).
|
|
|
27
24
|
Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
|
|
28
25
|
|
|
29
26
|
Usage: #{$0} [options]"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
27
|
+
opts.separator ''
|
|
28
|
+
opts.separator 'Mandatory'
|
|
29
|
+
opts.on('-i', '--in FILE',
|
|
30
|
+
'Path to the FastA file containing all the proteins in a genome.'
|
|
31
|
+
){ |v| o[:in] = v }
|
|
32
|
+
opts.separator ''
|
|
33
|
+
opts.separator 'Report Options'
|
|
34
|
+
opts.on('-o', '--out FILE',
|
|
35
|
+
'Path to the output FastA file with the translated essential genes.',
|
|
36
|
+
'By default the file is not produced.'){ |v| o[:out] = v }
|
|
37
|
+
opts.on('-m', '--per-model STR',
|
|
38
|
+
'Prefix of translated genes in independent files with the name of the',
|
|
39
|
+
'model appended. By default files are not produced.'
|
|
40
|
+
){ |v| o[:permodel] = v }
|
|
41
|
+
opts.on('-R', '--report FILE',
|
|
42
|
+
'Path to the report file. By default, the report is sent to the STDOUT.'
|
|
43
|
+
){ |v| o[:report] = v }
|
|
44
|
+
opts.on('--hmm-out FILE',
|
|
45
|
+
'Save HMMsearch output in this file. By default, not saved.'
|
|
46
|
+
){ |v| o[:hmmout] = v }
|
|
47
|
+
opts.on('--alignments FILE',
|
|
48
|
+
'Save the aligned proteins in this file. By default, not saved'
|
|
49
|
+
){ |v| o[:alignments] = v }
|
|
50
|
+
opts.on('-B', '--bacteria',
|
|
51
|
+
'If set, ignores models typically missing in Bacteria.'
|
|
52
|
+
){ |v| o[:bacteria] = v }
|
|
53
|
+
opts.on('-A', '--archaea',
|
|
54
|
+
'If set, ignores models typically missing in Archaea.'
|
|
55
|
+
){ |v| o[:archaea] = v }
|
|
56
|
+
opts.on('-G', '--genome-eq',
|
|
57
|
+
'If set, ignores models not suitable for genome-equivalents estimations.',
|
|
58
|
+
'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
|
|
59
|
+
){ |v| o[:genomeeq] = v }
|
|
60
|
+
opts.on('-r', '--rename STR',
|
|
61
|
+
'If set, renames the sequences with the string provided and appends it',
|
|
62
|
+
'with pipe and the gene name (except in --per-model files).'
|
|
63
|
+
){ |v| o[:rename]=v }
|
|
64
|
+
opts.on('-n', '--no-stats',
|
|
65
|
+
'If set, no statistics are reported on genome evaluation.'
|
|
66
|
+
){ |v| o[:stats] = v }
|
|
67
|
+
opts.on('-s', '--no-genes',
|
|
68
|
+
'If set, statistics won\'t include the lists of missing/multi-copy genes.'
|
|
69
|
+
){ |v| o[:genes] = v }
|
|
70
|
+
opts.on('-M', '--metagenome',
|
|
71
|
+
'If set, it allows for multiple copies of each gene and turns on',
|
|
72
|
+
'metagenomic report mode.'){ |v| o[:metagenome] = v }
|
|
73
|
+
opts.separator ''
|
|
74
|
+
opts.separator 'Other Options'
|
|
75
|
+
opts.on('-L', '--list-models',
|
|
76
|
+
'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
|
|
77
|
+
'and -q; ignores all other parameters.'){ |v| o[:list] = v }
|
|
78
|
+
opts.on('-b', '--bin DIR',
|
|
79
|
+
'Path to the directory containing the binaries of HMMer 3.0+.'
|
|
80
|
+
){ |v| o[:bin] = v }
|
|
81
|
+
opts.on('--model-file',
|
|
82
|
+
'External file containing models to search.'){ |v| o[:model_file] = v }
|
|
83
|
+
opts.on('-t', '--threads INT',
|
|
84
|
+
"Number of parallel threads to be used. By default: #{o[:thr]}."
|
|
85
|
+
){ |v| o[:thr] = v.to_i }
|
|
86
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
|
|
87
|
+
opts.on('-h', '--help', 'Display this screen.') do
|
|
88
|
+
puts opts
|
|
89
|
+
exit
|
|
90
|
+
end
|
|
91
|
+
opts.separator ''
|
|
89
92
|
end.parse!
|
|
90
|
-
abort
|
|
91
|
-
o[:bin] = o[:bin]+
|
|
93
|
+
abort '-i is mandatory' if o[:in].nil? and not o[:list]
|
|
94
|
+
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
|
92
95
|
o[:rename] = nil if o[:metagenome]
|
|
93
96
|
|
|
94
97
|
not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
|
|
@@ -107,148 +110,175 @@ not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
|
|
|
107
110
|
TIGR00389 TIGR00436 tRNA-synth_1d}
|
|
108
111
|
|
|
109
112
|
begin
|
|
110
|
-
|
|
111
|
-
|
|
113
|
+
Dir.mktmpdir do |dir|
|
|
114
|
+
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
|
|
115
|
+
|
|
116
|
+
# Create database.
|
|
117
|
+
$stderr.puts 'Searching models.' unless o[:q]
|
|
118
|
+
models = {}
|
|
119
|
+
model_id = nil
|
|
120
|
+
dbh = File.open("#{dir}/essential.hmm", 'w')
|
|
121
|
+
o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
|
|
122
|
+
mfh = (File.extname(o[:model_file]) == '.gz') ?
|
|
123
|
+
Zlib::GzipReader.open(o[:model_file]) :
|
|
124
|
+
File.open(o[:model_file], 'r')
|
|
125
|
+
while ln = mfh.gets
|
|
126
|
+
dbh.print ln
|
|
127
|
+
ln.chomp!
|
|
128
|
+
model_id = $1 if ln =~ /^NAME\s+(.+)/
|
|
129
|
+
models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
|
|
130
|
+
end
|
|
131
|
+
dbh.close
|
|
132
|
+
mfh.close
|
|
133
|
+
models.delete_if { |m| not_in_archaea.include? m } if o[:archaea]
|
|
134
|
+
models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
|
|
135
|
+
models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
|
|
136
|
+
if o[:list]
|
|
137
|
+
models.each_pair{ |id,desc| puts [id,desc].join("\t") }
|
|
138
|
+
exit
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# Check HMMer version and run HMMsearch.
|
|
142
|
+
if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
|
|
143
|
+
raise 'You have provided an unsupported version of HMMER. ' +
|
|
144
|
+
'This script requires HMMER 3.0+.'
|
|
145
|
+
end
|
|
146
|
+
o[:hmmout] ||= "#{dir}/hmmsearch"
|
|
147
|
+
`"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
|
|
148
|
+
-A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
|
|
149
|
+
> #{dir}/hmmsearch.log`
|
|
112
150
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
151
|
+
# Parse output
|
|
152
|
+
$stderr.puts 'Parsing results.' unless o[:q]
|
|
153
|
+
trash = []
|
|
154
|
+
genes = {}
|
|
155
|
+
File.open(o[:hmmout], 'r') do |resh|
|
|
156
|
+
while ln = resh.gets
|
|
157
|
+
next if ln =~ /^#/
|
|
158
|
+
r = ln.split /\s+/
|
|
159
|
+
next unless models.include? r[2]
|
|
160
|
+
if o[:metagenome]
|
|
161
|
+
genes[ r[2] ] = [] if genes[ r[2] ].nil?
|
|
162
|
+
genes[ r[2] ] << r[0]
|
|
163
|
+
elsif genes[ r[2] ].nil?
|
|
164
|
+
genes[ r[2] ] = r[0]
|
|
165
|
+
else
|
|
166
|
+
trash << r[2]
|
|
167
|
+
end
|
|
128
168
|
end
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
if o[:
|
|
135
|
-
|
|
136
|
-
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
# Report statistics
|
|
172
|
+
if o[:stats]
|
|
173
|
+
reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
|
|
174
|
+
if o[:metagenome]
|
|
175
|
+
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
|
176
|
+
gc = [0] * (models.size - genes.size) +
|
|
177
|
+
genes.values.map{ |g| g.length }.sort
|
|
178
|
+
reph.printf "! Mean number of copies per model: %.3f.\n",
|
|
179
|
+
gc.inject(:+).to_f / models.size
|
|
180
|
+
reph.printf "! Median number of copies per model: %.1f.\n",
|
|
181
|
+
gc.size.even? ? gc[gc.size/2, 2].inject(:+).to_f / 2 : gc[gc.size/2]
|
|
182
|
+
if o[:genes] and genes.size != models.size
|
|
183
|
+
reph.printf "! Missing genes: %s\n",
|
|
184
|
+
([''] + models.keys.select{ |m| not genes.keys.include? m }.
|
|
185
|
+
map{|m| "#{m}: #{models[m]}."}).join("\n! ")
|
|
186
|
+
end
|
|
187
|
+
else
|
|
188
|
+
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
|
189
|
+
reph.printf "! Completeness: %.1f%%.\n",
|
|
190
|
+
100.0 * genes.size / models.size
|
|
191
|
+
reph.printf "! Contamination: %.1f%%.\n",
|
|
192
|
+
100.0 * trash.size / models.size
|
|
193
|
+
if o[:genes]
|
|
194
|
+
reph.printf "! Multiple copies: %s\n",
|
|
195
|
+
([''] + trash.uniq.
|
|
196
|
+
map{ |m| "#{trash.count(m)+1} #{m}: #{models[m]}." }).
|
|
197
|
+
join("\n! ") unless trash.empty?
|
|
198
|
+
reph.printf "! Missing genes: %s\n",
|
|
199
|
+
([''] + models.keys.select{ |m| not genes.keys.include? m }.
|
|
200
|
+
map{ |m| "#{m}: #{models[m]}." }).
|
|
201
|
+
join("\n! ") unless genes.size == models.size
|
|
202
|
+
end
|
|
137
203
|
end
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
204
|
+
reph.close unless o[:report].nil?
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
# Extract sequences
|
|
208
|
+
unless o[:out].nil? and o[:permodel].nil?
|
|
209
|
+
$stderr.puts 'Extracting sequences.' unless o[:q]
|
|
210
|
+
faah = File.open(o[:in], 'r')
|
|
211
|
+
outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
|
|
212
|
+
geneh = nil
|
|
213
|
+
in_gene = nil
|
|
214
|
+
unless o[:permodel].nil?
|
|
215
|
+
genes.keys.each do |m|
|
|
216
|
+
File.open("#{o[:permodel]}#{m}.faa", 'w').close
|
|
217
|
+
end
|
|
143
218
|
end
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
219
|
+
while ln = faah.gets
|
|
220
|
+
if ln =~ /^>(\S+)/
|
|
221
|
+
if o[:metagenome]
|
|
222
|
+
in_gene = genes.keys.
|
|
223
|
+
map{ |k| genes[k].include?($1) ? k : nil }.compact.first
|
|
224
|
+
in_gene = [in_gene, $1] unless in_gene.nil?
|
|
225
|
+
else
|
|
226
|
+
in_gene = genes.rassoc($1)
|
|
227
|
+
end
|
|
228
|
+
next if in_gene.nil?
|
|
229
|
+
geneh.close unless geneh.nil?
|
|
230
|
+
geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa", 'a+') unless
|
|
231
|
+
o[:permodel].nil?
|
|
232
|
+
outh.print(o[:rename].nil? ?
|
|
233
|
+
ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
|
|
234
|
+
geneh.print(o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless
|
|
235
|
+
geneh.nil?
|
|
236
|
+
else
|
|
237
|
+
next if in_gene.nil?
|
|
238
|
+
outh.print ln unless outh.nil?
|
|
239
|
+
geneh.print ln unless geneh.nil?
|
|
240
|
+
end
|
|
165
241
|
end
|
|
242
|
+
geneh.close unless geneh.nil?
|
|
243
|
+
outh.close unless outh.nil?
|
|
244
|
+
faah.close
|
|
245
|
+
end
|
|
166
246
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
if o[:genes]
|
|
193
|
-
reph.printf "! Multiple copies: %s\n",
|
|
194
|
-
([""] +
|
|
195
|
-
trash.uniq.map{|m|
|
|
196
|
-
"#{trash.count(m)+1} #{m}: #{models[m]}."}
|
|
197
|
-
).join("\n! ") unless trash.empty?
|
|
198
|
-
reph.printf "! Missing genes: %s\n",
|
|
199
|
-
([""] +
|
|
200
|
-
models.keys.select{|m| not genes.keys.include? m
|
|
201
|
-
}.map{|m| "#{m}: #{models[m]}."}
|
|
202
|
-
).join("\n! ") unless genes.size==models.size
|
|
203
|
-
end
|
|
204
|
-
end
|
|
205
|
-
reph.close unless o[:report].nil?
|
|
247
|
+
unless o[:alignments].nil?
|
|
248
|
+
aln = {}
|
|
249
|
+
File.open("#{dir}/a.sto", 'r') do |fh|
|
|
250
|
+
cur_model = nil
|
|
251
|
+
mask = []
|
|
252
|
+
fh.each_line do |ln|
|
|
253
|
+
case ln.chomp
|
|
254
|
+
when /^# STOCKHOLM/
|
|
255
|
+
cur_model = nil
|
|
256
|
+
mask = []
|
|
257
|
+
when /^#=GS (\S+)\/([\d\-]+)\s+DE/
|
|
258
|
+
cur_model ||= genes.rassoc($1).first
|
|
259
|
+
aln[ cur_model ] ||= [ "# #{cur_model} : #{$1} : #{$2}" ]
|
|
260
|
+
when /^#=GC RF\s+(\S+)/
|
|
261
|
+
aln[ cur_model ][ 1 ] ||= $1.upcase.tap do |i|
|
|
262
|
+
mask.each{ |d| i[d] = '' }
|
|
263
|
+
end
|
|
264
|
+
when /^[^#]\S*\s+(\S+)/
|
|
265
|
+
next if aln[ cur_model ][ 2 ]
|
|
266
|
+
aln[ cur_model ][ 2 ] = $1.upcase
|
|
267
|
+
mask = aln[ cur_model ][ 2 ].split('').each_with_index.
|
|
268
|
+
map{ |v, k| v == '.' ? k : nil }.compact.reverse
|
|
269
|
+
aln[ cur_model ][ 2 ].delete!('.') unless mask.empty?
|
|
270
|
+
end
|
|
271
|
+
end
|
|
206
272
|
end
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
unless o[:out].nil? and o[:permodel].nil?
|
|
210
|
-
$stderr.puts "Extracting sequences." unless o[:q]
|
|
211
|
-
faah = File.open(o[:in], "r")
|
|
212
|
-
outh = o[:out].nil? ? nil : File.open(o[:out], "w")
|
|
213
|
-
geneh = nil
|
|
214
|
-
in_gene = nil
|
|
215
|
-
unless o[:permodel].nil?
|
|
216
|
-
genes.keys.each do |m|
|
|
217
|
-
File.open("#{o[:permodel]}#{m}.faa", "w").close
|
|
218
|
-
end
|
|
219
|
-
end
|
|
220
|
-
while ln = faah.gets
|
|
221
|
-
if ln =~ /^>(\S+)/
|
|
222
|
-
if o[:metagenome]
|
|
223
|
-
in_gene = genes.keys.map{|k| genes[k].include?($1) ? k : nil
|
|
224
|
-
}.compact.first
|
|
225
|
-
in_gene = [in_gene, $1] unless in_gene.nil?
|
|
226
|
-
else
|
|
227
|
-
in_gene = genes.rassoc($1)
|
|
228
|
-
end
|
|
229
|
-
next if in_gene.nil?
|
|
230
|
-
geneh.close unless geneh.nil?
|
|
231
|
-
geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa",
|
|
232
|
-
"a+") unless o[:permodel].nil?
|
|
233
|
-
outh.print(o[:rename].nil? ?
|
|
234
|
-
ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
|
|
235
|
-
geneh.print(
|
|
236
|
-
o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless geneh.nil?
|
|
237
|
-
else
|
|
238
|
-
next if in_gene.nil?
|
|
239
|
-
outh.print ln unless outh.nil?
|
|
240
|
-
geneh.print ln unless geneh.nil?
|
|
241
|
-
end
|
|
242
|
-
end
|
|
243
|
-
geneh.close unless geneh.nil?
|
|
244
|
-
outh.close unless outh.nil?
|
|
245
|
-
faah.close
|
|
273
|
+
File.open(o[:alignments], 'w') do |fh|
|
|
274
|
+
aln.each { |k, v| v.each{ |i| fh.puts i } }
|
|
246
275
|
end
|
|
276
|
+
end
|
|
247
277
|
|
|
248
|
-
|
|
249
|
-
|
|
278
|
+
$stderr.puts 'Done.' unless o[:q]
|
|
279
|
+
end # |dir|
|
|
250
280
|
rescue => err
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
281
|
+
$stderr.puts "Exception: #{err}\n\n"
|
|
282
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
|
283
|
+
err
|
|
254
284
|
end
|
data/utils/enveomics/Scripts/HMM.haai.rb
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
|
4
|
+
# @license Artistic-2.0
|
|
5
|
+
|
|
6
|
+
require 'optparse'
|
|
7
|
+
|
|
8
|
+
o = {q: false}
|
|
9
|
+
ARGV << '-h' if ARGV.size==0
|
|
10
|
+
|
|
11
|
+
OptionParser.new do |opt|
|
|
12
|
+
opt.banner = "
|
|
13
|
+
Estimates Average Amino Acid Identity (AAI) from the essential genes extracted
|
|
14
|
+
and aligned by HMM.essential.rb (see --alignments).
|
|
15
|
+
|
|
16
|
+
Usage: #{$0} [options]"
|
|
17
|
+
opt.separator ''
|
|
18
|
+
opt.separator 'Mandatory'
|
|
19
|
+
opt.on('-1 PATH', 'Input alignments file for genome 1.'){ |v| o[:a] = v }
|
|
20
|
+
opt.on('-2 PATH', 'Input alignments file for genome 2.'){ |v| o[:b] = v }
|
|
21
|
+
opt.separator ''
|
|
22
|
+
opt.separator 'Options'
|
|
23
|
+
opt.on('-a', '--aln-out FILE',
|
|
24
|
+
'Output file containing the aligned proteins'){ |v| o[:alnout] = v }
|
|
25
|
+
opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
|
|
26
|
+
opt.on('-h', '--help', 'Display this screen.') do
|
|
27
|
+
puts opt
|
|
28
|
+
exit
|
|
29
|
+
end
|
|
30
|
+
opt.separator ''
|
|
31
|
+
end.parse!
|
|
32
|
+
abort '-1 is mandatory.' if o[:a].nil?
|
|
33
|
+
abort '-2 is mandatory.' if o[:b].nil?
|
|
34
|
+
|
|
35
|
+
class HList
|
|
36
|
+
attr_accessor :list
|
|
37
|
+
def initialize(file)
|
|
38
|
+
@list = {}
|
|
39
|
+
r = File.readlines(file)
|
|
40
|
+
while not r.empty?
|
|
41
|
+
e = HElement.new(*r.shift(3))
|
|
42
|
+
@list[ e.model_id ] = e
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def [](model_id)
|
|
47
|
+
list[model_id]
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
##
|
|
51
|
+
# Returns an array of HAln objects.
|
|
52
|
+
def align(other)
|
|
53
|
+
list.keys.map do |model_id|
|
|
54
|
+
self[model_id].align(other[model_id]) unless other[model_id].nil?
|
|
55
|
+
end.compact
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def models
|
|
59
|
+
list.keys
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
class HElement
|
|
64
|
+
attr_accessor :defline, :model_id, :protein_id, :protein_coords
|
|
65
|
+
attr_accessor :model_aln, :protein_aln
|
|
66
|
+
def initialize(defline, model_aln, protein_aln)
|
|
67
|
+
@defline = defline.chomp
|
|
68
|
+
@model_aln = model_aln.chomp
|
|
69
|
+
@protein_aln = protein_aln.chomp
|
|
70
|
+
if defline =~ /^# (.+) : (.+) : (.+)/
|
|
71
|
+
@model_id = $1
|
|
72
|
+
@protein_id = $2
|
|
73
|
+
@protein_coords = $3
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def dup
|
|
78
|
+
HElement.new(defline, model_aln, protein_aln)
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
##
|
|
82
|
+
# Returns an HAln object
|
|
83
|
+
def align(other)
|
|
84
|
+
HAln.new(self, other)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def mask
|
|
88
|
+
@mask ||= model_aln.chars.
|
|
89
|
+
each_with_index.map{ |v, k| v == '.' ? k : nil }.
|
|
90
|
+
compact.reverse
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def mask!(template)
|
|
94
|
+
(template - mask).each do |d|
|
|
95
|
+
@model_aln[d] = '-' + @model_aln[d]
|
|
96
|
+
@protein_aln[d] = '-' + @protein_aln[d]
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
class HAln
|
|
102
|
+
attr :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id
|
|
103
|
+
def initialize(a, b)
|
|
104
|
+
a_masked = a.dup
|
|
105
|
+
a_masked.mask! b.mask.reverse
|
|
106
|
+
b_masked = b.dup
|
|
107
|
+
b_masked.mask! b_masked.mask
|
|
108
|
+
@protein_1 = a_masked.protein_aln
|
|
109
|
+
@protein_2 = b_masked.protein_aln
|
|
110
|
+
@model_id = a.model_id
|
|
111
|
+
@protein_1_id = a.protein_id + '/' + a.protein_coords
|
|
112
|
+
@protein_2_id = b.protein_id + '/' + b.protein_coords
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def stats
|
|
116
|
+
@stats = { len: 0, gaps: 0, matches: 0 }
|
|
117
|
+
return @stats unless @stats[:id].nil?
|
|
118
|
+
protein_1.chars.each_with_index do |v, k|
|
|
119
|
+
next if v == '-' and protein_2[k] == '-'
|
|
120
|
+
@stats[:len] += 1
|
|
121
|
+
if v == protein_2[k]
|
|
122
|
+
@stats[:matches] += 1
|
|
123
|
+
elsif v == '-' or protein_2[k] == '-'
|
|
124
|
+
@stats[:gaps] += 1
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
@stats.tap { |i| i[:id] = 100.0 * @stats[:matches] / @stats[:len] }
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
def stats_to_s
|
|
131
|
+
stats.map{ |k,v| "#{k}:#{v}" }.join " "
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def to_s
|
|
135
|
+
"# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}\n" +
|
|
136
|
+
protein_1 + "\n" + protein_2 + "\n"
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
hlist1 = HList.new(o[:a])
|
|
141
|
+
hlist2 = HList.new(o[:b])
|
|
142
|
+
haln_arr = hlist1.align(hlist2)
|
|
143
|
+
|
|
144
|
+
avg_identity = haln_arr.map{ |i| i.stats[:id] }.inject(:+) / haln_arr.size
|
|
145
|
+
avg2_identity = haln_arr.map{ |i| i.stats[:id] ** 2 }.inject(:+) / haln_arr.size
|
|
146
|
+
sd_identity = Math.sqrt( avg2_identity - avg_identity ** 2 )
|
|
147
|
+
puts "Common models: #{haln_arr.size}"
|
|
148
|
+
puts "All models: #{(hlist1.models | hlist1.models).size}"
|
|
149
|
+
puts "Average identity: #{avg_identity.round(2)}%"
|
|
150
|
+
puts "SD identity: #{sd_identity.round(2)}"
|
|
151
|
+
|
|
152
|
+
if o[:alnout]
|
|
153
|
+
File.open(o[:alnout], 'w') do |fh|
|
|
154
|
+
haln_arr.each do |i|
|
|
155
|
+
fh.puts i
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: miga-base
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.11.2
|
|
4
|
+
version: 0.3.12.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Luis M. Rodriguez-R
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2019-04-
|
|
11
|
+
date: 2019-04-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: daemons
|
|
@@ -321,6 +321,7 @@ files:
|
|
|
321
321
|
- utils/enveomics/Scripts/GFF.catsbj.pl
|
|
322
322
|
- utils/enveomics/Scripts/GenBank.add_fields.rb
|
|
323
323
|
- utils/enveomics/Scripts/HMM.essential.rb
|
|
324
|
+
- utils/enveomics/Scripts/HMM.haai.rb
|
|
324
325
|
- utils/enveomics/Scripts/HMMsearch.extractIds.rb
|
|
325
326
|
- utils/enveomics/Scripts/JPlace.distances.rb
|
|
326
327
|
- utils/enveomics/Scripts/JPlace.to_iToL.rb
|