miga-base 0.5.0.0 → 0.5.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor.rb +6 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/cli/action/quality_wf.rb +1 -0
- data/lib/miga/cli/action/stats.rb +9 -8
- data/lib/miga/cli/action/wf.rb +5 -0
- data/lib/miga/cli/objects_helper.rb +1 -0
- data/lib/miga/common/format.rb +5 -2
- data/lib/miga/daemon.rb +2 -2
- data/lib/miga/project/dataset.rb +8 -7
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +9 -8
- data/scripts/mytaxa.bash +3 -1
- data/scripts/mytaxa_scan.bash +15 -8
- data/utils/domain-ess-genes.rb +63 -0
- data/utils/enveomics/Manifest/Tasks/other.json +21 -2
- data/utils/enveomics/Manifest/examples.json +4 -4
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
- data/utils/enveomics/Scripts/HMM.essential.rb +54 -17
- data/utils/enveomics/Scripts/lib/data/{essential.hmm.gz → dupont_2012_essential.hmm.gz} +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/R/df2dist.R +16 -17
- data/utils/enveomics/enveomics.R/R/recplot2.R +20 -15
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +5 -4
- data/utils/find-medoid.R +6 -1
- data/utils/mytaxa_scan.rb +49 -46
- data/utils/ref-tree.R +6 -1
- data/utils/subclades-nj.R +6 -1
- data/utils/subclades.R +6 -1
- metadata +6 -6
- data/utils/arch-ess-genes.rb +0 -57
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6888c1ce3756b8cc708736c0da052e5a7396277e0c903ebcfc083f17b6915e7
|
4
|
+
data.tar.gz: d998f6e087316a81de4aa8897452344c1987ce0cb9807f4a1e11a29f52dfbcf2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c6f7f8af791664b2bb704744535e0e39c4d5fc06521beb8feb57f658d6187a667100fde4312a3a8e5f47f5dd9d4b3c06326584d95e80262dc0e02a91795e192c
|
7
|
+
data.tar.gz: 4f633972d8ccc1cc06cc14ca6c48b50759d63af72ba75514a0e877a58af4e1d407fda2c2608c2077dd541e30e86add8359bc2e0838d93a19f7cc1bd5c5f5fff2
|
@@ -104,6 +104,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
104
104
|
unless ok
|
105
105
|
cli.say " > Registering again #{d.name}:#{r_k}"
|
106
106
|
d.add_result(r_k, true, force: true)
|
107
|
+
sr = d.result(:stats) and sr.remove!
|
107
108
|
end
|
108
109
|
end
|
109
110
|
end
|
@@ -123,7 +124,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
123
124
|
changed = true
|
124
125
|
end
|
125
126
|
end
|
126
|
-
|
127
|
+
if changed
|
128
|
+
d.add_result(:cds, true, force: true)
|
129
|
+
sr = d.result(:stats) and sr.remove!
|
130
|
+
end
|
127
131
|
end
|
128
132
|
end
|
129
133
|
|
@@ -136,6 +140,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
136
140
|
if dir.nil?
|
137
141
|
cli.say " > Removing #{d.name}:essential_genes"
|
138
142
|
res.remove!
|
143
|
+
sr = d.result(:stats) and sr.remove!
|
139
144
|
next
|
140
145
|
end
|
141
146
|
next if Dir["#{dir}/*.faa"].empty?
|
data/lib/miga/cli/action/init.rb
CHANGED
@@ -220,7 +220,7 @@ BASH
|
|
220
220
|
|
221
221
|
def check_r_packages(paths)
|
222
222
|
cli.puts 'Looking for R packages:'
|
223
|
-
%w(
|
223
|
+
%w(ape cluster vegan).each do |pkg|
|
224
224
|
cli.print "Testing #{pkg}... "
|
225
225
|
if test_r_package(cli, paths, pkg)
|
226
226
|
cli.puts 'yes.'
|
@@ -25,6 +25,7 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
|
|
25
25
|
%w[project_stats haai_distances aai_distances ani_distances clade_finding]
|
26
26
|
.map { |i| ["run_#{i}", false] }
|
27
27
|
]
|
28
|
+
p_metadata[:ess_coll] = cli[:ess_coll]
|
28
29
|
d_metadata = { run_distances: false }
|
29
30
|
d_metadata[:run_mytaxa_scan] = false unless cli[:mytaxa]
|
30
31
|
p = create_project(:assembly, p_metadata, d_metadata)
|
@@ -122,17 +122,18 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
|
|
122
122
|
end
|
123
123
|
end
|
124
124
|
else
|
125
|
-
# Fix estimate
|
126
|
-
if !d.metadata[:tax].nil? &&
|
127
|
-
|
128
|
-
r.file_path(:
|
129
|
-
scr = "#{MiGA.root_path}/utils/
|
125
|
+
# Fix estimate by domain
|
126
|
+
if !(tax = d.metadata[:tax]).nil? &&
|
127
|
+
%w[Archaea Bacteria].include?(tax[:d]) &&
|
128
|
+
r.file_path(:raw_report).nil?
|
129
|
+
scr = "#{MiGA.root_path}/utils/domain-ess-genes.rb"
|
130
130
|
rep = r.file_path(:report)
|
131
131
|
rc_p = File.expand_path('.miga_rc', ENV['HOME'])
|
132
132
|
rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
|
133
|
-
$stderr.print `#{rc} ruby '#{scr}'
|
134
|
-
|
135
|
-
r.add_file(:
|
133
|
+
$stderr.print `#{rc} ruby '#{scr}' \
|
134
|
+
'#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
|
135
|
+
r.add_file(:raw_report, "#{d.name}.ess/log")
|
136
|
+
r.add_file(:report, "#{d.name}.ess/log.domain")
|
136
137
|
end
|
137
138
|
# Extract/compute quality values
|
138
139
|
stats = {completeness: [0.0, '%'], contamination: [0.0, '%']}
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -24,6 +24,11 @@ module MiGA::Cli::Action::Wf
|
|
24
24
|
opt.separator " FILES...: #{files_desc}"
|
25
25
|
opt.separator ''
|
26
26
|
opt.separator 'Workflow Control Options'
|
27
|
+
opt.on(
|
28
|
+
'-C', '--collection STRING',
|
29
|
+
'Collection of essential genes to use as reference',
|
30
|
+
'One of: dupont_2012 (default), lee_2019'
|
31
|
+
) { |v| cli[:ess_coll] = v }
|
27
32
|
if params[:ncbi]
|
28
33
|
opt.on(
|
29
34
|
'-T', '--ncbi-taxon STRING',
|
data/lib/miga/common/format.rb
CHANGED
@@ -25,10 +25,13 @@ module MiGA::Common::Format
|
|
25
25
|
# Cleans a FastA file in place.
|
26
26
|
def clean_fasta_file(file)
|
27
27
|
tmp_fh = nil
|
28
|
+
tmp_path = nil
|
28
29
|
begin
|
29
30
|
if file =~ /\.gz/
|
30
31
|
tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
|
31
|
-
|
32
|
+
File.unlink tmp_path
|
33
|
+
tmp_path += '.gz'
|
34
|
+
tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
|
32
35
|
fh = Zlib::GzipReader.open(file)
|
33
36
|
else
|
34
37
|
tmp_fh = Tempfile.new('MiGA')
|
@@ -50,7 +53,7 @@ module MiGA::Common::Format
|
|
50
53
|
tmp_fh.print buffer.wrap_width(80)
|
51
54
|
tmp_fh.close
|
52
55
|
fh.close
|
53
|
-
FileUtils.
|
56
|
+
FileUtils.mv(tmp_path, file)
|
54
57
|
ensure
|
55
58
|
begin
|
56
59
|
tmp_fh.close unless tmp_fh.nil?
|
data/lib/miga/daemon.rb
CHANGED
@@ -285,10 +285,10 @@ class MiGA::Daemon < MiGA::MiGA
|
|
285
285
|
if [nil, '', 0].include? job[:pid]
|
286
286
|
job[:pid] = nil
|
287
287
|
@jobs_to_run << job
|
288
|
-
say "Unsuccessful #{job[:task_name]}, rescheduling
|
288
|
+
say "Unsuccessful #{job[:task_name]}, rescheduling"
|
289
289
|
else
|
290
290
|
@jobs_running << job
|
291
|
-
say "Spawned pid:#{job[:pid]} for #{job[:task_name]}
|
291
|
+
say "Spawned pid:#{job[:pid]} for #{job[:task_name]}"
|
292
292
|
end
|
293
293
|
end
|
294
294
|
end
|
data/lib/miga/project/dataset.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
##
|
5
5
|
# Helper module including specific functions handle datasets.
|
6
6
|
module MiGA::Project::Dataset
|
7
|
-
|
7
|
+
|
8
8
|
##
|
9
9
|
# Returns Array of MiGA::Dataset.
|
10
10
|
def datasets
|
@@ -23,7 +23,7 @@ module MiGA::Project::Dataset
|
|
23
23
|
def dataset_names_hash
|
24
24
|
@dataset_names_hash ||= Hash[dataset_names.map{ |i| [i,true] }]
|
25
25
|
end
|
26
|
-
|
26
|
+
|
27
27
|
##
|
28
28
|
# Returns MiGA::Dataset.
|
29
29
|
def dataset(name)
|
@@ -47,18 +47,19 @@ module MiGA::Project::Dataset
|
|
47
47
|
end
|
48
48
|
end
|
49
49
|
end
|
50
|
-
|
50
|
+
|
51
51
|
##
|
52
52
|
# Add dataset identified by +name+ and return MiGA::Dataset.
|
53
53
|
def add_dataset(name)
|
54
54
|
unless metadata[:datasets].include? name
|
55
55
|
MiGA::Dataset.new(self, name)
|
56
56
|
@metadata[:datasets] << name
|
57
|
+
@dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
|
57
58
|
save
|
58
59
|
end
|
59
60
|
dataset(name)
|
60
61
|
end
|
61
|
-
|
62
|
+
|
62
63
|
##
|
63
64
|
# Unlink dataset identified by +name+ and return MiGA::Dataset.
|
64
65
|
def unlink_dataset(name)
|
@@ -68,7 +69,7 @@ module MiGA::Project::Dataset
|
|
68
69
|
save
|
69
70
|
d
|
70
71
|
end
|
71
|
-
|
72
|
+
|
72
73
|
##
|
73
74
|
# Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any method
|
74
75
|
# supported by File#generic_transfer.
|
@@ -116,7 +117,7 @@ module MiGA::Project::Dataset
|
|
116
117
|
end
|
117
118
|
datasets.uniq - metadata[:datasets]
|
118
119
|
end
|
119
|
-
|
120
|
+
|
120
121
|
##
|
121
122
|
# Are all the datasets in the project preprocessed? Save intermediate results
|
122
123
|
# if +save+ (until the first incomplete dataset is reached).
|
@@ -149,6 +150,6 @@ module MiGA::Project::Dataset
|
|
149
150
|
def each_dataset_profile_advance(&blk)
|
150
151
|
each_dataset { |ds| blk.call(ds.profile_advance) }
|
151
152
|
end
|
152
|
-
|
153
|
+
|
153
154
|
end
|
154
155
|
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.5,
|
13
|
+
VERSION = [0.5, 1, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -18,7 +18,7 @@ module MiGA
|
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
21
|
-
VERSION_DATE = Date.new(
|
21
|
+
VERSION_DATE = Date.new(2020, 1, 6)
|
22
22
|
|
23
23
|
##
|
24
24
|
# Reference of MiGA.
|
@@ -22,18 +22,19 @@ fi
|
|
22
22
|
# Find and extract essential genes
|
23
23
|
[[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
|
24
24
|
mkdir "${DATASET}.ess"
|
25
|
-
TYPE=$(miga
|
25
|
+
TYPE=$(miga ls -P "$PROJECT" -D "$DATASET" \
|
26
26
|
--metadata "type" | awk '{print $2}')
|
27
|
+
COLL=$(miga about -P "$PROJECT" -m ess_coll)
|
28
|
+
[[ "$COLL" == "?" ]] && COLL=dupont_2012
|
29
|
+
CMD="HMM.essential.rb \
|
30
|
+
-i '$FAA' -o '${DATASET}.ess.faa' -m '${DATASET}.ess/' \
|
31
|
+
-t '$CORES' -r '$DATASET' --collection '$COLL'"
|
27
32
|
if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
|
28
|
-
|
29
|
-
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
30
|
-
> "${DATASET}.ess/log"
|
33
|
+
CMD="$CMD --metagenome"
|
31
34
|
else
|
32
|
-
|
33
|
-
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
|
34
|
-
--alignments "${DATASET}.ess/proteins.aln" \
|
35
|
-
> "${DATASET}.ess/log"
|
35
|
+
CMD="$CMD --alignments '${DATASET}.ess/proteins.aln'"
|
36
36
|
fi
|
37
|
+
$CMD > "${DATASET}.ess/log"
|
37
38
|
|
38
39
|
# Reduce files
|
39
40
|
if exists "$DATASET".ess/*.faa ; then
|
data/scripts/mytaxa.bash
CHANGED
@@ -38,7 +38,9 @@ else
|
|
38
38
|
fi
|
39
39
|
|
40
40
|
# Execute search
|
41
|
-
|
41
|
+
FAA="../../../06.cds/$DATASET.faa"
|
42
|
+
[[ -s "$FAA" ]] || FAA="${FAA}.gz"
|
43
|
+
diamond blastp -q "$FAA" -d "$MT/AllGenomes.faa" \
|
42
44
|
-a "$DATASET.daa" -k 5 -p "$CORES" --min-score 60
|
43
45
|
diamond view -a "$DATASET.daa" -o "$DATASET.blast"
|
44
46
|
|
data/scripts/mytaxa_scan.bash
CHANGED
@@ -39,12 +39,13 @@ else
|
|
39
39
|
exit 1
|
40
40
|
fi
|
41
41
|
|
42
|
+
FAA="../../../06.cds/$DATASET.faa"
|
43
|
+
[[ -s "$FAA" ]] || FAA="${FAA}.gz"
|
42
44
|
if [[ ! -s "$DATASET.mytaxa" ]] ; then
|
43
45
|
# Execute search
|
44
46
|
if [[ ! -s "$DATASET.blast" ]] ; then
|
45
|
-
diamond blastp -q "
|
46
|
-
-d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60
|
47
|
-
-a "$DATASET.daa" -t "$TMPDIR"
|
47
|
+
diamond blastp -q "$FAA" -a "$DATASET.daa" -t "$TMPDIR" \
|
48
|
+
-d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60
|
48
49
|
diamond view -a "$DATASET.daa" -o "$DATASET.blast" -t "$TMPDIR"
|
49
50
|
fi
|
50
51
|
|
@@ -53,8 +54,7 @@ else
|
|
53
54
|
| sort -k 13 > "$DATASET.mytaxain"
|
54
55
|
"$MT/MyTaxa" "$DATASET.mytaxain" "$DATASET.mytaxa" "0.5"
|
55
56
|
fi
|
56
|
-
ruby "$MIGA/utils/mytaxa_scan.rb" "
|
57
|
-
"$DATASET.mytaxa" "$DATASET.wintax"
|
57
|
+
ruby "$MIGA/utils/mytaxa_scan.rb" "$FAA" "$DATASET.mytaxa" "$DATASET.wintax"
|
58
58
|
echo "
|
59
59
|
source('$MIGA/utils/mytaxa_scan.R');
|
60
60
|
pdf('$DATASET.pdf', 12, 7);
|
@@ -70,11 +70,18 @@ else
|
|
70
70
|
let i=$i+1
|
71
71
|
awk "NR==$win" "$DATASET.wintax.genes" | tr "\\t" "\\n" \
|
72
72
|
> "$DATASET.reg/$i.ids"
|
73
|
-
|
74
|
-
|
73
|
+
if [[ "$FAA" == *.gz ]] ; then
|
74
|
+
gzip -c -d "$FAA" \
|
75
|
+
| FastA.filter.pl -q "$DATASET.reg/$i.ids" /dev/stdin \
|
76
|
+
> "$DATASET.reg/$i.faa"
|
77
|
+
else
|
78
|
+
FastA.filter.pl -q "$DATASET.reg/$i.ids" "$FAA" \
|
79
|
+
> "$DATASET.reg/$i.faa"
|
80
|
+
fi
|
75
81
|
done
|
76
82
|
# Archive regions
|
77
|
-
tar
|
83
|
+
tar -cf "$DATASET.reg.tar" "$DATASET.reg"
|
84
|
+
gzip -9 "$DATASET.reg.tar"
|
78
85
|
rm -r "$DATASET.reg"
|
79
86
|
fi
|
80
87
|
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
esslog = ARGV.shift
|
4
|
+
outlog = ARGV.shift
|
5
|
+
domain = ARGV.shift
|
6
|
+
|
7
|
+
def quality(hsh)
|
8
|
+
q = {}
|
9
|
+
q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
|
10
|
+
q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
|
11
|
+
q[:cmp] = 100.0*q[:found].to_f/hsh.size
|
12
|
+
q[:cnt] = 100.0*q[:multi].to_f/hsh.size
|
13
|
+
q
|
14
|
+
end
|
15
|
+
|
16
|
+
# Find collection and detected anomalies
|
17
|
+
cnt_ref = {}
|
18
|
+
at = :header
|
19
|
+
collection = 'dupont_2012'
|
20
|
+
File.open(esslog, 'r') do |fh|
|
21
|
+
fh.each_line do |ln|
|
22
|
+
v = ln.chomp.gsub(/^! +/, '')
|
23
|
+
if v == 'Multiple copies: '
|
24
|
+
at = :multi
|
25
|
+
elsif v == 'Missing genes: '
|
26
|
+
at = :missing
|
27
|
+
elsif v =~ /Collection: (\S+)/
|
28
|
+
collection = $1
|
29
|
+
elsif at == :multi
|
30
|
+
v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
|
31
|
+
cnt_ref[$2] = $1.to_i
|
32
|
+
elsif at == :missing
|
33
|
+
v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
|
34
|
+
cnt_ref[$1] = 0
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# Find expected genes for domain
|
40
|
+
n_dom = Hash[
|
41
|
+
`HMM.essential.rb -L -q '-#{domain}' -c '#{collection}'`
|
42
|
+
.chomp.split("\n").map { |i| i.split("\t") }
|
43
|
+
]
|
44
|
+
l_dom = n_dom.keys
|
45
|
+
cnt_dom = {}
|
46
|
+
l_dom.each { |i| cnt_dom[i] = cnt_ref[i] || 1 }
|
47
|
+
|
48
|
+
# Correct report
|
49
|
+
q = quality(cnt_dom)
|
50
|
+
File.open(outlog, 'w') do |ofh|
|
51
|
+
ofh.puts "! Collection: #{collection} #{domain}"
|
52
|
+
ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_dom.size}."
|
53
|
+
ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
|
54
|
+
ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
|
55
|
+
if q[:multi] > 0
|
56
|
+
ofh.puts "! Multiple copies: "
|
57
|
+
cnt_dom.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_dom[k]}." if v>1 }
|
58
|
+
end
|
59
|
+
if q[:found] < cnt_dom.size
|
60
|
+
ofh.puts "! Missing genes: "
|
61
|
+
cnt_dom.each{ |k,v| ofh.puts "! #{k}: #{n_dom[k]}." if v==0 }
|
62
|
+
end
|
63
|
+
end
|
@@ -371,8 +371,18 @@
|
|
371
371
|
"source_url": "http://hmmer.janelia.org/software"
|
372
372
|
}
|
373
373
|
],
|
374
|
-
"cite": [
|
375
|
-
"
|
374
|
+
"cite": [
|
375
|
+
["Eddy, 2011, PLoS CB",
|
376
|
+
"http://dx.doi.org/10.1371/journal.pcbi.1002195"],
|
377
|
+
["Dupont et al, 2012, ISME J",
|
378
|
+
"https://doi.org/10.1038/ismej.2011.189"],
|
379
|
+
["Rodriguez-R et al, 2014, ISME J",
|
380
|
+
"https://doi.org/10.1038/ismej.2015.5"],
|
381
|
+
["Lee, 2019, Bioinf",
|
382
|
+
"https://doi.org/10.1093/bioinformatics/btz188"],
|
383
|
+
["Eren et al, 2015, PeerJ",
|
384
|
+
"https://doi.org/10.7717/peerj.1319"]
|
385
|
+
],
|
376
386
|
"options": [
|
377
387
|
{
|
378
388
|
"name": "Input file",
|
@@ -381,6 +391,15 @@
|
|
381
391
|
"mandatory": true,
|
382
392
|
"description": "FastA file containing all the proteins in the genome."
|
383
393
|
},
|
394
|
+
{
|
395
|
+
"opt": "--collection",
|
396
|
+
"arg": "string",
|
397
|
+
"default": "dupont_2012",
|
398
|
+
"description": ["Reference collection of essential proteins to use.",
|
399
|
+
"One of: dupont_2012 (default, Dupont et al 2012 modified by",
|
400
|
+
"Rodriguez-R et al 2015), or lee_2019 (Lee 2019 modified by Eren",
|
401
|
+
"et al 2015)."]
|
402
|
+
},
|
384
403
|
{
|
385
404
|
"name": "Output file",
|
386
405
|
"opt": "--out",
|
@@ -64,15 +64,15 @@
|
|
64
64
|
"task": "HMM.essential.rb",
|
65
65
|
"description": ["Typical single-copy bacterial genes present in",
|
66
66
|
"Mycoplasma genitalium."],
|
67
|
-
"values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,
|
68
|
-
null,null,null,null,null,null,null,null]
|
67
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
|
68
|
+
null,null,null,null,null,null,null,null,null]
|
69
69
|
},
|
70
70
|
{
|
71
71
|
"task": "HMM.essential.rb",
|
72
72
|
"description": ["Typical single-copy archaeal genes present in",
|
73
73
|
"Nanoarchaeum equitans."],
|
74
|
-
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,
|
75
|
-
null,null,null,null,null,null,null,null]
|
74
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,null,true,null,
|
75
|
+
null,null,null,null,null,null,null,null,null]
|
76
76
|
},
|
77
77
|
{
|
78
78
|
"task": "Newick.autoprune.R",
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.N50.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.filterN.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.length.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.split.pl
|
@@ -10,7 +10,8 @@ use 'zlib'
|
|
10
10
|
|
11
11
|
o = {
|
12
12
|
bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
|
13
|
-
archaea: false, genomeeq: false, metagenome: false, list: false
|
13
|
+
archaea: false, genomeeq: false, metagenome: false, list: false,
|
14
|
+
collection: 'dupont_2012'
|
14
15
|
}
|
15
16
|
OptionParser.new do |opts|
|
16
17
|
opts.banner = "
|
@@ -33,7 +34,15 @@ Usage: #{$0} [options]"
|
|
33
34
|
'Path to the FastA file (.gz allowed) with all the proteins in a genome'
|
34
35
|
) { |v| o[:in] = v }
|
35
36
|
opts.separator ''
|
36
|
-
opts.separator '
|
37
|
+
opts.separator 'Options'
|
38
|
+
opts.on(
|
39
|
+
'-c', '--collection STR',
|
40
|
+
'Reference collection of essential proteins to use. One of:',
|
41
|
+
'> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
|
42
|
+
' modified by https://doi.org/10.1038/ismej.2015.5',
|
43
|
+
'> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
|
44
|
+
' modified by https://doi.org/10.7717/peerj.1319'
|
45
|
+
) { |v| o[:collection] = v }
|
37
46
|
opts.on(
|
38
47
|
'-o', '--out FILE',
|
39
48
|
'Path to the output FastA file with the translated essential genes',
|
@@ -117,20 +126,44 @@ abort '-i is mandatory' if o[:in].nil? and not o[:list]
|
|
117
126
|
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
118
127
|
o[:rename] = nil if o[:metagenome]
|
119
128
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
129
|
+
case o[:collection]
|
130
|
+
when 'dupont_2012'
|
131
|
+
not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
|
132
|
+
TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
|
133
|
+
TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
|
134
|
+
TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
|
135
|
+
TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
|
136
|
+
TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
|
137
|
+
TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
|
138
|
+
TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
|
139
|
+
TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
|
140
|
+
TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
|
141
|
+
TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
|
142
|
+
TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
|
143
|
+
TIGR02729 TIGR03263 TIGR03594}
|
144
|
+
not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
|
145
|
+
not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
|
146
|
+
TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d}
|
147
|
+
when 'lee_2019'
|
148
|
+
not_in_archaea = %w{ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
|
149
|
+
EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
|
150
|
+
Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
|
151
|
+
Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
|
152
|
+
Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
|
153
|
+
Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
|
154
|
+
RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC}
|
155
|
+
not_in_bacteria = %w{AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
|
156
|
+
CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
|
157
|
+
DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
|
158
|
+
Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
|
159
|
+
Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
|
160
|
+
Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
|
161
|
+
Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
|
162
|
+
Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I}
|
163
|
+
not_as_genomeeq = not_in_archaea + not_in_bacteria
|
164
|
+
else
|
165
|
+
raise "Unsupported collection: '#{o[:collection]}'"
|
166
|
+
end
|
134
167
|
|
135
168
|
begin
|
136
169
|
Dir.mktmpdir do |dir|
|
@@ -148,7 +181,8 @@ begin
|
|
148
181
|
models = {}
|
149
182
|
model_id = nil
|
150
183
|
dbh = File.open("#{dir}/essential.hmm", 'w')
|
151
|
-
o[:model_file] ||= File.expand_path(
|
184
|
+
o[:model_file] ||= File.expand_path(
|
185
|
+
"../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
|
152
186
|
mfh = (File.extname(o[:model_file]) == '.gz') ?
|
153
187
|
Zlib::GzipReader.open(o[:model_file]) :
|
154
188
|
File.open(o[:model_file], 'r')
|
@@ -201,6 +235,9 @@ begin
|
|
201
235
|
# Report statistics
|
202
236
|
if o[:stats]
|
203
237
|
reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
|
238
|
+
modifiers = [:bacteria, :archaea, :genomeeq]
|
239
|
+
.map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
|
240
|
+
reph.puts "! Collection: #{o[:collection]} #{modifiers}"
|
204
241
|
if o[:metagenome]
|
205
242
|
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
206
243
|
gc = [0] * (models.size - genes.size) +
|
File without changes
|
Binary file
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../enveomics.R
|
@@ -25,25 +25,24 @@
|
|
25
25
|
|
26
26
|
enve.df2dist <- function(
|
27
27
|
x,
|
28
|
-
obj1.index=1,
|
29
|
-
obj2.index=2,
|
30
|
-
dist.index=3,
|
31
|
-
default.d=NA,
|
32
|
-
max.sim=0
|
28
|
+
obj1.index = 1,
|
29
|
+
obj2.index = 2,
|
30
|
+
dist.index = 3,
|
31
|
+
default.d = NA,
|
32
|
+
max.sim = 0
|
33
33
|
){
|
34
|
-
x <- as.data.frame(x)
|
35
|
-
a <- as.character(x[, obj1.index])
|
36
|
-
b <- as.character(x[, obj2.index])
|
37
|
-
d <- as.double(x[, dist.index])
|
38
|
-
if(max.sim!=0) d <- (max.sim - d)/max.sim
|
39
|
-
ids <- unique(c(a,b))
|
40
|
-
m <- matrix(default.d,
|
34
|
+
x <- as.data.frame(x)
|
35
|
+
a <- as.character(x[, obj1.index])
|
36
|
+
b <- as.character(x[, obj2.index])
|
37
|
+
d <- as.double(x[, dist.index])
|
38
|
+
if(max.sim != 0) d <- (max.sim - d) / max.sim
|
39
|
+
ids <- unique(c(a,b))
|
40
|
+
m <- matrix(default.d,
|
41
|
+
nrow = length(ids), ncol = length(ids), dimnames = list(ids, ids))
|
41
42
|
diag(m) <- 0.0
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
m <- pmin(m, t(m), na.rm=TRUE)
|
46
|
-
return(as.dist(m));
|
43
|
+
m[cbind(a,b)] <- d
|
44
|
+
m <- pmin(m, t(m), na.rm = TRUE)
|
45
|
+
return(as.dist(m))
|
47
46
|
}
|
48
47
|
|
49
48
|
#' Enveomics: Data Frame to Dist (Group)
|
@@ -666,15 +666,16 @@ enve.recplot2.findPeaks <- function(
|
|
666
666
|
#' A vector of number of components to evaluate.
|
667
667
|
#' @param criterion
|
668
668
|
#' Criterion to use for components selection. Must be one of:
|
669
|
-
#' \code{aic} (Akaike Information Criterion),
|
670
|
-
#'
|
669
|
+
#' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
|
670
|
+
#' (Bayesian Information Criterion or Schwarz Criterion).
|
671
671
|
#' @param merge.tol
|
672
672
|
#' When attempting to merge peaks with very similar sequencing depth, use
|
673
673
|
#' this number of significant digits (in log-scale).
|
674
674
|
#' @param verbose
|
675
675
|
#' Display (mostly debugging) information.
|
676
676
|
#' @param ...
|
677
|
-
#' Any additional parameters supported by
|
677
|
+
#' Any additional parameters supported by
|
678
|
+
#' \code{\link{enve.recplot2.findPeaks.em}}.
|
678
679
|
#'
|
679
680
|
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
680
681
|
#'
|
@@ -684,10 +685,10 @@ enve.recplot2.findPeaks <- function(
|
|
684
685
|
|
685
686
|
enve.recplot2.findPeaks.emauto <- function(
|
686
687
|
x,
|
687
|
-
components=seq(1,
|
688
|
-
criterion='aic',
|
689
|
-
merge.tol=2L,
|
690
|
-
verbose=FALSE,
|
688
|
+
components = seq(1, 5),
|
689
|
+
criterion = 'aic',
|
690
|
+
merge.tol = 2L,
|
691
|
+
verbose = FALSE,
|
691
692
|
...
|
692
693
|
){
|
693
694
|
best <- list(crit=0, pstore=list())
|
@@ -758,19 +759,19 @@ enve.recplot2.findPeaks.emauto <- function(
|
|
758
759
|
|
759
760
|
enve.recplot2.findPeaks.em <- function(
|
760
761
|
x,
|
761
|
-
max.iter=1000,
|
762
|
-
ll.diff.res=1e-8,
|
763
|
-
components=2,
|
764
|
-
rm.top=0.05,
|
765
|
-
verbose=FALSE,
|
762
|
+
max.iter = 1000,
|
763
|
+
ll.diff.res = 1e-8,
|
764
|
+
components = 2,
|
765
|
+
rm.top = 0.05,
|
766
|
+
verbose = FALSE,
|
766
767
|
init,
|
767
|
-
log=TRUE
|
768
|
+
log = TRUE
|
768
769
|
){
|
769
770
|
|
770
771
|
# Essential vars
|
771
772
|
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
|
772
773
|
lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
|
773
|
-
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
774
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
|
774
775
|
if(log) lsd1 <- log(lsd1)
|
775
776
|
|
776
777
|
# 1. Initialize
|
@@ -779,7 +780,7 @@ enve.recplot2.findPeaks.em <- function(
|
|
779
780
|
init <- list(
|
780
781
|
mu = tapply(lsd1, km.clust, mean),
|
781
782
|
sd = tapply(lsd1, km.clust, sd),
|
782
|
-
alpha = table(km.clust)/length(km.clust)
|
783
|
+
alpha = table(km.clust) / length(km.clust)
|
783
784
|
)
|
784
785
|
}
|
785
786
|
m.step <- init
|
@@ -795,6 +796,7 @@ enve.recplot2.findPeaks.em <- function(
|
|
795
796
|
ll.diff <- abs(cur.ll - e.step[["ll"]])
|
796
797
|
cur.ll <- e.step[["ll"]]
|
797
798
|
if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
|
799
|
+
if(is.na(ll.diff) || ll.diff == Inf) break
|
798
800
|
if(ll.diff <= ll.diff.res) break
|
799
801
|
}
|
800
802
|
|
@@ -1431,6 +1433,9 @@ enve.recplot2.findPeaks.__em_e <- function
|
|
1431
1433
|
theta[['sd']][i])*theta[['alpha']][i]))
|
1432
1434
|
sum.of.components <- rowSums(product)
|
1433
1435
|
posterior <- product / sum.of.components
|
1436
|
+
for(i in which(sum.of.components == Inf)) {
|
1437
|
+
cat(i,'/',nrow(product), ':', product[i,], '\n')
|
1438
|
+
}
|
1434
1439
|
|
1435
1440
|
return(list(ll=sum(log(sum.of.components)), posterior=posterior))
|
1436
1441
|
}
|
@@ -52,6 +52,7 @@ For additional information on recruitment plots, see the
|
|
52
52
|
[Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
|
53
53
|
|
54
54
|
## Changelog
|
55
|
+
* 1.7.1: Improved efficiency of `enve.df2dist` about five-fold.
|
55
56
|
* 1.7.0: Uniformized output for `enve.recplot2.extractWindows` and
|
56
57
|
`enve.recplot2.coordinates` to ease automation. Thanks to Tomeu Viver and
|
57
58
|
Roth Conrad for troubleshooting.
|
@@ -4,7 +4,7 @@
|
|
4
4
|
\alias{enve.recplot2.findPeaks.emauto}
|
5
5
|
\title{Enveomics: Recruitment Plot (2) Emauto Peak Finder}
|
6
6
|
\usage{
|
7
|
-
enve.recplot2.findPeaks.emauto(x, components = seq(1,
|
7
|
+
enve.recplot2.findPeaks.emauto(x, components = seq(1, 5),
|
8
8
|
criterion = "aic", merge.tol = 2L, verbose = FALSE, ...)
|
9
9
|
}
|
10
10
|
\arguments{
|
@@ -13,15 +13,16 @@ enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
|
|
13
13
|
\item{components}{A vector of number of components to evaluate.}
|
14
14
|
|
15
15
|
\item{criterion}{Criterion to use for components selection. Must be one of:
|
16
|
-
\code{aic} (Akaike Information Criterion),
|
17
|
-
|
16
|
+
\code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
|
17
|
+
(Bayesian Information Criterion or Schwarz Criterion).}
|
18
18
|
|
19
19
|
\item{merge.tol}{When attempting to merge peaks with very similar sequencing depth, use
|
20
20
|
this number of significant digits (in log-scale).}
|
21
21
|
|
22
22
|
\item{verbose}{Display (mostly debugging) information.}
|
23
23
|
|
24
|
-
\item{...}{Any additional parameters supported by
|
24
|
+
\item{...}{Any additional parameters supported by
|
25
|
+
\code{\link{enve.recplot2.findPeaks.em}}.}
|
25
26
|
}
|
26
27
|
\value{
|
27
28
|
Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
data/utils/find-medoid.R
CHANGED
@@ -7,7 +7,12 @@
|
|
7
7
|
#= Load stuff
|
8
8
|
argv <- commandArgs(trailingOnly = T)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
|
-
|
10
|
+
if(Sys.getenv('MIGA') == ''){
|
11
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
12
|
+
}else{
|
13
|
+
source(file.path(Sys.getenv('MIGA'),
|
14
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
15
|
+
}
|
11
16
|
|
12
17
|
find_medoids <- function(ani.df, out, clades) {
|
13
18
|
if(nrow(ani.df) == 0) return(NULL)
|
data/utils/mytaxa_scan.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'zlib'
|
4
|
+
|
3
5
|
abort "
|
4
6
|
Usage:
|
5
7
|
#{$0} {FastA file} {MyTaxa file} {Data output}
|
@@ -7,52 +9,53 @@ Usage:
|
|
7
9
|
" if ARGV[2].nil?
|
8
10
|
|
9
11
|
begin
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
12
|
+
# Get arguments
|
13
|
+
faa, mytaxa, outdata = ARGV
|
14
|
+
winsize = 10
|
15
|
+
|
16
|
+
# Extract gene IDs
|
17
|
+
ifh = faa =~ /\.gz/ ? Zlib::GzipReader.open(faa) : File.open(faa, 'r')
|
18
|
+
ids = ifh.each_line.grep(/^>/).map{|dl| dl.chomp.sub(/^>/,'').sub(/\s.*/,'')}
|
19
|
+
ifh.close
|
20
|
+
tax = Hash[ids.map{|k| [k, "NA"]}]
|
21
|
+
|
22
|
+
# Get MyTaxa distributions
|
23
|
+
k, l = nil
|
24
|
+
File.open(mytaxa).each do |ln|
|
25
|
+
ln.chomp!
|
26
|
+
if $.%2 == 1
|
27
|
+
k, l = ln.split /\t/
|
28
|
+
else
|
29
|
+
tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
|
30
|
+
end
|
31
|
+
end
|
32
|
+
all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
|
33
|
+
|
34
|
+
# Estimate Windows and save gene IDs
|
35
|
+
fh = File.open(outdata + ".genes", "w")
|
36
|
+
c = []
|
37
|
+
c << all_tax.map{|t| tax.values.count(t) }
|
38
|
+
n_wins = (ids.size/winsize).ceil
|
39
|
+
(0 .. (n_wins-1)).each do |win|
|
40
|
+
k = ids[win*winsize, winsize]
|
41
|
+
win_t = tax.values_at(*k)
|
42
|
+
fh.puts k.join("\t")
|
43
|
+
c << all_tax.map{|t| win_t.count(t)}
|
44
|
+
end
|
45
|
+
p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
|
46
|
+
fh.close
|
47
|
+
|
48
|
+
# Save window profiles
|
49
|
+
fh = File.open(outdata, "w")
|
50
|
+
fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
|
51
|
+
fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
|
52
|
+
(0 .. (all_tax.size - 1)).each do |row|
|
53
|
+
fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
|
54
|
+
end
|
55
|
+
fh.close
|
52
56
|
rescue => err
|
53
|
-
|
54
|
-
|
55
|
-
|
57
|
+
$stderr.puts "Exception: #{err}\n\n"
|
58
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
59
|
+
err
|
56
60
|
end
|
57
61
|
|
58
|
-
|
data/utils/ref-tree.R
CHANGED
@@ -7,7 +7,12 @@
|
|
7
7
|
#= Load stuff
|
8
8
|
argv <- commandArgs(trailingOnly=T)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
|
-
|
10
|
+
if(Sys.getenv('MIGA') == ''){
|
11
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
12
|
+
}else{
|
13
|
+
source(file.path(Sys.getenv('MIGA'),
|
14
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
15
|
+
}
|
11
16
|
inst <- c("phangorn", "phytools") %in% rownames(installed.packages())
|
12
17
|
if(inst[1]){
|
13
18
|
suppressPackageStartupMessages(library(phangorn))
|
data/utils/subclades-nj.R
CHANGED
@@ -12,7 +12,12 @@ suppressPackageStartupMessages(library(cluster))
|
|
12
12
|
suppressPackageStartupMessages(library(phytools))
|
13
13
|
suppressPackageStartupMessages(library(phangorn))
|
14
14
|
suppressPackageStartupMessages(library(parallel))
|
15
|
-
|
15
|
+
if(Sys.getenv('MIGA') == ''){
|
16
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
17
|
+
}else{
|
18
|
+
source(file.path(Sys.getenv('MIGA'),
|
19
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
20
|
+
}
|
16
21
|
|
17
22
|
#= Main function
|
18
23
|
subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
data/utils/subclades.R
CHANGED
@@ -10,7 +10,12 @@ suppressPackageStartupMessages(library(ape))
|
|
10
10
|
suppressPackageStartupMessages(library(vegan))
|
11
11
|
suppressPackageStartupMessages(library(cluster))
|
12
12
|
suppressPackageStartupMessages(library(parallel))
|
13
|
-
|
13
|
+
if(Sys.getenv('MIGA') == ''){
|
14
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
15
|
+
}else{
|
16
|
+
source(file.path(Sys.getenv('MIGA'),
|
17
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
18
|
+
}
|
14
19
|
|
15
20
|
#= Main function
|
16
21
|
subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0.0
|
4
|
+
version: 0.5.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -197,7 +197,6 @@ files:
|
|
197
197
|
- test/taxonomy_test.rb
|
198
198
|
- test/test_helper.rb
|
199
199
|
- utils/adapters.fa
|
200
|
-
- utils/arch-ess-genes.rb
|
201
200
|
- utils/cleanup-databases.rb
|
202
201
|
- utils/core-pan-plot.R
|
203
202
|
- utils/distance/base.rb
|
@@ -207,6 +206,7 @@ files:
|
|
207
206
|
- utils/distance/runner.rb
|
208
207
|
- utils/distance/temporal.rb
|
209
208
|
- utils/distances.rb
|
209
|
+
- utils/domain-ess-genes.rb
|
210
210
|
- utils/enveomics/Docs/recplot2.md
|
211
211
|
- utils/enveomics/Examples/aai-matrix.bash
|
212
212
|
- utils/enveomics/Examples/ani-matrix.bash
|
@@ -356,7 +356,8 @@ files:
|
|
356
356
|
- utils/enveomics/Scripts/clust.rand.rb
|
357
357
|
- utils/enveomics/Scripts/gi2tax.rb
|
358
358
|
- utils/enveomics/Scripts/in_silico_GA_GI.pl
|
359
|
-
- utils/enveomics/Scripts/lib/data/essential.hmm.gz
|
359
|
+
- utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz
|
360
|
+
- utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz
|
360
361
|
- utils/enveomics/Scripts/lib/enveomics.R
|
361
362
|
- utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
|
362
363
|
- utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
|
@@ -514,8 +515,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
514
515
|
- !ruby/object:Gem::Version
|
515
516
|
version: '0'
|
516
517
|
requirements: []
|
517
|
-
|
518
|
-
rubygems_version: 2.7.6
|
518
|
+
rubygems_version: 3.0.3
|
519
519
|
signing_key:
|
520
520
|
specification_version: 4
|
521
521
|
summary: MiGA
|
data/utils/arch-ess-genes.rb
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
esslog = ARGV.shift
|
4
|
-
outlog = ARGV.shift
|
5
|
-
l_all = `HMM.essential.rb -l -q`.chomp.split("\n").map{ |i| i.gsub(/\t.*/,"") }
|
6
|
-
n_arc = Hash[
|
7
|
-
`HMM.essential.rb -l -q -A`.chomp.split("\n").map{ |i| i.split("\t") }
|
8
|
-
]
|
9
|
-
l_arc = n_arc.keys
|
10
|
-
|
11
|
-
def quality(hsh)
|
12
|
-
q = {}
|
13
|
-
q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
|
14
|
-
q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
|
15
|
-
q[:cmp] = 100.0*q[:found].to_f/hsh.size
|
16
|
-
q[:cnt] = 100.0*q[:multi].to_f/hsh.size
|
17
|
-
q
|
18
|
-
end
|
19
|
-
|
20
|
-
cnt_ref = {}
|
21
|
-
l_all.each{ |i| cnt_ref[i] = 1 }
|
22
|
-
|
23
|
-
at = :header
|
24
|
-
File.open(esslog, "r") do |fh|
|
25
|
-
fh.each_line do |ln|
|
26
|
-
v = ln.chomp.gsub(/^! +/, "")
|
27
|
-
if v=="Multiple copies: "
|
28
|
-
at = :multi
|
29
|
-
elsif v=="Missing genes: "
|
30
|
-
at = :missing
|
31
|
-
elsif at==:multi
|
32
|
-
v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
|
33
|
-
cnt_ref[$2] = $1.to_i
|
34
|
-
elsif at==:missing
|
35
|
-
v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
|
36
|
-
cnt_ref[$1] = 0
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
cnt_arc = {}
|
42
|
-
l_arc.each{ |i| cnt_arc[i] = cnt_ref[i] }
|
43
|
-
|
44
|
-
q = quality(cnt_arc)
|
45
|
-
File.open(outlog, "w") do |ofh|
|
46
|
-
ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_arc.size}."
|
47
|
-
ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
|
48
|
-
ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
|
49
|
-
if q[:multi] > 0
|
50
|
-
ofh.puts "! Multiple copies: "
|
51
|
-
cnt_arc.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_arc[k]}." if v>1 }
|
52
|
-
end
|
53
|
-
if q[:found] < cnt_arc.size
|
54
|
-
ofh.puts "! Missing genes: "
|
55
|
-
cnt_arc.each{ |k,v| ofh.puts "! #{k}: #{n_arc[k]}." if v==0 }
|
56
|
-
end
|
57
|
-
end
|