miga-base 0.5.0.0 → 0.5.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor.rb +6 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/cli/action/quality_wf.rb +1 -0
- data/lib/miga/cli/action/stats.rb +9 -8
- data/lib/miga/cli/action/wf.rb +5 -0
- data/lib/miga/cli/objects_helper.rb +1 -0
- data/lib/miga/common/format.rb +5 -2
- data/lib/miga/daemon.rb +2 -2
- data/lib/miga/project/dataset.rb +8 -7
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +9 -8
- data/scripts/mytaxa.bash +3 -1
- data/scripts/mytaxa_scan.bash +15 -8
- data/utils/domain-ess-genes.rb +63 -0
- data/utils/enveomics/Manifest/Tasks/other.json +21 -2
- data/utils/enveomics/Manifest/examples.json +4 -4
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
- data/utils/enveomics/Scripts/HMM.essential.rb +54 -17
- data/utils/enveomics/Scripts/lib/data/{essential.hmm.gz → dupont_2012_essential.hmm.gz} +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/R/df2dist.R +16 -17
- data/utils/enveomics/enveomics.R/R/recplot2.R +20 -15
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +5 -4
- data/utils/find-medoid.R +6 -1
- data/utils/mytaxa_scan.rb +49 -46
- data/utils/ref-tree.R +6 -1
- data/utils/subclades-nj.R +6 -1
- data/utils/subclades.R +6 -1
- metadata +6 -6
- data/utils/arch-ess-genes.rb +0 -57
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6888c1ce3756b8cc708736c0da052e5a7396277e0c903ebcfc083f17b6915e7
|
4
|
+
data.tar.gz: d998f6e087316a81de4aa8897452344c1987ce0cb9807f4a1e11a29f52dfbcf2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c6f7f8af791664b2bb704744535e0e39c4d5fc06521beb8feb57f658d6187a667100fde4312a3a8e5f47f5dd9d4b3c06326584d95e80262dc0e02a91795e192c
|
7
|
+
data.tar.gz: 4f633972d8ccc1cc06cc14ca6c48b50759d63af72ba75514a0e877a58af4e1d407fda2c2608c2077dd541e30e86add8359bc2e0838d93a19f7cc1bd5c5f5fff2
|
@@ -104,6 +104,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
104
104
|
unless ok
|
105
105
|
cli.say " > Registering again #{d.name}:#{r_k}"
|
106
106
|
d.add_result(r_k, true, force: true)
|
107
|
+
sr = d.result(:stats) and sr.remove!
|
107
108
|
end
|
108
109
|
end
|
109
110
|
end
|
@@ -123,7 +124,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
123
124
|
changed = true
|
124
125
|
end
|
125
126
|
end
|
126
|
-
|
127
|
+
if changed
|
128
|
+
d.add_result(:cds, true, force: true)
|
129
|
+
sr = d.result(:stats) and sr.remove!
|
130
|
+
end
|
127
131
|
end
|
128
132
|
end
|
129
133
|
|
@@ -136,6 +140,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
136
140
|
if dir.nil?
|
137
141
|
cli.say " > Removing #{d.name}:essential_genes"
|
138
142
|
res.remove!
|
143
|
+
sr = d.result(:stats) and sr.remove!
|
139
144
|
next
|
140
145
|
end
|
141
146
|
next if Dir["#{dir}/*.faa"].empty?
|
data/lib/miga/cli/action/init.rb
CHANGED
@@ -220,7 +220,7 @@ BASH
|
|
220
220
|
|
221
221
|
def check_r_packages(paths)
|
222
222
|
cli.puts 'Looking for R packages:'
|
223
|
-
%w(
|
223
|
+
%w(ape cluster vegan).each do |pkg|
|
224
224
|
cli.print "Testing #{pkg}... "
|
225
225
|
if test_r_package(cli, paths, pkg)
|
226
226
|
cli.puts 'yes.'
|
@@ -25,6 +25,7 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
|
|
25
25
|
%w[project_stats haai_distances aai_distances ani_distances clade_finding]
|
26
26
|
.map { |i| ["run_#{i}", false] }
|
27
27
|
]
|
28
|
+
p_metadata[:ess_coll] = cli[:ess_coll]
|
28
29
|
d_metadata = { run_distances: false }
|
29
30
|
d_metadata[:run_mytaxa_scan] = false unless cli[:mytaxa]
|
30
31
|
p = create_project(:assembly, p_metadata, d_metadata)
|
@@ -122,17 +122,18 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
|
|
122
122
|
end
|
123
123
|
end
|
124
124
|
else
|
125
|
-
# Fix estimate
|
126
|
-
if !d.metadata[:tax].nil? &&
|
127
|
-
|
128
|
-
r.file_path(:
|
129
|
-
scr = "#{MiGA.root_path}/utils/
|
125
|
+
# Fix estimate by domain
|
126
|
+
if !(tax = d.metadata[:tax]).nil? &&
|
127
|
+
%w[Archaea Bacteria].include?(tax[:d]) &&
|
128
|
+
r.file_path(:raw_report).nil?
|
129
|
+
scr = "#{MiGA.root_path}/utils/domain-ess-genes.rb"
|
130
130
|
rep = r.file_path(:report)
|
131
131
|
rc_p = File.expand_path('.miga_rc', ENV['HOME'])
|
132
132
|
rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
|
133
|
-
$stderr.print `#{rc} ruby '#{scr}'
|
134
|
-
|
135
|
-
r.add_file(:
|
133
|
+
$stderr.print `#{rc} ruby '#{scr}' \
|
134
|
+
'#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
|
135
|
+
r.add_file(:raw_report, "#{d.name}.ess/log")
|
136
|
+
r.add_file(:report, "#{d.name}.ess/log.domain")
|
136
137
|
end
|
137
138
|
# Extract/compute quality values
|
138
139
|
stats = {completeness: [0.0, '%'], contamination: [0.0, '%']}
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -24,6 +24,11 @@ module MiGA::Cli::Action::Wf
|
|
24
24
|
opt.separator " FILES...: #{files_desc}"
|
25
25
|
opt.separator ''
|
26
26
|
opt.separator 'Workflow Control Options'
|
27
|
+
opt.on(
|
28
|
+
'-C', '--collection STRING',
|
29
|
+
'Collection of essential genes to use as reference',
|
30
|
+
'One of: dupont_2012 (default), lee_2019'
|
31
|
+
) { |v| cli[:ess_coll] = v }
|
27
32
|
if params[:ncbi]
|
28
33
|
opt.on(
|
29
34
|
'-T', '--ncbi-taxon STRING',
|
data/lib/miga/common/format.rb
CHANGED
@@ -25,10 +25,13 @@ module MiGA::Common::Format
|
|
25
25
|
# Cleans a FastA file in place.
|
26
26
|
def clean_fasta_file(file)
|
27
27
|
tmp_fh = nil
|
28
|
+
tmp_path = nil
|
28
29
|
begin
|
29
30
|
if file =~ /\.gz/
|
30
31
|
tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
|
31
|
-
|
32
|
+
File.unlink tmp_path
|
33
|
+
tmp_path += '.gz'
|
34
|
+
tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
|
32
35
|
fh = Zlib::GzipReader.open(file)
|
33
36
|
else
|
34
37
|
tmp_fh = Tempfile.new('MiGA')
|
@@ -50,7 +53,7 @@ module MiGA::Common::Format
|
|
50
53
|
tmp_fh.print buffer.wrap_width(80)
|
51
54
|
tmp_fh.close
|
52
55
|
fh.close
|
53
|
-
FileUtils.
|
56
|
+
FileUtils.mv(tmp_path, file)
|
54
57
|
ensure
|
55
58
|
begin
|
56
59
|
tmp_fh.close unless tmp_fh.nil?
|
data/lib/miga/daemon.rb
CHANGED
@@ -285,10 +285,10 @@ class MiGA::Daemon < MiGA::MiGA
|
|
285
285
|
if [nil, '', 0].include? job[:pid]
|
286
286
|
job[:pid] = nil
|
287
287
|
@jobs_to_run << job
|
288
|
-
say "Unsuccessful #{job[:task_name]}, rescheduling
|
288
|
+
say "Unsuccessful #{job[:task_name]}, rescheduling"
|
289
289
|
else
|
290
290
|
@jobs_running << job
|
291
|
-
say "Spawned pid:#{job[:pid]} for #{job[:task_name]}
|
291
|
+
say "Spawned pid:#{job[:pid]} for #{job[:task_name]}"
|
292
292
|
end
|
293
293
|
end
|
294
294
|
end
|
data/lib/miga/project/dataset.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
##
|
5
5
|
# Helper module including specific functions handle datasets.
|
6
6
|
module MiGA::Project::Dataset
|
7
|
-
|
7
|
+
|
8
8
|
##
|
9
9
|
# Returns Array of MiGA::Dataset.
|
10
10
|
def datasets
|
@@ -23,7 +23,7 @@ module MiGA::Project::Dataset
|
|
23
23
|
def dataset_names_hash
|
24
24
|
@dataset_names_hash ||= Hash[dataset_names.map{ |i| [i,true] }]
|
25
25
|
end
|
26
|
-
|
26
|
+
|
27
27
|
##
|
28
28
|
# Returns MiGA::Dataset.
|
29
29
|
def dataset(name)
|
@@ -47,18 +47,19 @@ module MiGA::Project::Dataset
|
|
47
47
|
end
|
48
48
|
end
|
49
49
|
end
|
50
|
-
|
50
|
+
|
51
51
|
##
|
52
52
|
# Add dataset identified by +name+ and return MiGA::Dataset.
|
53
53
|
def add_dataset(name)
|
54
54
|
unless metadata[:datasets].include? name
|
55
55
|
MiGA::Dataset.new(self, name)
|
56
56
|
@metadata[:datasets] << name
|
57
|
+
@dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
|
57
58
|
save
|
58
59
|
end
|
59
60
|
dataset(name)
|
60
61
|
end
|
61
|
-
|
62
|
+
|
62
63
|
##
|
63
64
|
# Unlink dataset identified by +name+ and return MiGA::Dataset.
|
64
65
|
def unlink_dataset(name)
|
@@ -68,7 +69,7 @@ module MiGA::Project::Dataset
|
|
68
69
|
save
|
69
70
|
d
|
70
71
|
end
|
71
|
-
|
72
|
+
|
72
73
|
##
|
73
74
|
# Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any method
|
74
75
|
# supported by File#generic_transfer.
|
@@ -116,7 +117,7 @@ module MiGA::Project::Dataset
|
|
116
117
|
end
|
117
118
|
datasets.uniq - metadata[:datasets]
|
118
119
|
end
|
119
|
-
|
120
|
+
|
120
121
|
##
|
121
122
|
# Are all the datasets in the project preprocessed? Save intermediate results
|
122
123
|
# if +save+ (until the first incomplete dataset is reached).
|
@@ -149,6 +150,6 @@ module MiGA::Project::Dataset
|
|
149
150
|
def each_dataset_profile_advance(&blk)
|
150
151
|
each_dataset { |ds| blk.call(ds.profile_advance) }
|
151
152
|
end
|
152
|
-
|
153
|
+
|
153
154
|
end
|
154
155
|
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.5,
|
13
|
+
VERSION = [0.5, 1, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -18,7 +18,7 @@ module MiGA
|
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
21
|
-
VERSION_DATE = Date.new(
|
21
|
+
VERSION_DATE = Date.new(2020, 1, 6)
|
22
22
|
|
23
23
|
##
|
24
24
|
# Reference of MiGA.
|
@@ -22,18 +22,19 @@ fi
|
|
22
22
|
# Find and extract essential genes
|
23
23
|
[[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
|
24
24
|
mkdir "${DATASET}.ess"
|
25
|
-
TYPE=$(miga
|
25
|
+
TYPE=$(miga ls -P "$PROJECT" -D "$DATASET" \
|
26
26
|
--metadata "type" | awk '{print $2}')
|
27
|
+
COLL=$(miga about -P "$PROJECT" -m ess_coll)
|
28
|
+
[[ "$COLL" == "?" ]] && COLL=dupont_2012
|
29
|
+
CMD="HMM.essential.rb \
|
30
|
+
-i '$FAA' -o '${DATASET}.ess.faa' -m '${DATASET}.ess/' \
|
31
|
+
-t '$CORES' -r '$DATASET' --collection '$COLL'"
|
27
32
|
if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
|
28
|
-
|
29
|
-
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
30
|
-
> "${DATASET}.ess/log"
|
33
|
+
CMD="$CMD --metagenome"
|
31
34
|
else
|
32
|
-
|
33
|
-
-m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
|
34
|
-
--alignments "${DATASET}.ess/proteins.aln" \
|
35
|
-
> "${DATASET}.ess/log"
|
35
|
+
CMD="$CMD --alignments '${DATASET}.ess/proteins.aln'"
|
36
36
|
fi
|
37
|
+
$CMD > "${DATASET}.ess/log"
|
37
38
|
|
38
39
|
# Reduce files
|
39
40
|
if exists "$DATASET".ess/*.faa ; then
|
data/scripts/mytaxa.bash
CHANGED
@@ -38,7 +38,9 @@ else
|
|
38
38
|
fi
|
39
39
|
|
40
40
|
# Execute search
|
41
|
-
|
41
|
+
FAA="../../../06.cds/$DATASET.faa"
|
42
|
+
[[ -s "$FAA" ]] || FAA="${FAA}.gz"
|
43
|
+
diamond blastp -q "$FAA" -d "$MT/AllGenomes.faa" \
|
42
44
|
-a "$DATASET.daa" -k 5 -p "$CORES" --min-score 60
|
43
45
|
diamond view -a "$DATASET.daa" -o "$DATASET.blast"
|
44
46
|
|
data/scripts/mytaxa_scan.bash
CHANGED
@@ -39,12 +39,13 @@ else
|
|
39
39
|
exit 1
|
40
40
|
fi
|
41
41
|
|
42
|
+
FAA="../../../06.cds/$DATASET.faa"
|
43
|
+
[[ -s "$FAA" ]] || FAA="${FAA}.gz"
|
42
44
|
if [[ ! -s "$DATASET.mytaxa" ]] ; then
|
43
45
|
# Execute search
|
44
46
|
if [[ ! -s "$DATASET.blast" ]] ; then
|
45
|
-
diamond blastp -q "
|
46
|
-
-d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60
|
47
|
-
-a "$DATASET.daa" -t "$TMPDIR"
|
47
|
+
diamond blastp -q "$FAA" -a "$DATASET.daa" -t "$TMPDIR" \
|
48
|
+
-d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60
|
48
49
|
diamond view -a "$DATASET.daa" -o "$DATASET.blast" -t "$TMPDIR"
|
49
50
|
fi
|
50
51
|
|
@@ -53,8 +54,7 @@ else
|
|
53
54
|
| sort -k 13 > "$DATASET.mytaxain"
|
54
55
|
"$MT/MyTaxa" "$DATASET.mytaxain" "$DATASET.mytaxa" "0.5"
|
55
56
|
fi
|
56
|
-
ruby "$MIGA/utils/mytaxa_scan.rb" "
|
57
|
-
"$DATASET.mytaxa" "$DATASET.wintax"
|
57
|
+
ruby "$MIGA/utils/mytaxa_scan.rb" "$FAA" "$DATASET.mytaxa" "$DATASET.wintax"
|
58
58
|
echo "
|
59
59
|
source('$MIGA/utils/mytaxa_scan.R');
|
60
60
|
pdf('$DATASET.pdf', 12, 7);
|
@@ -70,11 +70,18 @@ else
|
|
70
70
|
let i=$i+1
|
71
71
|
awk "NR==$win" "$DATASET.wintax.genes" | tr "\\t" "\\n" \
|
72
72
|
> "$DATASET.reg/$i.ids"
|
73
|
-
|
74
|
-
|
73
|
+
if [[ "$FAA" == *.gz ]] ; then
|
74
|
+
gzip -c -d "$FAA" \
|
75
|
+
| FastA.filter.pl -q "$DATASET.reg/$i.ids" /dev/stdin \
|
76
|
+
> "$DATASET.reg/$i.faa"
|
77
|
+
else
|
78
|
+
FastA.filter.pl -q "$DATASET.reg/$i.ids" "$FAA" \
|
79
|
+
> "$DATASET.reg/$i.faa"
|
80
|
+
fi
|
75
81
|
done
|
76
82
|
# Archive regions
|
77
|
-
tar
|
83
|
+
tar -cf "$DATASET.reg.tar" "$DATASET.reg"
|
84
|
+
gzip -9 "$DATASET.reg.tar"
|
78
85
|
rm -r "$DATASET.reg"
|
79
86
|
fi
|
80
87
|
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
esslog = ARGV.shift
|
4
|
+
outlog = ARGV.shift
|
5
|
+
domain = ARGV.shift
|
6
|
+
|
7
|
+
def quality(hsh)
|
8
|
+
q = {}
|
9
|
+
q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
|
10
|
+
q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
|
11
|
+
q[:cmp] = 100.0*q[:found].to_f/hsh.size
|
12
|
+
q[:cnt] = 100.0*q[:multi].to_f/hsh.size
|
13
|
+
q
|
14
|
+
end
|
15
|
+
|
16
|
+
# Find collection and detected anomalies
|
17
|
+
cnt_ref = {}
|
18
|
+
at = :header
|
19
|
+
collection = 'dupont_2012'
|
20
|
+
File.open(esslog, 'r') do |fh|
|
21
|
+
fh.each_line do |ln|
|
22
|
+
v = ln.chomp.gsub(/^! +/, '')
|
23
|
+
if v == 'Multiple copies: '
|
24
|
+
at = :multi
|
25
|
+
elsif v == 'Missing genes: '
|
26
|
+
at = :missing
|
27
|
+
elsif v =~ /Collection: (\S+)/
|
28
|
+
collection = $1
|
29
|
+
elsif at == :multi
|
30
|
+
v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
|
31
|
+
cnt_ref[$2] = $1.to_i
|
32
|
+
elsif at == :missing
|
33
|
+
v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
|
34
|
+
cnt_ref[$1] = 0
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# Find expected genes for domain
|
40
|
+
n_dom = Hash[
|
41
|
+
`HMM.essential.rb -L -q '-#{domain}' -c '#{collection}'`
|
42
|
+
.chomp.split("\n").map { |i| i.split("\t") }
|
43
|
+
]
|
44
|
+
l_dom = n_dom.keys
|
45
|
+
cnt_dom = {}
|
46
|
+
l_dom.each { |i| cnt_dom[i] = cnt_ref[i] || 1 }
|
47
|
+
|
48
|
+
# Correct report
|
49
|
+
q = quality(cnt_dom)
|
50
|
+
File.open(outlog, 'w') do |ofh|
|
51
|
+
ofh.puts "! Collection: #{collection} #{domain}"
|
52
|
+
ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_dom.size}."
|
53
|
+
ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
|
54
|
+
ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
|
55
|
+
if q[:multi] > 0
|
56
|
+
ofh.puts "! Multiple copies: "
|
57
|
+
cnt_dom.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_dom[k]}." if v>1 }
|
58
|
+
end
|
59
|
+
if q[:found] < cnt_dom.size
|
60
|
+
ofh.puts "! Missing genes: "
|
61
|
+
cnt_dom.each{ |k,v| ofh.puts "! #{k}: #{n_dom[k]}." if v==0 }
|
62
|
+
end
|
63
|
+
end
|
@@ -371,8 +371,18 @@
|
|
371
371
|
"source_url": "http://hmmer.janelia.org/software"
|
372
372
|
}
|
373
373
|
],
|
374
|
-
"cite": [
|
375
|
-
"
|
374
|
+
"cite": [
|
375
|
+
["Eddy, 2011, PLoS CB",
|
376
|
+
"http://dx.doi.org/10.1371/journal.pcbi.1002195"],
|
377
|
+
["Dupont et al, 2012, ISME J",
|
378
|
+
"https://doi.org/10.1038/ismej.2011.189"],
|
379
|
+
["Rodriguez-R et al, 2014, ISME J",
|
380
|
+
"https://doi.org/10.1038/ismej.2015.5"],
|
381
|
+
["Lee, 2019, Bioinf",
|
382
|
+
"https://doi.org/10.1093/bioinformatics/btz188"],
|
383
|
+
["Eren et al, 2015, PeerJ",
|
384
|
+
"https://doi.org/10.7717/peerj.1319"]
|
385
|
+
],
|
376
386
|
"options": [
|
377
387
|
{
|
378
388
|
"name": "Input file",
|
@@ -381,6 +391,15 @@
|
|
381
391
|
"mandatory": true,
|
382
392
|
"description": "FastA file containing all the proteins in the genome."
|
383
393
|
},
|
394
|
+
{
|
395
|
+
"opt": "--collection",
|
396
|
+
"arg": "string",
|
397
|
+
"default": "dupont_2012",
|
398
|
+
"description": ["Reference collection of essential proteins to use.",
|
399
|
+
"One of: dupont_2012 (default, Dupont et al 2012 modified by",
|
400
|
+
"Rodriguez-R et al 2015), or lee_2019 (Lee 2019 modified by Eren",
|
401
|
+
"et al 2015)."]
|
402
|
+
},
|
384
403
|
{
|
385
404
|
"name": "Output file",
|
386
405
|
"opt": "--out",
|
@@ -64,15 +64,15 @@
|
|
64
64
|
"task": "HMM.essential.rb",
|
65
65
|
"description": ["Typical single-copy bacterial genes present in",
|
66
66
|
"Mycoplasma genitalium."],
|
67
|
-
"values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,
|
68
|
-
null,null,null,null,null,null,null,null]
|
67
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
|
68
|
+
null,null,null,null,null,null,null,null,null]
|
69
69
|
},
|
70
70
|
{
|
71
71
|
"task": "HMM.essential.rb",
|
72
72
|
"description": ["Typical single-copy archaeal genes present in",
|
73
73
|
"Nanoarchaeum equitans."],
|
74
|
-
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,
|
75
|
-
null,null,null,null,null,null,null,null]
|
74
|
+
"values": ["Mgen_M2288.faa",null,null,null,null,null,null,null,true,null,
|
75
|
+
null,null,null,null,null,null,null,null,null]
|
76
76
|
},
|
77
77
|
{
|
78
78
|
"task": "Newick.autoprune.R",
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.N50.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.filterN.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.length.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.split.pl
|
@@ -10,7 +10,8 @@ use 'zlib'
|
|
10
10
|
|
11
11
|
o = {
|
12
12
|
bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
|
13
|
-
archaea: false, genomeeq: false, metagenome: false, list: false
|
13
|
+
archaea: false, genomeeq: false, metagenome: false, list: false,
|
14
|
+
collection: 'dupont_2012'
|
14
15
|
}
|
15
16
|
OptionParser.new do |opts|
|
16
17
|
opts.banner = "
|
@@ -33,7 +34,15 @@ Usage: #{$0} [options]"
|
|
33
34
|
'Path to the FastA file (.gz allowed) with all the proteins in a genome'
|
34
35
|
) { |v| o[:in] = v }
|
35
36
|
opts.separator ''
|
36
|
-
opts.separator '
|
37
|
+
opts.separator 'Options'
|
38
|
+
opts.on(
|
39
|
+
'-c', '--collection STR',
|
40
|
+
'Reference collection of essential proteins to use. One of:',
|
41
|
+
'> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
|
42
|
+
' modified by https://doi.org/10.1038/ismej.2015.5',
|
43
|
+
'> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
|
44
|
+
' modified by https://doi.org/10.7717/peerj.1319'
|
45
|
+
) { |v| o[:collection] = v }
|
37
46
|
opts.on(
|
38
47
|
'-o', '--out FILE',
|
39
48
|
'Path to the output FastA file with the translated essential genes',
|
@@ -117,20 +126,44 @@ abort '-i is mandatory' if o[:in].nil? and not o[:list]
|
|
117
126
|
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
118
127
|
o[:rename] = nil if o[:metagenome]
|
119
128
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
129
|
+
case o[:collection]
|
130
|
+
when 'dupont_2012'
|
131
|
+
not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
|
132
|
+
TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
|
133
|
+
TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
|
134
|
+
TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
|
135
|
+
TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
|
136
|
+
TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
|
137
|
+
TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
|
138
|
+
TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
|
139
|
+
TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
|
140
|
+
TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
|
141
|
+
TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
|
142
|
+
TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
|
143
|
+
TIGR02729 TIGR03263 TIGR03594}
|
144
|
+
not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
|
145
|
+
not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
|
146
|
+
TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d}
|
147
|
+
when 'lee_2019'
|
148
|
+
not_in_archaea = %w{ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
|
149
|
+
EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
|
150
|
+
Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
|
151
|
+
Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
|
152
|
+
Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
|
153
|
+
Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
|
154
|
+
RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC}
|
155
|
+
not_in_bacteria = %w{AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
|
156
|
+
CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
|
157
|
+
DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
|
158
|
+
Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
|
159
|
+
Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
|
160
|
+
Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
|
161
|
+
Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
|
162
|
+
Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I}
|
163
|
+
not_as_genomeeq = not_in_archaea + not_in_bacteria
|
164
|
+
else
|
165
|
+
raise "Unsupported collection: '#{o[:collection]}'"
|
166
|
+
end
|
134
167
|
|
135
168
|
begin
|
136
169
|
Dir.mktmpdir do |dir|
|
@@ -148,7 +181,8 @@ begin
|
|
148
181
|
models = {}
|
149
182
|
model_id = nil
|
150
183
|
dbh = File.open("#{dir}/essential.hmm", 'w')
|
151
|
-
o[:model_file] ||= File.expand_path(
|
184
|
+
o[:model_file] ||= File.expand_path(
|
185
|
+
"../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
|
152
186
|
mfh = (File.extname(o[:model_file]) == '.gz') ?
|
153
187
|
Zlib::GzipReader.open(o[:model_file]) :
|
154
188
|
File.open(o[:model_file], 'r')
|
@@ -201,6 +235,9 @@ begin
|
|
201
235
|
# Report statistics
|
202
236
|
if o[:stats]
|
203
237
|
reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
|
238
|
+
modifiers = [:bacteria, :archaea, :genomeeq]
|
239
|
+
.map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
|
240
|
+
reph.puts "! Collection: #{o[:collection]} #{modifiers}"
|
204
241
|
if o[:metagenome]
|
205
242
|
reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
|
206
243
|
gc = [0] * (models.size - genes.size) +
|
File without changes
|
Binary file
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../enveomics.R
|
@@ -25,25 +25,24 @@
|
|
25
25
|
|
26
26
|
enve.df2dist <- function(
|
27
27
|
x,
|
28
|
-
obj1.index=1,
|
29
|
-
obj2.index=2,
|
30
|
-
dist.index=3,
|
31
|
-
default.d=NA,
|
32
|
-
max.sim=0
|
28
|
+
obj1.index = 1,
|
29
|
+
obj2.index = 2,
|
30
|
+
dist.index = 3,
|
31
|
+
default.d = NA,
|
32
|
+
max.sim = 0
|
33
33
|
){
|
34
|
-
x <- as.data.frame(x)
|
35
|
-
a <- as.character(x[, obj1.index])
|
36
|
-
b <- as.character(x[, obj2.index])
|
37
|
-
d <- as.double(x[, dist.index])
|
38
|
-
if(max.sim!=0) d <- (max.sim - d)/max.sim
|
39
|
-
ids <- unique(c(a,b))
|
40
|
-
m <- matrix(default.d,
|
34
|
+
x <- as.data.frame(x)
|
35
|
+
a <- as.character(x[, obj1.index])
|
36
|
+
b <- as.character(x[, obj2.index])
|
37
|
+
d <- as.double(x[, dist.index])
|
38
|
+
if(max.sim != 0) d <- (max.sim - d) / max.sim
|
39
|
+
ids <- unique(c(a,b))
|
40
|
+
m <- matrix(default.d,
|
41
|
+
nrow = length(ids), ncol = length(ids), dimnames = list(ids, ids))
|
41
42
|
diag(m) <- 0.0
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
m <- pmin(m, t(m), na.rm=TRUE)
|
46
|
-
return(as.dist(m));
|
43
|
+
m[cbind(a,b)] <- d
|
44
|
+
m <- pmin(m, t(m), na.rm = TRUE)
|
45
|
+
return(as.dist(m))
|
47
46
|
}
|
48
47
|
|
49
48
|
#' Enveomics: Data Frame to Dist (Group)
|
@@ -666,15 +666,16 @@ enve.recplot2.findPeaks <- function(
|
|
666
666
|
#' A vector of number of components to evaluate.
|
667
667
|
#' @param criterion
|
668
668
|
#' Criterion to use for components selection. Must be one of:
|
669
|
-
#' \code{aic} (Akaike Information Criterion),
|
670
|
-
#'
|
669
|
+
#' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
|
670
|
+
#' (Bayesian Information Criterion or Schwarz Criterion).
|
671
671
|
#' @param merge.tol
|
672
672
|
#' When attempting to merge peaks with very similar sequencing depth, use
|
673
673
|
#' this number of significant digits (in log-scale).
|
674
674
|
#' @param verbose
|
675
675
|
#' Display (mostly debugging) information.
|
676
676
|
#' @param ...
|
677
|
-
#' Any additional parameters supported by
|
677
|
+
#' Any additional parameters supported by
|
678
|
+
#' \code{\link{enve.recplot2.findPeaks.em}}.
|
678
679
|
#'
|
679
680
|
#' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
680
681
|
#'
|
@@ -684,10 +685,10 @@ enve.recplot2.findPeaks <- function(
|
|
684
685
|
|
685
686
|
enve.recplot2.findPeaks.emauto <- function(
|
686
687
|
x,
|
687
|
-
components=seq(1,
|
688
|
-
criterion='aic',
|
689
|
-
merge.tol=2L,
|
690
|
-
verbose=FALSE,
|
688
|
+
components = seq(1, 5),
|
689
|
+
criterion = 'aic',
|
690
|
+
merge.tol = 2L,
|
691
|
+
verbose = FALSE,
|
691
692
|
...
|
692
693
|
){
|
693
694
|
best <- list(crit=0, pstore=list())
|
@@ -758,19 +759,19 @@ enve.recplot2.findPeaks.emauto <- function(
|
|
758
759
|
|
759
760
|
enve.recplot2.findPeaks.em <- function(
|
760
761
|
x,
|
761
|
-
max.iter=1000,
|
762
|
-
ll.diff.res=1e-8,
|
763
|
-
components=2,
|
764
|
-
rm.top=0.05,
|
765
|
-
verbose=FALSE,
|
762
|
+
max.iter = 1000,
|
763
|
+
ll.diff.res = 1e-8,
|
764
|
+
components = 2,
|
765
|
+
rm.top = 0.05,
|
766
|
+
verbose = FALSE,
|
766
767
|
init,
|
767
|
-
log=TRUE
|
768
|
+
log = TRUE
|
768
769
|
){
|
769
770
|
|
770
771
|
# Essential vars
|
771
772
|
pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
|
772
773
|
lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
|
773
|
-
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
|
774
|
+
lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
|
774
775
|
if(log) lsd1 <- log(lsd1)
|
775
776
|
|
776
777
|
# 1. Initialize
|
@@ -779,7 +780,7 @@ enve.recplot2.findPeaks.em <- function(
|
|
779
780
|
init <- list(
|
780
781
|
mu = tapply(lsd1, km.clust, mean),
|
781
782
|
sd = tapply(lsd1, km.clust, sd),
|
782
|
-
alpha = table(km.clust)/length(km.clust)
|
783
|
+
alpha = table(km.clust) / length(km.clust)
|
783
784
|
)
|
784
785
|
}
|
785
786
|
m.step <- init
|
@@ -795,6 +796,7 @@ enve.recplot2.findPeaks.em <- function(
|
|
795
796
|
ll.diff <- abs(cur.ll - e.step[["ll"]])
|
796
797
|
cur.ll <- e.step[["ll"]]
|
797
798
|
if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
|
799
|
+
if(is.na(ll.diff) || ll.diff == Inf) break
|
798
800
|
if(ll.diff <= ll.diff.res) break
|
799
801
|
}
|
800
802
|
|
@@ -1431,6 +1433,9 @@ enve.recplot2.findPeaks.__em_e <- function
|
|
1431
1433
|
theta[['sd']][i])*theta[['alpha']][i]))
|
1432
1434
|
sum.of.components <- rowSums(product)
|
1433
1435
|
posterior <- product / sum.of.components
|
1436
|
+
for(i in which(sum.of.components == Inf)) {
|
1437
|
+
cat(i,'/',nrow(product), ':', product[i,], '\n')
|
1438
|
+
}
|
1434
1439
|
|
1435
1440
|
return(list(ll=sum(log(sum.of.components)), posterior=posterior))
|
1436
1441
|
}
|
@@ -52,6 +52,7 @@ For additional information on recruitment plots, see the
|
|
52
52
|
[Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
|
53
53
|
|
54
54
|
## Changelog
|
55
|
+
* 1.7.1: Improved efficiency of `enve.df2dist` about five-fold.
|
55
56
|
* 1.7.0: Uniformized output for `enve.recplot2.extractWindows` and
|
56
57
|
`enve.recplot2.coordinates` to ease automation. Thanks to Tomeu Viver and
|
57
58
|
Roth Conrad for troubleshooting.
|
@@ -4,7 +4,7 @@
|
|
4
4
|
\alias{enve.recplot2.findPeaks.emauto}
|
5
5
|
\title{Enveomics: Recruitment Plot (2) Emauto Peak Finder}
|
6
6
|
\usage{
|
7
|
-
enve.recplot2.findPeaks.emauto(x, components = seq(1,
|
7
|
+
enve.recplot2.findPeaks.emauto(x, components = seq(1, 5),
|
8
8
|
criterion = "aic", merge.tol = 2L, verbose = FALSE, ...)
|
9
9
|
}
|
10
10
|
\arguments{
|
@@ -13,15 +13,16 @@ enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
|
|
13
13
|
\item{components}{A vector of number of components to evaluate.}
|
14
14
|
|
15
15
|
\item{criterion}{Criterion to use for components selection. Must be one of:
|
16
|
-
\code{aic} (Akaike Information Criterion),
|
17
|
-
|
16
|
+
\code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
|
17
|
+
(Bayesian Information Criterion or Schwarz Criterion).}
|
18
18
|
|
19
19
|
\item{merge.tol}{When attempting to merge peaks with very similar sequencing depth, use
|
20
20
|
this number of significant digits (in log-scale).}
|
21
21
|
|
22
22
|
\item{verbose}{Display (mostly debugging) information.}
|
23
23
|
|
24
|
-
\item{...}{Any additional parameters supported by
|
24
|
+
\item{...}{Any additional parameters supported by
|
25
|
+
\code{\link{enve.recplot2.findPeaks.em}}.}
|
25
26
|
}
|
26
27
|
\value{
|
27
28
|
Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
|
data/utils/find-medoid.R
CHANGED
@@ -7,7 +7,12 @@
|
|
7
7
|
#= Load stuff
|
8
8
|
argv <- commandArgs(trailingOnly = T)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
|
-
|
10
|
+
if(Sys.getenv('MIGA') == ''){
|
11
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
12
|
+
}else{
|
13
|
+
source(file.path(Sys.getenv('MIGA'),
|
14
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
15
|
+
}
|
11
16
|
|
12
17
|
find_medoids <- function(ani.df, out, clades) {
|
13
18
|
if(nrow(ani.df) == 0) return(NULL)
|
data/utils/mytaxa_scan.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'zlib'
|
4
|
+
|
3
5
|
abort "
|
4
6
|
Usage:
|
5
7
|
#{$0} {FastA file} {MyTaxa file} {Data output}
|
@@ -7,52 +9,53 @@ Usage:
|
|
7
9
|
" if ARGV[2].nil?
|
8
10
|
|
9
11
|
begin
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
12
|
+
# Get arguments
|
13
|
+
faa, mytaxa, outdata = ARGV
|
14
|
+
winsize = 10
|
15
|
+
|
16
|
+
# Extract gene IDs
|
17
|
+
ifh = faa =~ /\.gz/ ? Zlib::GzipReader.open(faa) : File.open(faa, 'r')
|
18
|
+
ids = ifh.each_line.grep(/^>/).map{|dl| dl.chomp.sub(/^>/,'').sub(/\s.*/,'')}
|
19
|
+
ifh.close
|
20
|
+
tax = Hash[ids.map{|k| [k, "NA"]}]
|
21
|
+
|
22
|
+
# Get MyTaxa distributions
|
23
|
+
k, l = nil
|
24
|
+
File.open(mytaxa).each do |ln|
|
25
|
+
ln.chomp!
|
26
|
+
if $.%2 == 1
|
27
|
+
k, l = ln.split /\t/
|
28
|
+
else
|
29
|
+
tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
|
30
|
+
end
|
31
|
+
end
|
32
|
+
all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
|
33
|
+
|
34
|
+
# Estimate Windows and save gene IDs
|
35
|
+
fh = File.open(outdata + ".genes", "w")
|
36
|
+
c = []
|
37
|
+
c << all_tax.map{|t| tax.values.count(t) }
|
38
|
+
n_wins = (ids.size/winsize).ceil
|
39
|
+
(0 .. (n_wins-1)).each do |win|
|
40
|
+
k = ids[win*winsize, winsize]
|
41
|
+
win_t = tax.values_at(*k)
|
42
|
+
fh.puts k.join("\t")
|
43
|
+
c << all_tax.map{|t| win_t.count(t)}
|
44
|
+
end
|
45
|
+
p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
|
46
|
+
fh.close
|
47
|
+
|
48
|
+
# Save window profiles
|
49
|
+
fh = File.open(outdata, "w")
|
50
|
+
fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
|
51
|
+
fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
|
52
|
+
(0 .. (all_tax.size - 1)).each do |row|
|
53
|
+
fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
|
54
|
+
end
|
55
|
+
fh.close
|
52
56
|
rescue => err
|
53
|
-
|
54
|
-
|
55
|
-
|
57
|
+
$stderr.puts "Exception: #{err}\n\n"
|
58
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
59
|
+
err
|
56
60
|
end
|
57
61
|
|
58
|
-
|
data/utils/ref-tree.R
CHANGED
@@ -7,7 +7,12 @@
|
|
7
7
|
#= Load stuff
|
8
8
|
argv <- commandArgs(trailingOnly=T)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
|
-
|
10
|
+
if(Sys.getenv('MIGA') == ''){
|
11
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
12
|
+
}else{
|
13
|
+
source(file.path(Sys.getenv('MIGA'),
|
14
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
15
|
+
}
|
11
16
|
inst <- c("phangorn", "phytools") %in% rownames(installed.packages())
|
12
17
|
if(inst[1]){
|
13
18
|
suppressPackageStartupMessages(library(phangorn))
|
data/utils/subclades-nj.R
CHANGED
@@ -12,7 +12,12 @@ suppressPackageStartupMessages(library(cluster))
|
|
12
12
|
suppressPackageStartupMessages(library(phytools))
|
13
13
|
suppressPackageStartupMessages(library(phangorn))
|
14
14
|
suppressPackageStartupMessages(library(parallel))
|
15
|
-
|
15
|
+
if(Sys.getenv('MIGA') == ''){
|
16
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
17
|
+
}else{
|
18
|
+
source(file.path(Sys.getenv('MIGA'),
|
19
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
20
|
+
}
|
16
21
|
|
17
22
|
#= Main function
|
18
23
|
subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
data/utils/subclades.R
CHANGED
@@ -10,7 +10,12 @@ suppressPackageStartupMessages(library(ape))
|
|
10
10
|
suppressPackageStartupMessages(library(vegan))
|
11
11
|
suppressPackageStartupMessages(library(cluster))
|
12
12
|
suppressPackageStartupMessages(library(parallel))
|
13
|
-
|
13
|
+
if(Sys.getenv('MIGA') == ''){
|
14
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
15
|
+
}else{
|
16
|
+
source(file.path(Sys.getenv('MIGA'),
|
17
|
+
'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
|
18
|
+
}
|
14
19
|
|
15
20
|
#= Main function
|
16
21
|
subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0.0
|
4
|
+
version: 0.5.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -197,7 +197,6 @@ files:
|
|
197
197
|
- test/taxonomy_test.rb
|
198
198
|
- test/test_helper.rb
|
199
199
|
- utils/adapters.fa
|
200
|
-
- utils/arch-ess-genes.rb
|
201
200
|
- utils/cleanup-databases.rb
|
202
201
|
- utils/core-pan-plot.R
|
203
202
|
- utils/distance/base.rb
|
@@ -207,6 +206,7 @@ files:
|
|
207
206
|
- utils/distance/runner.rb
|
208
207
|
- utils/distance/temporal.rb
|
209
208
|
- utils/distances.rb
|
209
|
+
- utils/domain-ess-genes.rb
|
210
210
|
- utils/enveomics/Docs/recplot2.md
|
211
211
|
- utils/enveomics/Examples/aai-matrix.bash
|
212
212
|
- utils/enveomics/Examples/ani-matrix.bash
|
@@ -356,7 +356,8 @@ files:
|
|
356
356
|
- utils/enveomics/Scripts/clust.rand.rb
|
357
357
|
- utils/enveomics/Scripts/gi2tax.rb
|
358
358
|
- utils/enveomics/Scripts/in_silico_GA_GI.pl
|
359
|
-
- utils/enveomics/Scripts/lib/data/essential.hmm.gz
|
359
|
+
- utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz
|
360
|
+
- utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz
|
360
361
|
- utils/enveomics/Scripts/lib/enveomics.R
|
361
362
|
- utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
|
362
363
|
- utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
|
@@ -514,8 +515,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
514
515
|
- !ruby/object:Gem::Version
|
515
516
|
version: '0'
|
516
517
|
requirements: []
|
517
|
-
|
518
|
-
rubygems_version: 2.7.6
|
518
|
+
rubygems_version: 3.0.3
|
519
519
|
signing_key:
|
520
520
|
specification_version: 4
|
521
521
|
summary: MiGA
|
data/utils/arch-ess-genes.rb
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
esslog = ARGV.shift
|
4
|
-
outlog = ARGV.shift
|
5
|
-
l_all = `HMM.essential.rb -l -q`.chomp.split("\n").map{ |i| i.gsub(/\t.*/,"") }
|
6
|
-
n_arc = Hash[
|
7
|
-
`HMM.essential.rb -l -q -A`.chomp.split("\n").map{ |i| i.split("\t") }
|
8
|
-
]
|
9
|
-
l_arc = n_arc.keys
|
10
|
-
|
11
|
-
def quality(hsh)
|
12
|
-
q = {}
|
13
|
-
q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
|
14
|
-
q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
|
15
|
-
q[:cmp] = 100.0*q[:found].to_f/hsh.size
|
16
|
-
q[:cnt] = 100.0*q[:multi].to_f/hsh.size
|
17
|
-
q
|
18
|
-
end
|
19
|
-
|
20
|
-
cnt_ref = {}
|
21
|
-
l_all.each{ |i| cnt_ref[i] = 1 }
|
22
|
-
|
23
|
-
at = :header
|
24
|
-
File.open(esslog, "r") do |fh|
|
25
|
-
fh.each_line do |ln|
|
26
|
-
v = ln.chomp.gsub(/^! +/, "")
|
27
|
-
if v=="Multiple copies: "
|
28
|
-
at = :multi
|
29
|
-
elsif v=="Missing genes: "
|
30
|
-
at = :missing
|
31
|
-
elsif at==:multi
|
32
|
-
v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
|
33
|
-
cnt_ref[$2] = $1.to_i
|
34
|
-
elsif at==:missing
|
35
|
-
v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
|
36
|
-
cnt_ref[$1] = 0
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
cnt_arc = {}
|
42
|
-
l_arc.each{ |i| cnt_arc[i] = cnt_ref[i] }
|
43
|
-
|
44
|
-
q = quality(cnt_arc)
|
45
|
-
File.open(outlog, "w") do |ofh|
|
46
|
-
ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_arc.size}."
|
47
|
-
ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
|
48
|
-
ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
|
49
|
-
if q[:multi] > 0
|
50
|
-
ofh.puts "! Multiple copies: "
|
51
|
-
cnt_arc.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_arc[k]}." if v>1 }
|
52
|
-
end
|
53
|
-
if q[:found] < cnt_arc.size
|
54
|
-
ofh.puts "! Missing genes: "
|
55
|
-
cnt_arc.each{ |k,v| ofh.puts "! #{k}: #{n_arc[k]}." if v==0 }
|
56
|
-
end
|
57
|
-
end
|