miga-base 0.7.24.0 → 0.7.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/add.rb +9 -6
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/index_wf.rb +4 -2
- data/lib/miga/cli/action/init.rb +60 -59
- data/lib/miga/cli/action/init/files_helper.rb +2 -1
- data/lib/miga/cli/action/preproc_wf.rb +7 -5
- data/lib/miga/cli/action/wf.rb +39 -23
- data/lib/miga/cli/base.rb +16 -5
- data/lib/miga/common/with_option.rb +1 -1
- data/lib/miga/dataset/result.rb +2 -1
- data/lib/miga/project/base.rb +1 -1
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +17 -1
- data/scripts/miga.bash +8 -2
- data/test/lair_test.rb +1 -2
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI/FastAAI +1336 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
- data/utils/distance/base.rb +9 -0
- data/utils/distance/commands.rb +183 -81
- data/utils/distance/database.rb +68 -9
- data/utils/distance/pipeline.rb +14 -18
- data/utils/distance/runner.rb +16 -30
- data/utils/distance/temporal.rb +4 -2
- data/utils/distances.rb +2 -2
- data/utils/requirements.txt +1 -1
- metadata +8 -2
data/utils/distance/base.rb
CHANGED
@@ -2,4 +2,13 @@ require 'miga'
|
|
2
2
|
require 'miga/tax_dist'
|
3
3
|
|
4
4
|
class MiGA::DistanceRunner
|
5
|
+
require_relative 'temporal.rb'
|
6
|
+
require_relative 'database.rb'
|
7
|
+
require_relative 'commands.rb'
|
8
|
+
require_relative 'pipeline.rb'
|
9
|
+
|
10
|
+
include MiGA::DistanceRunner::Temporal
|
11
|
+
include MiGA::DistanceRunner::Database
|
12
|
+
include MiGA::DistanceRunner::Commands
|
13
|
+
include MiGA::DistanceRunner::Pipeline
|
5
14
|
end
|
data/utils/distance/commands.rb
CHANGED
@@ -1,105 +1,207 @@
|
|
1
1
|
module MiGA::DistanceRunner::Commands
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
y = haai(target)
|
14
|
-
return y unless y.nil? || y.zero?
|
2
|
+
##
|
3
|
+
# Estimates AAI against +targets+ using hAAI
|
4
|
+
def haai(targets)
|
5
|
+
puts "[#{Time.now}] hAAI: #{dataset.name} vs #{targets.size} targets"
|
6
|
+
empty_vals = targets.map { |_i| nil }
|
7
|
+
return empty_vals if opts[:haai_p] == 'no'
|
8
|
+
|
9
|
+
# Launch comparisons
|
10
|
+
sbj = pending_targets(targets, :haai)
|
11
|
+
unless sbj.empty?
|
12
|
+
opts[:haai_p] == 'fastaai' ? fastaai_cmd(sbj) : haai_cmd(sbj)
|
15
13
|
end
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dataset.name, target.name, tmp_dbs[:aai]
|
20
|
-
).tap { checkpoint :aai }
|
14
|
+
|
15
|
+
# Return AAI estimates from the database
|
16
|
+
batch_values_from_db(:aai, targets.map { |i| i&.name })
|
21
17
|
end
|
22
18
|
|
23
19
|
##
|
24
|
-
# Estimates AAI against +
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
20
|
+
# Estimates or calculates AAI against +targets+
|
21
|
+
def aai(targets)
|
22
|
+
puts "[#{Time.now}] AAI: #{dataset.name} vs #{targets.size} targets"
|
23
|
+
|
24
|
+
# Try hAAI first
|
25
|
+
haai(targets)
|
26
|
+
|
27
|
+
# Launch comparisons
|
28
|
+
pending_targets(targets, :aai).each do |target|
|
29
|
+
# Full AAI
|
30
|
+
target_cds = target.result(:cds).file_path(:proteins) or next
|
31
|
+
aairb_cmd(
|
32
|
+
tmp_file('proteins.fa'), target_cds,
|
33
|
+
dataset.name, target.name, tmp_dbs[:aai], checkpoint: :aai
|
34
|
+
)
|
39
35
|
end
|
40
|
-
|
41
|
-
|
36
|
+
|
37
|
+
# Return AAI from the database
|
38
|
+
batch_values_from_db(:aai, targets.map { |i| i&.name })
|
42
39
|
end
|
43
40
|
|
44
41
|
##
|
45
|
-
# Calculates ANI against +
|
46
|
-
def ani(
|
47
|
-
#
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
dataset.name, target.name, tmp_dbs[:ani]
|
60
|
-
).tap { checkpoint :ani }
|
42
|
+
# Calculates ANI against +targets+
|
43
|
+
def ani(targets)
|
44
|
+
puts "[#{Time.now}] ANI: #{dataset.name} vs #{targets.size} targets"
|
45
|
+
empty_vals = targets.map { |_i| nil }
|
46
|
+
return empty_vals unless File.size?(tmp_file('largecontigs.fa'))
|
47
|
+
|
48
|
+
# Launch comparisons
|
49
|
+
sbj = pending_targets(targets, :ani)
|
50
|
+
unless sbj.empty?
|
51
|
+
opts[:ani_p] == 'fastani' ? fastani_cmd(sbj) : anirb_cmd(sbj)
|
52
|
+
end
|
53
|
+
|
54
|
+
# Return ANI from the database
|
55
|
+
batch_values_from_db(:ani, targets.map { |i| i&.name })
|
61
56
|
end
|
62
57
|
|
63
58
|
##
|
64
|
-
# Calculates and returns ANI against +
|
65
|
-
#
|
66
|
-
|
67
|
-
|
68
|
-
|
59
|
+
# Calculates and returns ANI against +targets+ if AAI >= +aai_limit+.
|
60
|
+
# Note that ANI values may be returned for lower (or failing) AAIs if the
|
61
|
+
# value is already stored in the database
|
62
|
+
def ani_after_aai(targets, aai_limit = 85.0)
|
63
|
+
# Run AAI and select targets with AAI ≥ aai_limit
|
64
|
+
aai = aai(targets)
|
65
|
+
sbj = aai.each_with_index.map { |i, k| targets[k] if i&.> aai_limit }
|
66
|
+
sbj.compact!
|
67
|
+
|
68
|
+
# Run ANI
|
69
|
+
ani(sbj) unless sbj.empty?
|
70
|
+
|
71
|
+
# Return ANI from the database
|
72
|
+
batch_values_from_db(:ani, targets.map { |i| i&.name })
|
69
73
|
end
|
70
74
|
|
71
75
|
##
|
72
76
|
# Execute an AAI command
|
73
|
-
def
|
77
|
+
def aairb_cmd(f1, f2, n1, n2, db, o = {})
|
74
78
|
o = opts.merge(o)
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
79
|
+
run_cmd <<~CMD
|
80
|
+
aai.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" \
|
81
|
+
--name1 "#{n1}" --name2 "#{n2}" \
|
82
|
+
-t "#{o[:thr]}" -a --#{'no-' unless o[:aai_save_rbm]}save-rbm \
|
83
|
+
-p "#{o[:aai_p]}"
|
84
|
+
CMD
|
85
|
+
ensure
|
86
|
+
checkpoint(o[:checkpoint]) if o[:checkpoint]
|
80
87
|
end
|
81
88
|
|
82
89
|
##
|
83
|
-
# Execute an
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
90
|
+
# Execute an ani.rb command
|
91
|
+
def anirb_cmd(targets)
|
92
|
+
f1 = tmp_file('largecontigs.fa')
|
93
|
+
return unless File.size?(f1)
|
94
|
+
|
95
|
+
targets.each do |target|
|
96
|
+
target_asm = target&.result(:assembly)&.file_path(:largecontigs) or next
|
97
|
+
run_cmd <<~CMD
|
98
|
+
ani.rb -1 "#{f1}" -2 "#{target_asm}" -S "#{tmp_dbs[:ani]}" \
|
99
|
+
--name1 "#{dataset.name}" --name2 "#{target.name}" \
|
100
|
+
-t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm \
|
101
|
+
-p "#{opts[:ani_p]}"
|
102
|
+
CMD
|
103
|
+
checkpoint(:ani)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
##
|
108
|
+
# Execute a FastANI command
|
109
|
+
def fastani_cmd(targets)
|
110
|
+
f1 = tmp_file('largecontigs.fa')
|
111
|
+
return unless File.size?(f1)
|
112
|
+
|
113
|
+
# Run FastANI
|
114
|
+
File.open(f2 = tmp_file, 'w') do |fh|
|
115
|
+
targets.each do |target|
|
116
|
+
target_asm = target&.result(:assembly)&.file_path(:largecontigs)
|
117
|
+
fh.puts target_asm if target_asm
|
118
|
+
end
|
119
|
+
end
|
120
|
+
run_cmd <<~CMD
|
121
|
+
fastANI -q "#{f1}" --rl "#{f2}" -t #{opts[:thr]} \
|
122
|
+
-o "#{f3 = tmp_file}"
|
123
|
+
CMD
|
124
|
+
|
125
|
+
# Retrieve resulting data and save to DB
|
126
|
+
data = {}
|
127
|
+
File.open(f3, 'r') do |fh|
|
128
|
+
fh.each do |ln|
|
129
|
+
row = ln.chomp.split("\t")
|
130
|
+
n2 = File.basename(row[1], '.gz')
|
131
|
+
n2 = File.basename(n2, '.LargeContigs.fna')
|
132
|
+
data[n2] = [row[2].to_f, 0.0, row[3].to_i, row[4].to_i]
|
133
|
+
end
|
134
|
+
end
|
135
|
+
batch_data_to_db(:ani, data)
|
136
|
+
|
137
|
+
# Cleanup
|
138
|
+
[f2, f3].each { |i| File.unlink(i) }
|
139
|
+
end
|
140
|
+
|
141
|
+
##
|
142
|
+
# Execute a FastAAI command
|
143
|
+
def fastaai_cmd(targets)
|
144
|
+
qry_idx = dataset.result(:essential_genes).file_path(:fastaai_index)
|
145
|
+
return nil unless qry_idx
|
146
|
+
|
147
|
+
# Run FastAAI
|
148
|
+
File.open(f1 = tmp_file, 'w') { |fh| fh.puts qry_idx }
|
149
|
+
File.open(f2 = tmp_file, 'w') do |fh|
|
150
|
+
targets.each do |target|
|
151
|
+
target_idx = target&.result(:essential_genes)&.file_path(:fastaai_index)
|
152
|
+
fh.puts target_idx if target_idx
|
153
|
+
end
|
154
|
+
end
|
155
|
+
run_cmd <<~CMD
|
156
|
+
FastAAI --qd "#{f1}" --rd "#{f2}" --output "#{f3 = tmp_file}" \
|
157
|
+
--threads #{opts[:thr]}
|
158
|
+
CMD
|
159
|
+
|
160
|
+
# Save values in the databases
|
161
|
+
haai_data = {}
|
162
|
+
aai_data = {}
|
163
|
+
File.open(f3, 'r') do |fh|
|
164
|
+
fh.each do |ln|
|
165
|
+
out = ln.chomp.split("\t")
|
166
|
+
haai_data[out[1]] = [
|
167
|
+
out[2].to_f * 100, out[3].to_f * 100, out[4].to_i, out[5].to_i
|
168
|
+
]
|
169
|
+
aai_data[out[1]] = [out[6].to_f, 0, 0, 0] if out[6] !~ /^>/
|
95
170
|
end
|
96
|
-
v = out[2]
|
97
|
-
else
|
98
|
-
v = `ani.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" \
|
99
|
-
--name1 "#{n1}" --name2 "#{n2}" \
|
100
|
-
-t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm \
|
101
|
-
--lookup-first -p "#{o[:ani_p] || 'blast+'}"`.chomp
|
102
171
|
end
|
103
|
-
|
172
|
+
batch_data_to_db(:haai, haai_data)
|
173
|
+
batch_data_to_db(:aai, aai_data)
|
174
|
+
|
175
|
+
# Cleanup
|
176
|
+
[f1, f2, f3].each { |i| File.unlink(i) }
|
177
|
+
end
|
178
|
+
|
179
|
+
##
|
180
|
+
# Execute an hAAI command
|
181
|
+
def haai_cmd(targets)
|
182
|
+
aai_data = {}
|
183
|
+
targets.each do |target|
|
184
|
+
target_ess = target&.result(:essential_genes)&.file_path(:ess_genes)
|
185
|
+
next unless target_ess
|
186
|
+
|
187
|
+
# hAAI
|
188
|
+
h = aairb_cmd(
|
189
|
+
tmp_file('ess_genes.fa'), target_ess,
|
190
|
+
dataset.name, target.name, tmp_dbs[:haai],
|
191
|
+
aai_save_rbm: false, aai_p: opts[:haai_p], checkpoint: :haai
|
192
|
+
)&.chomp&.to_f
|
193
|
+
next if h.nil? || h.zero? || h > 90.0
|
194
|
+
|
195
|
+
# Estimated AAI
|
196
|
+
aai_data[target.name] = [
|
197
|
+
100.0 - Math.exp(2.435076 + 0.4275193 * Math.log(100.0 - h)), 0, 0, 0
|
198
|
+
] unless h&.zero? || h > 90.0
|
199
|
+
end
|
200
|
+
batch_data_to_db(:aai, aai_data)
|
201
|
+
end
|
202
|
+
|
203
|
+
def run_cmd(cmd)
|
204
|
+
puts "CMD: #{cmd}"
|
205
|
+
`#{cmd}`
|
104
206
|
end
|
105
207
|
end
|
data/utils/distance/database.rb
CHANGED
@@ -22,12 +22,16 @@ module MiGA::DistanceRunner::Database
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
# Initialize if it doesn't exist
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
25
|
+
unless File.size? dbs[m]
|
26
|
+
SQLite3::Database.new(dbs[m]) do |conn|
|
27
|
+
conn.execute <<~SQL
|
28
|
+
create table if not exists #{t}(
|
29
|
+
seq1 varchar(256), seq2 varchar(256),
|
30
|
+
#{t} float, sd float, n int, omega int
|
31
|
+
)
|
32
|
+
SQL
|
33
|
+
end
|
34
|
+
end
|
31
35
|
# Copy over to (local) temporals
|
32
36
|
@tmp_dbs[m] = tmp_file("#{m}.db")
|
33
37
|
FileUtils.cp(dbs[m], tmp_dbs[m])
|
@@ -92,27 +96,72 @@ module MiGA::DistanceRunner::Database
|
|
92
96
|
# possible number of matches
|
93
97
|
def data_from_db(n1, n2, db, metric)
|
94
98
|
y = nil
|
99
|
+
table = metric == :haai ? :aai : metric
|
95
100
|
SQLite3::Database.new(db) do |conn|
|
96
101
|
y = conn.execute(
|
97
|
-
"select #{
|
102
|
+
"select #{table}, sd, n, omega from #{table} where seq1=? and seq2=?",
|
98
103
|
[n1, n2]
|
99
104
|
).first
|
100
|
-
end if File.size?
|
105
|
+
end if File.size?(db)
|
101
106
|
y
|
102
107
|
end
|
103
108
|
|
104
109
|
##
|
105
110
|
# Save +data+ of +metric+ between +n1+ and +n2+ in the +db+ database.
|
106
111
|
def data_to_db(n1, n2, db, metric, data)
|
112
|
+
table = metric == :haai ? :aai : metric
|
107
113
|
SQLite3::Database.new(db) do |conn|
|
108
114
|
conn.execute(
|
109
|
-
"insert into #{
|
115
|
+
"insert into #{table} (seq1, seq2, #{table}, sd, n, omega) " +
|
110
116
|
"values (?, ?, ?, ?, ?, ?)", [n1, n2] + data
|
111
117
|
)
|
112
118
|
end
|
113
119
|
checkpoint metric
|
114
120
|
end
|
115
121
|
|
122
|
+
##
|
123
|
+
# Saves +data+ of +metric+ in batch to the temporary database,
|
124
|
+
# and assumes query is +#dataset+. +data+ must be a hash with target names
|
125
|
+
# as key and arrays as values with: [val, sd, n, omega]
|
126
|
+
def batch_data_to_db(metric, data)
|
127
|
+
db = tmp_dbs[metric]
|
128
|
+
table = metric == :haai ? :aai : metric
|
129
|
+
`cp #{db} ~/here.db`
|
130
|
+
SQLite3::Database.new(db) do |conn|
|
131
|
+
data.each do |k, v|
|
132
|
+
sql = <<~SQL
|
133
|
+
insert into #{table} (
|
134
|
+
seq1, seq2, #{table}, sd, n, omega
|
135
|
+
) values (?, ?, ?, ?, ?, ?)
|
136
|
+
SQL
|
137
|
+
conn.execute(sql, [dataset.name, k] + v)
|
138
|
+
end
|
139
|
+
end
|
140
|
+
checkpoint(metric)
|
141
|
+
end
|
142
|
+
|
143
|
+
##
|
144
|
+
# Retrieves data of +metric+ in batch from the temporary database,
|
145
|
+
# and assumes query is +#dataset+. The output data is a hash with the same
|
146
|
+
# structure described for +#batch_data_to_db+
|
147
|
+
def batch_data_from_db(metric)
|
148
|
+
db = tmp_dbs[metric]
|
149
|
+
table = metric == :haai ? :aai : metric
|
150
|
+
data = {}
|
151
|
+
SQLite3::Database.new(db) do |conn|
|
152
|
+
sql = "select seq2, #{table}, sd, n, omega from #{table}"
|
153
|
+
conn.execute(sql).each { |row| data[row.shift] = row }
|
154
|
+
end
|
155
|
+
data
|
156
|
+
end
|
157
|
+
|
158
|
+
##
|
159
|
+
# Retrieve only +metric+ values against +names+
|
160
|
+
def batch_values_from_db(metric, names)
|
161
|
+
data = batch_data_from_db(metric)
|
162
|
+
names.map { |i| data[i]&.first }
|
163
|
+
end
|
164
|
+
|
116
165
|
##
|
117
166
|
# Iterates for each entry in +db+
|
118
167
|
def foreach_in_db(db, metric, &blk)
|
@@ -120,4 +169,14 @@ module MiGA::DistanceRunner::Database
|
|
120
169
|
conn.execute("select * from #{metric}").each { |r| blk[r] }
|
121
170
|
end
|
122
171
|
end
|
172
|
+
|
173
|
+
##
|
174
|
+
# Select only those targets that are not yet stored in either direction
|
175
|
+
def pending_targets(targets, metric)
|
176
|
+
saved = batch_data_from_db(metric).keys
|
177
|
+
targets
|
178
|
+
.compact
|
179
|
+
.select { |i| !saved.include?(i.name) }
|
180
|
+
.select { |i| !stored_value(i, metric)&.> 0.0 }
|
181
|
+
end
|
123
182
|
end
|
data/utils/distance/pipeline.rb
CHANGED
@@ -11,19 +11,14 @@ module MiGA::DistanceRunner::Pipeline
|
|
11
11
|
val_med = ''
|
12
12
|
val_cls = nil
|
13
13
|
i_n = 0
|
14
|
-
File.
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
val_cls = i_n
|
23
|
-
puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
14
|
+
sbj_datasets = File.foreach(med).map { |i| ref_project.dataset(i.chomp) }
|
15
|
+
values = send(metric, sbj_datasets)
|
16
|
+
max_idx = values.map(&:to_f).each_with_index.max[1]
|
17
|
+
max_val = values[max_idx]
|
18
|
+
val_med = sbj_dataset[max_idx].name
|
19
|
+
val_cls = max_idx + 1
|
20
|
+
puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
|
21
|
+
|
27
22
|
classif = "#{classif}/miga-project.sc-#{val_cls}"
|
28
23
|
result_fh.puts [val_cls, val_med, max_val, classif].join("\t")
|
29
24
|
classify(clades, classif, metric, result_fh, val_cls)
|
@@ -32,9 +27,8 @@ module MiGA::DistanceRunner::Pipeline
|
|
32
27
|
# Run distances against datasets listed in metadata's +:dist_req+
|
33
28
|
def distances_by_request(metric)
|
34
29
|
$stderr.puts 'Running distances by request'
|
35
|
-
dataset.option(:dist_req).
|
36
|
-
|
37
|
-
end
|
30
|
+
sbj_datasets = dataset.option(:dist_req).map { |i| ref_project.dataset(i) }
|
31
|
+
send(metric, sbj_datasets)
|
38
32
|
end
|
39
33
|
|
40
34
|
# Builds a tree with all visited medoids from any classification level
|
@@ -74,8 +68,10 @@ module MiGA::DistanceRunner::Pipeline
|
|
74
68
|
$stderr.puts "Testing taxonomy | opts = #{opts}"
|
75
69
|
# Get taxonomy of closest relative
|
76
70
|
from_ref_project = (project != ref_project)
|
77
|
-
res_dir =
|
78
|
-
|
71
|
+
res_dir =
|
72
|
+
from_ref_project ?
|
73
|
+
File.expand_path('data/09.distances/05.taxonomy', project.path) :
|
74
|
+
home
|
79
75
|
Dir.mkdir res_dir unless Dir.exist? res_dir
|
80
76
|
File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
|
81
77
|
fh.puts Time.now.to_s
|