miga-base 1.2.14.2 → 1.2.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +16 -40
- data/lib/miga/cli/action/doctor/databases.rb +39 -0
- data/lib/miga/cli/action/doctor/distances.rb +144 -0
- data/lib/miga/cli/action/doctor/operations.rb +159 -0
- data/lib/miga/cli/action/doctor.rb +7 -287
- data/lib/miga/cli/action/download/base.rb +64 -2
- data/lib/miga/cli/action/gtdb_get.rb +2 -31
- data/lib/miga/cli/action/ncbi_get.rb +6 -31
- data/lib/miga/cli/opt_helper.rb +1 -1
- data/lib/miga/common/errors.rb +10 -0
- data/lib/miga/dataset/base.rb +34 -5
- data/lib/miga/dataset/result/add.rb +286 -0
- data/lib/miga/dataset/result/ignore.rb +93 -0
- data/lib/miga/dataset/result.rb +31 -342
- data/lib/miga/remote_dataset/download.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/test/remote_dataset_test.rb +6 -0
- metadata +7 -2
@@ -1,10 +1,15 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require 'miga/cli/action/doctor/base'
|
5
|
-
|
6
4
|
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
5
|
+
require 'miga/cli/action/doctor/base'
|
6
|
+
require 'miga/cli/action/doctor/databases'
|
7
|
+
require 'miga/cli/action/doctor/distances'
|
8
|
+
require 'miga/cli/action/doctor/operations'
|
7
9
|
include MiGA::Cli::Action::Doctor::Base
|
10
|
+
include MiGA::Cli::Action::Doctor::Databases
|
11
|
+
include MiGA::Cli::Action::Doctor::Distances
|
12
|
+
include MiGA::Cli::Action::Doctor::Operations
|
8
13
|
|
9
14
|
def parse_cli
|
10
15
|
cli.defaults = { threads: 1 }
|
@@ -59,289 +64,4 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
59
64
|
@@OPERATIONS
|
60
65
|
end
|
61
66
|
end
|
62
|
-
|
63
|
-
##
|
64
|
-
# Perform status operation with MiGA::Cli +cli+
|
65
|
-
def check_status(cli)
|
66
|
-
cli.say 'Updating metadata status'
|
67
|
-
p = cli.load_project
|
68
|
-
n = p.dataset_names.size
|
69
|
-
(0 .. cli[:threads] - 1).map do |i|
|
70
|
-
Process.fork do
|
71
|
-
k = 0
|
72
|
-
cli.load_project.each_dataset do |d|
|
73
|
-
k += 1
|
74
|
-
cli.advance('Datasets:', k, n, false) if i == 0
|
75
|
-
d.recalculate_status if k % cli[:threads] == i
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
Process.waitall
|
80
|
-
cli.say
|
81
|
-
end
|
82
|
-
|
83
|
-
##
|
84
|
-
# Perform databases operation with MiGA::Cli +cli+
|
85
|
-
def check_db(cli)
|
86
|
-
cli.say 'Checking integrity of databases'
|
87
|
-
p = cli.load_project
|
88
|
-
n = p.dataset_names.size
|
89
|
-
(0 .. cli[:threads] - 1).map do |i|
|
90
|
-
Process.fork do
|
91
|
-
k = 0
|
92
|
-
p.each_dataset do |d|
|
93
|
-
k += 1
|
94
|
-
cli.advance('Datasets:', k, n, false) if i == 0
|
95
|
-
next unless k % cli[:threads] == i
|
96
|
-
each_database_file(d) do |db_file, metric, result, _rank|
|
97
|
-
check_sqlite3_database(db_file, metric) do
|
98
|
-
cli.say(
|
99
|
-
" > Removing malformed database from #{d.name}:#{result} "
|
100
|
-
)
|
101
|
-
File.unlink(db_file)
|
102
|
-
r = d.result(result) or next
|
103
|
-
[r.path(:done), r.path].each do |f|
|
104
|
-
File.unlink(f) if File.exist?(f)
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
111
|
-
Process.waitall
|
112
|
-
cli.say
|
113
|
-
end
|
114
|
-
|
115
|
-
##
|
116
|
-
# Perform bidirectional operation with MiGA::Cli +cli+
|
117
|
-
def check_bidir(cli)
|
118
|
-
cli.say 'Checking if reference distances are bidirectional'
|
119
|
-
project = cli.load_project
|
120
|
-
ref_ds = project.each_dataset.select(&:ref?)
|
121
|
-
ref_names = ref_ds.map(&:name)
|
122
|
-
n = ref_ds.size
|
123
|
-
|
124
|
-
# Read data first (threaded)
|
125
|
-
tmp = File.join(project.path, 'doctor-bidirectional.tmp')
|
126
|
-
FileUtils.mkdir_p(tmp)
|
127
|
-
MiGA::Parallel.process(cli[:threads]) do |thr|
|
128
|
-
file = File.join(tmp, "#{thr}.json")
|
129
|
-
fh = File.open(file, 'w')
|
130
|
-
[:aai, :ani].each do |metric|
|
131
|
-
fh.puts "# #{metric}"
|
132
|
-
ref_ds.each_with_index do |ds, idx|
|
133
|
-
if idx % cli[:threads] == thr
|
134
|
-
cli.advance('Reading:', idx + 1, n, false) if thr == 0
|
135
|
-
row = read_bidirectional(ds, metric)
|
136
|
-
fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
fh.puts '# end'
|
141
|
-
fh.flush # necessary for large threaded runs
|
142
|
-
fh.close
|
143
|
-
if thr == 0
|
144
|
-
cli.advance('Reading:', n, n, false)
|
145
|
-
cli.say
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
# Merge pieces per thread
|
150
|
-
dist = { aai: {}, ani: {} }
|
151
|
-
cli[:threads].times do |i|
|
152
|
-
cli.advance('Merging:', i + 1, cli[:threads], false)
|
153
|
-
file = File.join(tmp, "#{i}.json")
|
154
|
-
File.open(file, 'r') do |fh|
|
155
|
-
metric = nil
|
156
|
-
fh.each do |ln|
|
157
|
-
qry, row = ln.chomp.split(' ', 2)
|
158
|
-
if qry == '#'
|
159
|
-
metric = row.to_sym
|
160
|
-
else
|
161
|
-
raise "Unrecognized metric: #{metric}" unless dist[metric]
|
162
|
-
JSON.parse(row).each do |sbj, val|
|
163
|
-
dist[metric][qry] ||= {}
|
164
|
-
if dist[metric][sbj]&.include?(qry)
|
165
|
-
dist[metric][sbj].delete(qry) # Already bidirectional
|
166
|
-
else
|
167
|
-
dist[metric][qry][sbj] = val
|
168
|
-
end
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
172
|
-
raise "Incomplete thread dump: #{file}" unless metric == :end
|
173
|
-
end
|
174
|
-
end
|
175
|
-
cli.say
|
176
|
-
FileUtils.rm_rf(tmp)
|
177
|
-
|
178
|
-
# Write missing values (threaded)
|
179
|
-
MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
|
180
|
-
cli.advance('Datasets:', idx + 1, n, false) if thr == 0
|
181
|
-
save_bidirectional(ds, dist)
|
182
|
-
end
|
183
|
-
cli.say
|
184
|
-
end
|
185
|
-
|
186
|
-
##
|
187
|
-
# Perform distances operation with MiGA::Cli +cli+
|
188
|
-
def check_dist(cli)
|
189
|
-
p = cli.load_project
|
190
|
-
%i[ani aai].each do |dist|
|
191
|
-
res = p.result("#{dist}_distances")
|
192
|
-
next if res.nil?
|
193
|
-
|
194
|
-
cli.say "Checking #{dist} table for consistent datasets"
|
195
|
-
notok, fix = check_dist_eval(cli, p, res)
|
196
|
-
check_dist_fix(cli, p, fix)
|
197
|
-
check_dist_recompute(cli, res, notok)
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
##
|
202
|
-
# Perform files operation with MiGA::Cli +cli+
|
203
|
-
def check_files(cli)
|
204
|
-
cli.say 'Looking for outdated files in results'
|
205
|
-
n, k = cli.load_project.dataset_names.size, 0
|
206
|
-
cli.load_project.each_dataset do |d|
|
207
|
-
cli.advance('Datasets:', k += 1, n, false)
|
208
|
-
d.each_result do |r_k, r|
|
209
|
-
ok = true
|
210
|
-
r.each_file do |_f_sym, _f_rel, f_abs|
|
211
|
-
unless File.exist? f_abs
|
212
|
-
ok = false
|
213
|
-
break
|
214
|
-
end
|
215
|
-
end
|
216
|
-
unless ok
|
217
|
-
cli.say " > Registering again #{d.name}:#{r_k} "
|
218
|
-
d.add_result(r_k, true, force: true)
|
219
|
-
sr = d.result(:stats) and sr.remove!
|
220
|
-
end
|
221
|
-
end
|
222
|
-
end
|
223
|
-
cli.say
|
224
|
-
end
|
225
|
-
|
226
|
-
##
|
227
|
-
# Perform cds operation with MiGA::Cli +cli+
|
228
|
-
def check_cds(cli)
|
229
|
-
cli.say 'Looking for unzipped genes or proteins'
|
230
|
-
n, k = cli.load_project.dataset_names.size, 0
|
231
|
-
cli.load_project.each_dataset do |d|
|
232
|
-
cli.advance('Datasets:', k += 1, n, false)
|
233
|
-
res = d.result(:cds) or next
|
234
|
-
changed = false
|
235
|
-
%i[genes proteins gff3 gff2 tab].each do |f|
|
236
|
-
file = res.file_path(f) or next
|
237
|
-
if file !~ /\.gz/
|
238
|
-
cli.say " > Gzipping #{d.name} #{f} "
|
239
|
-
run_cmd(['gzip', '-9', file])
|
240
|
-
changed = true
|
241
|
-
end
|
242
|
-
end
|
243
|
-
if changed
|
244
|
-
d.add_result(:cds, true, force: true)
|
245
|
-
sr = d.result(:stats) and sr.remove!
|
246
|
-
end
|
247
|
-
end
|
248
|
-
cli.say
|
249
|
-
end
|
250
|
-
|
251
|
-
##
|
252
|
-
# Perform essential-genes operation with MiGA::Cli +cli+
|
253
|
-
def check_ess(cli)
|
254
|
-
cli.say 'Looking for outdated essential genes'
|
255
|
-
cli.load_project.each_dataset do |d|
|
256
|
-
res = d.result(:essential_genes)
|
257
|
-
next if res.nil?
|
258
|
-
|
259
|
-
dir = res.file_path(:collection)
|
260
|
-
if dir.nil? || outdated_fastaai_ess(res)
|
261
|
-
cli.say " > Removing #{d.name}:essential_genes"
|
262
|
-
res.remove!
|
263
|
-
d.result(:stats)&.remove!
|
264
|
-
next
|
265
|
-
end
|
266
|
-
next if Dir["#{dir}/*.faa"].empty?
|
267
|
-
|
268
|
-
cli.say " > Fixing #{d.name}"
|
269
|
-
run_cmd <<~CMD
|
270
|
-
cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
|
271
|
-
CMD
|
272
|
-
end
|
273
|
-
end
|
274
|
-
|
275
|
-
##
|
276
|
-
# Check if the essential genes result +res+ has an outdated FastAAI index
|
277
|
-
def outdated_fastaai_ess(res)
|
278
|
-
idx1 = res.file_path(:fastaai_index)
|
279
|
-
idx2 = res.file_path(:fastaai_index_2)
|
280
|
-
idx2.nil? && !idx1.nil?
|
281
|
-
end
|
282
|
-
|
283
|
-
##
|
284
|
-
# Perform mytaxa-scan operation with MiGA::Cli +cli+
|
285
|
-
def check_mts(cli)
|
286
|
-
cli.say 'Looking for unarchived MyTaxa Scan runs'
|
287
|
-
cli.load_project.each_dataset do |d|
|
288
|
-
res = d.result(:mytaxa_scan)
|
289
|
-
next if res.nil?
|
290
|
-
|
291
|
-
dir = res.file_path(:regions)
|
292
|
-
fix = false
|
293
|
-
unless dir.nil?
|
294
|
-
if Dir.exist? dir
|
295
|
-
run_cmd <<~CMD
|
296
|
-
cd #{dir.shellescape}/.. \
|
297
|
-
&& tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
|
298
|
-
&& rm -r '#{d.name}.reg'
|
299
|
-
CMD
|
300
|
-
end
|
301
|
-
fix = true
|
302
|
-
end
|
303
|
-
%i[blast mytaxain wintax gene_ids region_ids].each do |ext|
|
304
|
-
file = res.file_path(ext)
|
305
|
-
unless file.nil?
|
306
|
-
FileUtils.rm(file) if File.exist? file
|
307
|
-
fix = true
|
308
|
-
end
|
309
|
-
end
|
310
|
-
if fix
|
311
|
-
cli.say " > Fixing #{d.name}"
|
312
|
-
d.add_result(:mytaxa_scan, true, force: true)
|
313
|
-
end
|
314
|
-
end
|
315
|
-
end
|
316
|
-
|
317
|
-
##
|
318
|
-
# Perform start operation with MiGA::Cli +cli+
|
319
|
-
def check_start(cli)
|
320
|
-
cli.say 'Looking for legacy .start files lingering'
|
321
|
-
cli.load_project.each_dataset do |d|
|
322
|
-
d.each_result do |r_k, r|
|
323
|
-
if File.exist? r.path(:start)
|
324
|
-
cli.say " > Registering again #{d.name}:#{r_k}"
|
325
|
-
r.save
|
326
|
-
end
|
327
|
-
end
|
328
|
-
end
|
329
|
-
end
|
330
|
-
|
331
|
-
##
|
332
|
-
# Perform taxonomy operation with MiGA::Cli +cli+
|
333
|
-
def check_tax(cli)
|
334
|
-
# cli.say 'o Checking for taxonomy/distances consistency'
|
335
|
-
# TODO: Find 95%ANI clusters with entries from different species
|
336
|
-
# TODO: Find different 95%ANI clusters with genomes from the same species
|
337
|
-
# TODO: Find AAI values too high or too low for each LCA rank
|
338
|
-
end
|
339
|
-
|
340
|
-
##
|
341
|
-
# Run command +cmd+ with options +opts+
|
342
|
-
def run_cmd(cmd, opts = {})
|
343
|
-
opts = { return: :output, err2out: true, raise: false }.merge(opts)
|
344
|
-
cmdo = MiGA::MiGA.run_cmd(cmd, opts).chomp
|
345
|
-
warn(cmdo) unless cmdo.empty?
|
346
|
-
end
|
347
67
|
end
|
@@ -7,6 +7,18 @@ end
|
|
7
7
|
##
|
8
8
|
# Helper module including download functions for the *_get actions
|
9
9
|
module MiGA::Cli::Action::Download::Base
|
10
|
+
def cli_base_flags(opt)
|
11
|
+
opt.on(
|
12
|
+
'--max INT', Integer,
|
13
|
+
'Maximum number of datasets to download (by default: unlimited)'
|
14
|
+
) { |v| cli[:max_datasets] = v }
|
15
|
+
opt.on(
|
16
|
+
'-m', '--metadata STRING',
|
17
|
+
'Metadata as key-value pairs separated by = and delimited by comma',
|
18
|
+
'Values are saved as strings except for booleans (true / false) or nil'
|
19
|
+
) { |v| cli[:metadata] = v }
|
20
|
+
end
|
21
|
+
|
10
22
|
def cli_filters(opt)
|
11
23
|
opt.on(
|
12
24
|
'--exclude PATH',
|
@@ -17,6 +29,10 @@ module MiGA::Cli::Action::Download::Base
|
|
17
29
|
'--ignore-until STRING',
|
18
30
|
'Ignores all datasets until a name is found (useful for large reruns)'
|
19
31
|
) { |v| cli[:ignore_until] = v }
|
32
|
+
opt.on(
|
33
|
+
'--ignore-removed',
|
34
|
+
'Ignores entries removed from NCBI (by default fails on removed entries)'
|
35
|
+
) { |v| cli[:ignore_removed] = v }
|
20
36
|
cli.opt_flag(
|
21
37
|
opt, 'get-metadata',
|
22
38
|
'Only download and update metadata for existing datasets', :get_md
|
@@ -49,6 +65,40 @@ module MiGA::Cli::Action::Download::Base
|
|
49
65
|
) { |v| cli[:remote_list] = v }
|
50
66
|
end
|
51
67
|
|
68
|
+
def generic_perform
|
69
|
+
p, ds = load_tasks
|
70
|
+
d, downloaded = download_entries(ds, p)
|
71
|
+
|
72
|
+
# Finalize
|
73
|
+
finalize_tasks(d, downloaded)
|
74
|
+
unlink_entries(p, p.dataset_names - d) if cli[:unlink]
|
75
|
+
end
|
76
|
+
|
77
|
+
def load_tasks
|
78
|
+
sanitize_cli
|
79
|
+
p = cli.load_project
|
80
|
+
ds = remote_list
|
81
|
+
ds = discard_excluded(ds)
|
82
|
+
ds = impose_limit(ds)
|
83
|
+
[p, ds]
|
84
|
+
end
|
85
|
+
|
86
|
+
def finalize_tasks(d, downloaded)
|
87
|
+
cli.say "Datasets listed: #{d.size}"
|
88
|
+
act = cli[:dry] ? 'to download' : 'downloaded'
|
89
|
+
cli.say "Datasets #{act}: #{downloaded}"
|
90
|
+
unless cli[:remote_list].nil?
|
91
|
+
File.open(cli[:remote_list], 'w') do |fh|
|
92
|
+
d.each { |i| fh.puts i }
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def unlink_entries(p, unlink)
|
98
|
+
unlink.each { |i| p.unlink_dataset(i).remove! }
|
99
|
+
cli.say "Datasets unlinked: #{unlink.size}"
|
100
|
+
end
|
101
|
+
|
52
102
|
def discard_excluded(ds)
|
53
103
|
unless cli[:exclude].nil?
|
54
104
|
cli.say "Discarding datasets in #{cli[:exclude]}"
|
@@ -84,7 +134,11 @@ module MiGA::Cli::Action::Download::Base
|
|
84
134
|
|
85
135
|
downloaded += 1
|
86
136
|
unless cli[:dry]
|
87
|
-
save_entry(name, body, p)
|
137
|
+
unless save_entry(name, body, p)
|
138
|
+
downloaded -= 1
|
139
|
+
d.pop
|
140
|
+
next
|
141
|
+
end
|
88
142
|
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
89
143
|
end
|
90
144
|
end
|
@@ -93,8 +147,11 @@ module MiGA::Cli::Action::Download::Base
|
|
93
147
|
[d, downloaded]
|
94
148
|
end
|
95
149
|
|
150
|
+
##
|
151
|
+
# Saves the (generic remote) entry identified by +name+ with +body+ into the
|
152
|
+
# project +p+, and returns +true+ on success and +false+ otherwise
|
96
153
|
def save_entry(name, body, p)
|
97
|
-
cli.say
|
154
|
+
cli.say " Locating remote dataset: #{name}"
|
98
155
|
body[:md][:metadata_only] = true if cli[:only_md]
|
99
156
|
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
100
157
|
if cli[:get_md]
|
@@ -105,5 +162,10 @@ module MiGA::Cli::Action::Download::Base
|
|
105
162
|
rd.save_to(p, name, !cli[:query], body[:md])
|
106
163
|
cli.add_metadata(p.add_dataset(name))
|
107
164
|
end
|
165
|
+
true
|
166
|
+
rescue MiGA::RemoteDataMissingError => e
|
167
|
+
raise(e) unless cli[:ignore_removed]
|
168
|
+
cli.say " Removed dataset ignored: #{name}"
|
169
|
+
false
|
108
170
|
end
|
109
171
|
end
|
@@ -18,15 +18,7 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
|
|
18
18
|
'-T', '--taxon STRING',
|
19
19
|
'(Mandatory) Taxon name in GTDB format (e.g., g__Escherichia)'
|
20
20
|
) { |v| cli[:taxon] = v }
|
21
|
-
opt.on(
|
22
|
-
'--max INT', Integer,
|
23
|
-
'Maximum number of datasets to download (by default: unlimited)'
|
24
|
-
) { |v| cli[:max_datasets] = v }
|
25
|
-
opt.on(
|
26
|
-
'-m', '--metadata STRING',
|
27
|
-
'Metadata as key-value pairs separated by = and delimited by comma',
|
28
|
-
'Values are saved as strings except for booleans (true / false) or nil'
|
29
|
-
) { |v| cli[:metadata] = v }
|
21
|
+
cli_base_flags(opt)
|
30
22
|
cli_task_flags(opt)
|
31
23
|
cli_name_modifiers(opt)
|
32
24
|
cli_filters(opt)
|
@@ -39,27 +31,6 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
|
|
39
31
|
end
|
40
32
|
|
41
33
|
def perform
|
42
|
-
|
43
|
-
p = cli.load_project
|
44
|
-
ds = remote_list
|
45
|
-
ds = discard_excluded(ds)
|
46
|
-
ds = impose_limit(ds)
|
47
|
-
d, downloaded = download_entries(ds, p)
|
48
|
-
|
49
|
-
# Finalize
|
50
|
-
cli.say "Datasets listed: #{d.size}"
|
51
|
-
act = cli[:dry] ? 'to download' : 'downloaded'
|
52
|
-
cli.say "Datasets #{act}: #{downloaded}"
|
53
|
-
unless cli[:remote_list].nil?
|
54
|
-
File.open(cli[:remote_list], 'w') do |fh|
|
55
|
-
d.each { |i| fh.puts i }
|
56
|
-
end
|
57
|
-
end
|
58
|
-
return unless cli[:unlink]
|
59
|
-
|
60
|
-
unlink = p.dataset_names - d
|
61
|
-
unlink.each { |i| p.unlink_dataset(i).remove! }
|
62
|
-
cli.say "Datasets unlinked: #{unlink.size}"
|
34
|
+
generic_perform
|
63
35
|
end
|
64
|
-
|
65
36
|
end
|
@@ -20,48 +20,23 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
20
20
|
'-T', '--taxon STRING',
|
21
21
|
'(Mandatory) Taxon name (e.g., a species binomial)'
|
22
22
|
) { |v| cli[:taxon] = v }
|
23
|
-
opt.on(
|
24
|
-
'--max INT', Integer,
|
25
|
-
'Maximum number of datasets to download (by default: unlimited)'
|
26
|
-
) { |v| cli[:max_datasets] = v }
|
27
|
-
opt.on(
|
28
|
-
'-m', '--metadata STRING',
|
29
|
-
'Metadata as key-value pairs separated by = and delimited by comma',
|
30
|
-
'Values are saved as strings except for booleans (true / false) or nil'
|
31
|
-
) { |v| cli[:metadata] = v }
|
23
|
+
cli_base_flags(opt)
|
32
24
|
cli_task_flags(opt)
|
33
25
|
cli_name_modifiers(opt)
|
34
26
|
cli_filters(opt)
|
35
27
|
cli_save_actions(opt)
|
36
28
|
opt.on(
|
37
29
|
'--api-key STRING',
|
30
|
+
'::HIDE::' # For backwards compatibility
|
31
|
+
) { |v| ENV['NCBI_API_KEY'] = v }
|
32
|
+
opt.on(
|
33
|
+
'--ncbi-api-key STRING',
|
38
34
|
'NCBI API key'
|
39
35
|
) { |v| ENV['NCBI_API_KEY'] = v }
|
40
36
|
end
|
41
37
|
end
|
42
38
|
|
43
39
|
def perform
|
44
|
-
|
45
|
-
p = cli.load_project
|
46
|
-
ds = remote_list
|
47
|
-
ds = discard_excluded(ds)
|
48
|
-
ds = impose_limit(ds)
|
49
|
-
d, downloaded = download_entries(ds, p)
|
50
|
-
|
51
|
-
# Finalize
|
52
|
-
cli.say "Datasets listed: #{d.size}"
|
53
|
-
act = cli[:dry] ? 'to download' : 'downloaded'
|
54
|
-
cli.say "Datasets #{act}: #{downloaded}"
|
55
|
-
unless cli[:remote_list].nil?
|
56
|
-
File.open(cli[:remote_list], 'w') do |fh|
|
57
|
-
d.each { |i| fh.puts i }
|
58
|
-
end
|
59
|
-
end
|
60
|
-
return unless cli[:unlink]
|
61
|
-
|
62
|
-
unlink = p.dataset_names - d
|
63
|
-
unlink.each { |i| p.unlink_dataset(i).remove! }
|
64
|
-
cli.say "Datasets unlinked: #{unlink.size}"
|
40
|
+
generic_perform
|
65
41
|
end
|
66
|
-
|
67
42
|
end
|
data/lib/miga/cli/opt_helper.rb
CHANGED
data/lib/miga/common/errors.rb
CHANGED
@@ -9,4 +9,14 @@ module MiGA
|
|
9
9
|
# An error with a system call
|
10
10
|
class SystemCallError < Error
|
11
11
|
end
|
12
|
+
|
13
|
+
##
|
14
|
+
# An error with remote data
|
15
|
+
class RemoteDataError < Error
|
16
|
+
end
|
17
|
+
|
18
|
+
##
|
19
|
+
# An error caused by missing remote data
|
20
|
+
class RemoteDataMissingError < RemoteDataError
|
21
|
+
end
|
12
22
|
end
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -7,18 +7,47 @@ class MiGA::Dataset < MiGA::MiGA
|
|
7
7
|
|
8
8
|
# Class-level
|
9
9
|
class << self
|
10
|
+
##
|
11
|
+
# Directories containing the results from dataset-specific tasks
|
10
12
|
def RESULT_DIRS
|
11
13
|
@@RESULT_DIRS
|
12
14
|
end
|
13
15
|
|
16
|
+
##
|
17
|
+
# Supported dataset types
|
14
18
|
def KNOWN_TYPES
|
15
19
|
@@KNOWN_TYPES
|
16
20
|
end
|
17
21
|
|
22
|
+
##
|
23
|
+
# Returns an Array of tasks (Symbols) to be executed before project-wide
|
24
|
+
# tasks
|
18
25
|
def PREPROCESSING_TASKS
|
19
26
|
@@PREPROCESSING_TASKS
|
20
27
|
end
|
21
28
|
|
29
|
+
##
|
30
|
+
# Tasks to be excluded from query datasets
|
31
|
+
def EXCLUDE_NOREF_TASKS
|
32
|
+
@@EXCLUDE_NOREF_TASKS
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# Tasks to be executed only in datasets that are single-organism. These
|
37
|
+
# tasks are ignored for multi-organism datasets or for unknown types
|
38
|
+
def ONLY_NONMULTI_TASKS
|
39
|
+
@@ONLY_NONMULTI_TASKS
|
40
|
+
end
|
41
|
+
|
42
|
+
##
|
43
|
+
# Tasks to be executed only in datasets that are multi-organism. These
|
44
|
+
# tasks are ignored for single-organism datasets or for unknown types
|
45
|
+
def ONLY_MULTI_TASKS
|
46
|
+
@@ONLY_MULTI_TASKS
|
47
|
+
end
|
48
|
+
|
49
|
+
##
|
50
|
+
# Options supported by datasets
|
22
51
|
def OPTIONS
|
23
52
|
@@OPTIONS
|
24
53
|
end
|
@@ -69,7 +98,7 @@ module MiGA::Dataset::Base
|
|
69
98
|
}
|
70
99
|
|
71
100
|
##
|
72
|
-
# Returns an Array of tasks to be executed before project-wide tasks
|
101
|
+
# Returns an Array of tasks (Symbols) to be executed before project-wide tasks
|
73
102
|
@@PREPROCESSING_TASKS = [
|
74
103
|
:raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
|
75
104
|
:assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
|
@@ -77,19 +106,19 @@ module MiGA::Dataset::Base
|
|
77
106
|
]
|
78
107
|
|
79
108
|
##
|
80
|
-
# Tasks to be excluded from query datasets
|
109
|
+
# Tasks to be excluded from query datasets
|
81
110
|
@@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
|
82
111
|
@@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
|
83
112
|
|
84
113
|
##
|
85
|
-
# Tasks to be executed only in datasets that are
|
86
|
-
# tasks are ignored for multi-organism datasets or for unknown types
|
114
|
+
# Tasks to be executed only in datasets that are single-organism. These
|
115
|
+
# tasks are ignored for multi-organism datasets or for unknown types
|
87
116
|
@@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances]
|
88
117
|
@@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
|
89
118
|
|
90
119
|
##
|
91
120
|
# Tasks to be executed only in datasets that are multi-organism. These
|
92
|
-
# tasks are ignored for single-organism datasets or for unknown types
|
121
|
+
# tasks are ignored for single-organism datasets or for unknown types
|
93
122
|
@@ONLY_MULTI_TASKS = [:mytaxa]
|
94
123
|
@@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
|
95
124
|
|