miga-base 1.2.14.2 → 1.2.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +16 -40
- data/lib/miga/cli/action/doctor/databases.rb +39 -0
- data/lib/miga/cli/action/doctor/distances.rb +144 -0
- data/lib/miga/cli/action/doctor/operations.rb +159 -0
- data/lib/miga/cli/action/doctor.rb +7 -287
- data/lib/miga/cli/action/download/base.rb +64 -2
- data/lib/miga/cli/action/gtdb_get.rb +2 -31
- data/lib/miga/cli/action/ncbi_get.rb +6 -31
- data/lib/miga/cli/opt_helper.rb +1 -1
- data/lib/miga/common/errors.rb +10 -0
- data/lib/miga/dataset/base.rb +34 -5
- data/lib/miga/dataset/result/add.rb +286 -0
- data/lib/miga/dataset/result/ignore.rb +93 -0
- data/lib/miga/dataset/result.rb +31 -342
- data/lib/miga/remote_dataset/download.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/test/remote_dataset_test.rb +6 -0
- metadata +7 -2
@@ -1,10 +1,15 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require 'miga/cli/action/doctor/base'
|
5
|
-
|
6
4
|
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
5
|
+
require 'miga/cli/action/doctor/base'
|
6
|
+
require 'miga/cli/action/doctor/databases'
|
7
|
+
require 'miga/cli/action/doctor/distances'
|
8
|
+
require 'miga/cli/action/doctor/operations'
|
7
9
|
include MiGA::Cli::Action::Doctor::Base
|
10
|
+
include MiGA::Cli::Action::Doctor::Databases
|
11
|
+
include MiGA::Cli::Action::Doctor::Distances
|
12
|
+
include MiGA::Cli::Action::Doctor::Operations
|
8
13
|
|
9
14
|
def parse_cli
|
10
15
|
cli.defaults = { threads: 1 }
|
@@ -59,289 +64,4 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
59
64
|
@@OPERATIONS
|
60
65
|
end
|
61
66
|
end
|
62
|
-
|
63
|
-
##
|
64
|
-
# Perform status operation with MiGA::Cli +cli+
|
65
|
-
def check_status(cli)
|
66
|
-
cli.say 'Updating metadata status'
|
67
|
-
p = cli.load_project
|
68
|
-
n = p.dataset_names.size
|
69
|
-
(0 .. cli[:threads] - 1).map do |i|
|
70
|
-
Process.fork do
|
71
|
-
k = 0
|
72
|
-
cli.load_project.each_dataset do |d|
|
73
|
-
k += 1
|
74
|
-
cli.advance('Datasets:', k, n, false) if i == 0
|
75
|
-
d.recalculate_status if k % cli[:threads] == i
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
Process.waitall
|
80
|
-
cli.say
|
81
|
-
end
|
82
|
-
|
83
|
-
##
|
84
|
-
# Perform databases operation with MiGA::Cli +cli+
|
85
|
-
def check_db(cli)
|
86
|
-
cli.say 'Checking integrity of databases'
|
87
|
-
p = cli.load_project
|
88
|
-
n = p.dataset_names.size
|
89
|
-
(0 .. cli[:threads] - 1).map do |i|
|
90
|
-
Process.fork do
|
91
|
-
k = 0
|
92
|
-
p.each_dataset do |d|
|
93
|
-
k += 1
|
94
|
-
cli.advance('Datasets:', k, n, false) if i == 0
|
95
|
-
next unless k % cli[:threads] == i
|
96
|
-
each_database_file(d) do |db_file, metric, result, _rank|
|
97
|
-
check_sqlite3_database(db_file, metric) do
|
98
|
-
cli.say(
|
99
|
-
" > Removing malformed database from #{d.name}:#{result} "
|
100
|
-
)
|
101
|
-
File.unlink(db_file)
|
102
|
-
r = d.result(result) or next
|
103
|
-
[r.path(:done), r.path].each do |f|
|
104
|
-
File.unlink(f) if File.exist?(f)
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
111
|
-
Process.waitall
|
112
|
-
cli.say
|
113
|
-
end
|
114
|
-
|
115
|
-
##
|
116
|
-
# Perform bidirectional operation with MiGA::Cli +cli+
|
117
|
-
def check_bidir(cli)
|
118
|
-
cli.say 'Checking if reference distances are bidirectional'
|
119
|
-
project = cli.load_project
|
120
|
-
ref_ds = project.each_dataset.select(&:ref?)
|
121
|
-
ref_names = ref_ds.map(&:name)
|
122
|
-
n = ref_ds.size
|
123
|
-
|
124
|
-
# Read data first (threaded)
|
125
|
-
tmp = File.join(project.path, 'doctor-bidirectional.tmp')
|
126
|
-
FileUtils.mkdir_p(tmp)
|
127
|
-
MiGA::Parallel.process(cli[:threads]) do |thr|
|
128
|
-
file = File.join(tmp, "#{thr}.json")
|
129
|
-
fh = File.open(file, 'w')
|
130
|
-
[:aai, :ani].each do |metric|
|
131
|
-
fh.puts "# #{metric}"
|
132
|
-
ref_ds.each_with_index do |ds, idx|
|
133
|
-
if idx % cli[:threads] == thr
|
134
|
-
cli.advance('Reading:', idx + 1, n, false) if thr == 0
|
135
|
-
row = read_bidirectional(ds, metric)
|
136
|
-
fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
fh.puts '# end'
|
141
|
-
fh.flush # necessary for large threaded runs
|
142
|
-
fh.close
|
143
|
-
if thr == 0
|
144
|
-
cli.advance('Reading:', n, n, false)
|
145
|
-
cli.say
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
# Merge pieces per thread
|
150
|
-
dist = { aai: {}, ani: {} }
|
151
|
-
cli[:threads].times do |i|
|
152
|
-
cli.advance('Merging:', i + 1, cli[:threads], false)
|
153
|
-
file = File.join(tmp, "#{i}.json")
|
154
|
-
File.open(file, 'r') do |fh|
|
155
|
-
metric = nil
|
156
|
-
fh.each do |ln|
|
157
|
-
qry, row = ln.chomp.split(' ', 2)
|
158
|
-
if qry == '#'
|
159
|
-
metric = row.to_sym
|
160
|
-
else
|
161
|
-
raise "Unrecognized metric: #{metric}" unless dist[metric]
|
162
|
-
JSON.parse(row).each do |sbj, val|
|
163
|
-
dist[metric][qry] ||= {}
|
164
|
-
if dist[metric][sbj]&.include?(qry)
|
165
|
-
dist[metric][sbj].delete(qry) # Already bidirectional
|
166
|
-
else
|
167
|
-
dist[metric][qry][sbj] = val
|
168
|
-
end
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
172
|
-
raise "Incomplete thread dump: #{file}" unless metric == :end
|
173
|
-
end
|
174
|
-
end
|
175
|
-
cli.say
|
176
|
-
FileUtils.rm_rf(tmp)
|
177
|
-
|
178
|
-
# Write missing values (threaded)
|
179
|
-
MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
|
180
|
-
cli.advance('Datasets:', idx + 1, n, false) if thr == 0
|
181
|
-
save_bidirectional(ds, dist)
|
182
|
-
end
|
183
|
-
cli.say
|
184
|
-
end
|
185
|
-
|
186
|
-
##
|
187
|
-
# Perform distances operation with MiGA::Cli +cli+
|
188
|
-
def check_dist(cli)
|
189
|
-
p = cli.load_project
|
190
|
-
%i[ani aai].each do |dist|
|
191
|
-
res = p.result("#{dist}_distances")
|
192
|
-
next if res.nil?
|
193
|
-
|
194
|
-
cli.say "Checking #{dist} table for consistent datasets"
|
195
|
-
notok, fix = check_dist_eval(cli, p, res)
|
196
|
-
check_dist_fix(cli, p, fix)
|
197
|
-
check_dist_recompute(cli, res, notok)
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
##
|
202
|
-
# Perform files operation with MiGA::Cli +cli+
|
203
|
-
def check_files(cli)
|
204
|
-
cli.say 'Looking for outdated files in results'
|
205
|
-
n, k = cli.load_project.dataset_names.size, 0
|
206
|
-
cli.load_project.each_dataset do |d|
|
207
|
-
cli.advance('Datasets:', k += 1, n, false)
|
208
|
-
d.each_result do |r_k, r|
|
209
|
-
ok = true
|
210
|
-
r.each_file do |_f_sym, _f_rel, f_abs|
|
211
|
-
unless File.exist? f_abs
|
212
|
-
ok = false
|
213
|
-
break
|
214
|
-
end
|
215
|
-
end
|
216
|
-
unless ok
|
217
|
-
cli.say " > Registering again #{d.name}:#{r_k} "
|
218
|
-
d.add_result(r_k, true, force: true)
|
219
|
-
sr = d.result(:stats) and sr.remove!
|
220
|
-
end
|
221
|
-
end
|
222
|
-
end
|
223
|
-
cli.say
|
224
|
-
end
|
225
|
-
|
226
|
-
##
|
227
|
-
# Perform cds operation with MiGA::Cli +cli+
|
228
|
-
def check_cds(cli)
|
229
|
-
cli.say 'Looking for unzipped genes or proteins'
|
230
|
-
n, k = cli.load_project.dataset_names.size, 0
|
231
|
-
cli.load_project.each_dataset do |d|
|
232
|
-
cli.advance('Datasets:', k += 1, n, false)
|
233
|
-
res = d.result(:cds) or next
|
234
|
-
changed = false
|
235
|
-
%i[genes proteins gff3 gff2 tab].each do |f|
|
236
|
-
file = res.file_path(f) or next
|
237
|
-
if file !~ /\.gz/
|
238
|
-
cli.say " > Gzipping #{d.name} #{f} "
|
239
|
-
run_cmd(['gzip', '-9', file])
|
240
|
-
changed = true
|
241
|
-
end
|
242
|
-
end
|
243
|
-
if changed
|
244
|
-
d.add_result(:cds, true, force: true)
|
245
|
-
sr = d.result(:stats) and sr.remove!
|
246
|
-
end
|
247
|
-
end
|
248
|
-
cli.say
|
249
|
-
end
|
250
|
-
|
251
|
-
##
|
252
|
-
# Perform essential-genes operation with MiGA::Cli +cli+
|
253
|
-
def check_ess(cli)
|
254
|
-
cli.say 'Looking for outdated essential genes'
|
255
|
-
cli.load_project.each_dataset do |d|
|
256
|
-
res = d.result(:essential_genes)
|
257
|
-
next if res.nil?
|
258
|
-
|
259
|
-
dir = res.file_path(:collection)
|
260
|
-
if dir.nil? || outdated_fastaai_ess(res)
|
261
|
-
cli.say " > Removing #{d.name}:essential_genes"
|
262
|
-
res.remove!
|
263
|
-
d.result(:stats)&.remove!
|
264
|
-
next
|
265
|
-
end
|
266
|
-
next if Dir["#{dir}/*.faa"].empty?
|
267
|
-
|
268
|
-
cli.say " > Fixing #{d.name}"
|
269
|
-
run_cmd <<~CMD
|
270
|
-
cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
|
271
|
-
CMD
|
272
|
-
end
|
273
|
-
end
|
274
|
-
|
275
|
-
##
|
276
|
-
# Check if the essential genes result +res+ has an outdated FastAAI index
|
277
|
-
def outdated_fastaai_ess(res)
|
278
|
-
idx1 = res.file_path(:fastaai_index)
|
279
|
-
idx2 = res.file_path(:fastaai_index_2)
|
280
|
-
idx2.nil? && !idx1.nil?
|
281
|
-
end
|
282
|
-
|
283
|
-
##
|
284
|
-
# Perform mytaxa-scan operation with MiGA::Cli +cli+
|
285
|
-
def check_mts(cli)
|
286
|
-
cli.say 'Looking for unarchived MyTaxa Scan runs'
|
287
|
-
cli.load_project.each_dataset do |d|
|
288
|
-
res = d.result(:mytaxa_scan)
|
289
|
-
next if res.nil?
|
290
|
-
|
291
|
-
dir = res.file_path(:regions)
|
292
|
-
fix = false
|
293
|
-
unless dir.nil?
|
294
|
-
if Dir.exist? dir
|
295
|
-
run_cmd <<~CMD
|
296
|
-
cd #{dir.shellescape}/.. \
|
297
|
-
&& tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
|
298
|
-
&& rm -r '#{d.name}.reg'
|
299
|
-
CMD
|
300
|
-
end
|
301
|
-
fix = true
|
302
|
-
end
|
303
|
-
%i[blast mytaxain wintax gene_ids region_ids].each do |ext|
|
304
|
-
file = res.file_path(ext)
|
305
|
-
unless file.nil?
|
306
|
-
FileUtils.rm(file) if File.exist? file
|
307
|
-
fix = true
|
308
|
-
end
|
309
|
-
end
|
310
|
-
if fix
|
311
|
-
cli.say " > Fixing #{d.name}"
|
312
|
-
d.add_result(:mytaxa_scan, true, force: true)
|
313
|
-
end
|
314
|
-
end
|
315
|
-
end
|
316
|
-
|
317
|
-
##
|
318
|
-
# Perform start operation with MiGA::Cli +cli+
|
319
|
-
def check_start(cli)
|
320
|
-
cli.say 'Looking for legacy .start files lingering'
|
321
|
-
cli.load_project.each_dataset do |d|
|
322
|
-
d.each_result do |r_k, r|
|
323
|
-
if File.exist? r.path(:start)
|
324
|
-
cli.say " > Registering again #{d.name}:#{r_k}"
|
325
|
-
r.save
|
326
|
-
end
|
327
|
-
end
|
328
|
-
end
|
329
|
-
end
|
330
|
-
|
331
|
-
##
|
332
|
-
# Perform taxonomy operation with MiGA::Cli +cli+
|
333
|
-
def check_tax(cli)
|
334
|
-
# cli.say 'o Checking for taxonomy/distances consistency'
|
335
|
-
# TODO: Find 95%ANI clusters with entries from different species
|
336
|
-
# TODO: Find different 95%ANI clusters with genomes from the same species
|
337
|
-
# TODO: Find AAI values too high or too low for each LCA rank
|
338
|
-
end
|
339
|
-
|
340
|
-
##
|
341
|
-
# Run command +cmd+ with options +opts+
|
342
|
-
def run_cmd(cmd, opts = {})
|
343
|
-
opts = { return: :output, err2out: true, raise: false }.merge(opts)
|
344
|
-
cmdo = MiGA::MiGA.run_cmd(cmd, opts).chomp
|
345
|
-
warn(cmdo) unless cmdo.empty?
|
346
|
-
end
|
347
67
|
end
|
@@ -7,6 +7,18 @@ end
|
|
7
7
|
##
|
8
8
|
# Helper module including download functions for the *_get actions
|
9
9
|
module MiGA::Cli::Action::Download::Base
|
10
|
+
def cli_base_flags(opt)
|
11
|
+
opt.on(
|
12
|
+
'--max INT', Integer,
|
13
|
+
'Maximum number of datasets to download (by default: unlimited)'
|
14
|
+
) { |v| cli[:max_datasets] = v }
|
15
|
+
opt.on(
|
16
|
+
'-m', '--metadata STRING',
|
17
|
+
'Metadata as key-value pairs separated by = and delimited by comma',
|
18
|
+
'Values are saved as strings except for booleans (true / false) or nil'
|
19
|
+
) { |v| cli[:metadata] = v }
|
20
|
+
end
|
21
|
+
|
10
22
|
def cli_filters(opt)
|
11
23
|
opt.on(
|
12
24
|
'--exclude PATH',
|
@@ -17,6 +29,10 @@ module MiGA::Cli::Action::Download::Base
|
|
17
29
|
'--ignore-until STRING',
|
18
30
|
'Ignores all datasets until a name is found (useful for large reruns)'
|
19
31
|
) { |v| cli[:ignore_until] = v }
|
32
|
+
opt.on(
|
33
|
+
'--ignore-removed',
|
34
|
+
'Ignores entries removed from NCBI (by default fails on removed entries)'
|
35
|
+
) { |v| cli[:ignore_removed] = v }
|
20
36
|
cli.opt_flag(
|
21
37
|
opt, 'get-metadata',
|
22
38
|
'Only download and update metadata for existing datasets', :get_md
|
@@ -49,6 +65,40 @@ module MiGA::Cli::Action::Download::Base
|
|
49
65
|
) { |v| cli[:remote_list] = v }
|
50
66
|
end
|
51
67
|
|
68
|
+
def generic_perform
|
69
|
+
p, ds = load_tasks
|
70
|
+
d, downloaded = download_entries(ds, p)
|
71
|
+
|
72
|
+
# Finalize
|
73
|
+
finalize_tasks(d, downloaded)
|
74
|
+
unlink_entries(p, p.dataset_names - d) if cli[:unlink]
|
75
|
+
end
|
76
|
+
|
77
|
+
def load_tasks
|
78
|
+
sanitize_cli
|
79
|
+
p = cli.load_project
|
80
|
+
ds = remote_list
|
81
|
+
ds = discard_excluded(ds)
|
82
|
+
ds = impose_limit(ds)
|
83
|
+
[p, ds]
|
84
|
+
end
|
85
|
+
|
86
|
+
def finalize_tasks(d, downloaded)
|
87
|
+
cli.say "Datasets listed: #{d.size}"
|
88
|
+
act = cli[:dry] ? 'to download' : 'downloaded'
|
89
|
+
cli.say "Datasets #{act}: #{downloaded}"
|
90
|
+
unless cli[:remote_list].nil?
|
91
|
+
File.open(cli[:remote_list], 'w') do |fh|
|
92
|
+
d.each { |i| fh.puts i }
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def unlink_entries(p, unlink)
|
98
|
+
unlink.each { |i| p.unlink_dataset(i).remove! }
|
99
|
+
cli.say "Datasets unlinked: #{unlink.size}"
|
100
|
+
end
|
101
|
+
|
52
102
|
def discard_excluded(ds)
|
53
103
|
unless cli[:exclude].nil?
|
54
104
|
cli.say "Discarding datasets in #{cli[:exclude]}"
|
@@ -84,7 +134,11 @@ module MiGA::Cli::Action::Download::Base
|
|
84
134
|
|
85
135
|
downloaded += 1
|
86
136
|
unless cli[:dry]
|
87
|
-
save_entry(name, body, p)
|
137
|
+
unless save_entry(name, body, p)
|
138
|
+
downloaded -= 1
|
139
|
+
d.pop
|
140
|
+
next
|
141
|
+
end
|
88
142
|
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
89
143
|
end
|
90
144
|
end
|
@@ -93,8 +147,11 @@ module MiGA::Cli::Action::Download::Base
|
|
93
147
|
[d, downloaded]
|
94
148
|
end
|
95
149
|
|
150
|
+
##
|
151
|
+
# Saves the (generic remote) entry identified by +name+ with +body+ into the
|
152
|
+
# project +p+, and returns +true+ on success and +false+ otherwise
|
96
153
|
def save_entry(name, body, p)
|
97
|
-
cli.say
|
154
|
+
cli.say " Locating remote dataset: #{name}"
|
98
155
|
body[:md][:metadata_only] = true if cli[:only_md]
|
99
156
|
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
100
157
|
if cli[:get_md]
|
@@ -105,5 +162,10 @@ module MiGA::Cli::Action::Download::Base
|
|
105
162
|
rd.save_to(p, name, !cli[:query], body[:md])
|
106
163
|
cli.add_metadata(p.add_dataset(name))
|
107
164
|
end
|
165
|
+
true
|
166
|
+
rescue MiGA::RemoteDataMissingError => e
|
167
|
+
raise(e) unless cli[:ignore_removed]
|
168
|
+
cli.say " Removed dataset ignored: #{name}"
|
169
|
+
false
|
108
170
|
end
|
109
171
|
end
|
@@ -18,15 +18,7 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
|
|
18
18
|
'-T', '--taxon STRING',
|
19
19
|
'(Mandatory) Taxon name in GTDB format (e.g., g__Escherichia)'
|
20
20
|
) { |v| cli[:taxon] = v }
|
21
|
-
opt.on(
|
22
|
-
'--max INT', Integer,
|
23
|
-
'Maximum number of datasets to download (by default: unlimited)'
|
24
|
-
) { |v| cli[:max_datasets] = v }
|
25
|
-
opt.on(
|
26
|
-
'-m', '--metadata STRING',
|
27
|
-
'Metadata as key-value pairs separated by = and delimited by comma',
|
28
|
-
'Values are saved as strings except for booleans (true / false) or nil'
|
29
|
-
) { |v| cli[:metadata] = v }
|
21
|
+
cli_base_flags(opt)
|
30
22
|
cli_task_flags(opt)
|
31
23
|
cli_name_modifiers(opt)
|
32
24
|
cli_filters(opt)
|
@@ -39,27 +31,6 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
|
|
39
31
|
end
|
40
32
|
|
41
33
|
def perform
|
42
|
-
sanitize_cli
43
|
-
p = cli.load_project
|
44
|
-
ds = remote_list
|
45
|
-
ds = discard_excluded(ds)
|
46
|
-
ds = impose_limit(ds)
|
47
|
-
d, downloaded = download_entries(ds, p)
|
48
|
-
|
49
|
-
# Finalize
|
50
|
-
cli.say "Datasets listed: #{d.size}"
|
51
|
-
act = cli[:dry] ? 'to download' : 'downloaded'
|
52
|
-
cli.say "Datasets #{act}: #{downloaded}"
|
53
|
-
unless cli[:remote_list].nil?
|
54
|
-
File.open(cli[:remote_list], 'w') do |fh|
|
55
|
-
d.each { |i| fh.puts i }
|
56
|
-
end
|
57
|
-
end
|
58
|
-
return unless cli[:unlink]
|
59
|
-
|
60
|
-
unlink = p.dataset_names - d
|
61
|
-
unlink.each { |i| p.unlink_dataset(i).remove! }
|
62
|
-
cli.say "Datasets unlinked: #{unlink.size}"
|
34
|
+
generic_perform
|
63
35
|
end
|
64
|
-
|
65
36
|
end
|
@@ -20,48 +20,23 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
20
20
|
'-T', '--taxon STRING',
|
21
21
|
'(Mandatory) Taxon name (e.g., a species binomial)'
|
22
22
|
) { |v| cli[:taxon] = v }
|
23
|
-
opt.on(
|
24
|
-
'--max INT', Integer,
|
25
|
-
'Maximum number of datasets to download (by default: unlimited)'
|
26
|
-
) { |v| cli[:max_datasets] = v }
|
27
|
-
opt.on(
|
28
|
-
'-m', '--metadata STRING',
|
29
|
-
'Metadata as key-value pairs separated by = and delimited by comma',
|
30
|
-
'Values are saved as strings except for booleans (true / false) or nil'
|
31
|
-
) { |v| cli[:metadata] = v }
|
23
|
+
cli_base_flags(opt)
|
32
24
|
cli_task_flags(opt)
|
33
25
|
cli_name_modifiers(opt)
|
34
26
|
cli_filters(opt)
|
35
27
|
cli_save_actions(opt)
|
36
28
|
opt.on(
|
37
29
|
'--api-key STRING',
|
30
|
+
'::HIDE::' # For backwards compatibility
|
31
|
+
) { |v| ENV['NCBI_API_KEY'] = v }
|
32
|
+
opt.on(
|
33
|
+
'--ncbi-api-key STRING',
|
38
34
|
'NCBI API key'
|
39
35
|
) { |v| ENV['NCBI_API_KEY'] = v }
|
40
36
|
end
|
41
37
|
end
|
42
38
|
|
43
39
|
def perform
|
44
|
-
sanitize_cli
45
|
-
p = cli.load_project
|
46
|
-
ds = remote_list
|
47
|
-
ds = discard_excluded(ds)
|
48
|
-
ds = impose_limit(ds)
|
49
|
-
d, downloaded = download_entries(ds, p)
|
50
|
-
|
51
|
-
# Finalize
|
52
|
-
cli.say "Datasets listed: #{d.size}"
|
53
|
-
act = cli[:dry] ? 'to download' : 'downloaded'
|
54
|
-
cli.say "Datasets #{act}: #{downloaded}"
|
55
|
-
unless cli[:remote_list].nil?
|
56
|
-
File.open(cli[:remote_list], 'w') do |fh|
|
57
|
-
d.each { |i| fh.puts i }
|
58
|
-
end
|
59
|
-
end
|
60
|
-
return unless cli[:unlink]
|
61
|
-
|
62
|
-
unlink = p.dataset_names - d
|
63
|
-
unlink.each { |i| p.unlink_dataset(i).remove! }
|
64
|
-
cli.say "Datasets unlinked: #{unlink.size}"
|
40
|
+
generic_perform
|
65
41
|
end
|
66
|
-
|
67
42
|
end
|
data/lib/miga/cli/opt_helper.rb
CHANGED
data/lib/miga/common/errors.rb
CHANGED
@@ -9,4 +9,14 @@ module MiGA
|
|
9
9
|
# An error with a system call
|
10
10
|
class SystemCallError < Error
|
11
11
|
end
|
12
|
+
|
13
|
+
##
|
14
|
+
# An error with remote data
|
15
|
+
class RemoteDataError < Error
|
16
|
+
end
|
17
|
+
|
18
|
+
##
|
19
|
+
# An error caused by missing remote data
|
20
|
+
class RemoteDataMissingError < RemoteDataError
|
21
|
+
end
|
12
22
|
end
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -7,18 +7,47 @@ class MiGA::Dataset < MiGA::MiGA
|
|
7
7
|
|
8
8
|
# Class-level
|
9
9
|
class << self
|
10
|
+
##
|
11
|
+
# Directories containing the results from dataset-specific tasks
|
10
12
|
def RESULT_DIRS
|
11
13
|
@@RESULT_DIRS
|
12
14
|
end
|
13
15
|
|
16
|
+
##
|
17
|
+
# Supported dataset types
|
14
18
|
def KNOWN_TYPES
|
15
19
|
@@KNOWN_TYPES
|
16
20
|
end
|
17
21
|
|
22
|
+
##
|
23
|
+
# Returns an Array of tasks (Symbols) to be executed before project-wide
|
24
|
+
# tasks
|
18
25
|
def PREPROCESSING_TASKS
|
19
26
|
@@PREPROCESSING_TASKS
|
20
27
|
end
|
21
28
|
|
29
|
+
##
|
30
|
+
# Tasks to be excluded from query datasets
|
31
|
+
def EXCLUDE_NOREF_TASKS
|
32
|
+
@@EXCLUDE_NOREF_TASKS
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# Tasks to be executed only in datasets that are single-organism. These
|
37
|
+
# tasks are ignored for multi-organism datasets or for unknown types
|
38
|
+
def ONLY_NONMULTI_TASKS
|
39
|
+
@@ONLY_NONMULTI_TASKS
|
40
|
+
end
|
41
|
+
|
42
|
+
##
|
43
|
+
# Tasks to be executed only in datasets that are multi-organism. These
|
44
|
+
# tasks are ignored for single-organism datasets or for unknwon types
|
45
|
+
def ONLY_MULTI_TASKS
|
46
|
+
@@ONLY_MULTI_TASKS
|
47
|
+
end
|
48
|
+
|
49
|
+
##
|
50
|
+
# Options supported by datasets
|
22
51
|
def OPTIONS
|
23
52
|
@@OPTIONS
|
24
53
|
end
|
@@ -69,7 +98,7 @@ module MiGA::Dataset::Base
|
|
69
98
|
}
|
70
99
|
|
71
100
|
##
|
72
|
-
# Returns an Array of tasks to be executed before project-wide tasks
|
101
|
+
# Returns an Array of tasks (Symbols) to be executed before project-wide tasks
|
73
102
|
@@PREPROCESSING_TASKS = [
|
74
103
|
:raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
|
75
104
|
:assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
|
@@ -77,19 +106,19 @@ module MiGA::Dataset::Base
|
|
77
106
|
]
|
78
107
|
|
79
108
|
##
|
80
|
-
# Tasks to be excluded from query datasets
|
109
|
+
# Tasks to be excluded from query datasets
|
81
110
|
@@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
|
82
111
|
@@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
|
83
112
|
|
84
113
|
##
|
85
|
-
# Tasks to be executed only in datasets that are not multi-organism. These
|
86
|
-
# tasks are ignored for multi-organism datasets or for unknown types
|
114
|
+
# Tasks to be executed only in datasets that are single-organism. These
|
115
|
+
# tasks are ignored for multi-organism datasets or for unknown types
|
87
116
|
@@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances]
|
88
117
|
@@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
|
89
118
|
|
90
119
|
##
|
91
120
|
# Tasks to be executed only in datasets that are multi-organism. These
|
92
|
-
# tasks are ignored for single-organism datasets or for unknwon types
|
121
|
+
# tasks are ignored for single-organism datasets or for unknwon types
|
93
122
|
@@ONLY_MULTI_TASKS = [:mytaxa]
|
94
123
|
@@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
|
95
124
|
|