miga-base 1.2.14.2 → 1.2.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,15 @@
1
1
  # @package MiGA
2
2
  # @license Artistic-2.0
3
3
 
4
- require 'miga/cli/action/doctor/base'
5
-
6
4
  class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
5
+ require 'miga/cli/action/doctor/base'
6
+ require 'miga/cli/action/doctor/databases'
7
+ require 'miga/cli/action/doctor/distances'
8
+ require 'miga/cli/action/doctor/operations'
7
9
  include MiGA::Cli::Action::Doctor::Base
10
+ include MiGA::Cli::Action::Doctor::Databases
11
+ include MiGA::Cli::Action::Doctor::Distances
12
+ include MiGA::Cli::Action::Doctor::Operations
8
13
 
9
14
  def parse_cli
10
15
  cli.defaults = { threads: 1 }
@@ -59,289 +64,4 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
59
64
  @@OPERATIONS
60
65
  end
61
66
  end
62
-
63
- ##
64
- # Perform status operation with MiGA::Cli +cli+
65
- def check_status(cli)
66
- cli.say 'Updating metadata status'
67
- p = cli.load_project
68
- n = p.dataset_names.size
69
- (0 .. cli[:threads] - 1).map do |i|
70
- Process.fork do
71
- k = 0
72
- cli.load_project.each_dataset do |d|
73
- k += 1
74
- cli.advance('Datasets:', k, n, false) if i == 0
75
- d.recalculate_status if k % cli[:threads] == i
76
- end
77
- end
78
- end
79
- Process.waitall
80
- cli.say
81
- end
82
-
83
- ##
84
- # Perform databases operation with MiGA::Cli +cli+
85
- def check_db(cli)
86
- cli.say 'Checking integrity of databases'
87
- p = cli.load_project
88
- n = p.dataset_names.size
89
- (0 .. cli[:threads] - 1).map do |i|
90
- Process.fork do
91
- k = 0
92
- p.each_dataset do |d|
93
- k += 1
94
- cli.advance('Datasets:', k, n, false) if i == 0
95
- next unless k % cli[:threads] == i
96
- each_database_file(d) do |db_file, metric, result, _rank|
97
- check_sqlite3_database(db_file, metric) do
98
- cli.say(
99
- " > Removing malformed database from #{d.name}:#{result} "
100
- )
101
- File.unlink(db_file)
102
- r = d.result(result) or next
103
- [r.path(:done), r.path].each do |f|
104
- File.unlink(f) if File.exist?(f)
105
- end
106
- end
107
- end
108
- end
109
- end
110
- end
111
- Process.waitall
112
- cli.say
113
- end
114
-
115
- ##
116
- # Perform bidirectional operation with MiGA::Cli +cli+
117
- def check_bidir(cli)
118
- cli.say 'Checking if reference distances are bidirectional'
119
- project = cli.load_project
120
- ref_ds = project.each_dataset.select(&:ref?)
121
- ref_names = ref_ds.map(&:name)
122
- n = ref_ds.size
123
-
124
- # Read data first (threaded)
125
- tmp = File.join(project.path, 'doctor-bidirectional.tmp')
126
- FileUtils.mkdir_p(tmp)
127
- MiGA::Parallel.process(cli[:threads]) do |thr|
128
- file = File.join(tmp, "#{thr}.json")
129
- fh = File.open(file, 'w')
130
- [:aai, :ani].each do |metric|
131
- fh.puts "# #{metric}"
132
- ref_ds.each_with_index do |ds, idx|
133
- if idx % cli[:threads] == thr
134
- cli.advance('Reading:', idx + 1, n, false) if thr == 0
135
- row = read_bidirectional(ds, metric)
136
- fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
137
- end
138
- end
139
- end
140
- fh.puts '# end'
141
- fh.flush # necessary for large threaded runs
142
- fh.close
143
- if thr == 0
144
- cli.advance('Reading:', n, n, false)
145
- cli.say
146
- end
147
- end
148
-
149
- # Merge pieces per thread
150
- dist = { aai: {}, ani: {} }
151
- cli[:threads].times do |i|
152
- cli.advance('Merging:', i + 1, cli[:threads], false)
153
- file = File.join(tmp, "#{i}.json")
154
- File.open(file, 'r') do |fh|
155
- metric = nil
156
- fh.each do |ln|
157
- qry, row = ln.chomp.split(' ', 2)
158
- if qry == '#'
159
- metric = row.to_sym
160
- else
161
- raise "Unrecognized metric: #{metric}" unless dist[metric]
162
- JSON.parse(row).each do |sbj, val|
163
- dist[metric][qry] ||= {}
164
- if dist[metric][sbj]&.include?(qry)
165
- dist[metric][sbj].delete(qry) # Already bidirectional
166
- else
167
- dist[metric][qry][sbj] = val
168
- end
169
- end
170
- end
171
- end
172
- raise "Incomplete thread dump: #{file}" unless metric == :end
173
- end
174
- end
175
- cli.say
176
- FileUtils.rm_rf(tmp)
177
-
178
- # Write missing values (threaded)
179
- MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
180
- cli.advance('Datasets:', idx + 1, n, false) if thr == 0
181
- save_bidirectional(ds, dist)
182
- end
183
- cli.say
184
- end
185
-
186
- ##
187
- # Perform distances operation with MiGA::Cli +cli+
188
- def check_dist(cli)
189
- p = cli.load_project
190
- %i[ani aai].each do |dist|
191
- res = p.result("#{dist}_distances")
192
- next if res.nil?
193
-
194
- cli.say "Checking #{dist} table for consistent datasets"
195
- notok, fix = check_dist_eval(cli, p, res)
196
- check_dist_fix(cli, p, fix)
197
- check_dist_recompute(cli, res, notok)
198
- end
199
- end
200
-
201
- ##
202
- # Perform files operation with MiGA::Cli +cli+
203
- def check_files(cli)
204
- cli.say 'Looking for outdated files in results'
205
- n, k = cli.load_project.dataset_names.size, 0
206
- cli.load_project.each_dataset do |d|
207
- cli.advance('Datasets:', k += 1, n, false)
208
- d.each_result do |r_k, r|
209
- ok = true
210
- r.each_file do |_f_sym, _f_rel, f_abs|
211
- unless File.exist? f_abs
212
- ok = false
213
- break
214
- end
215
- end
216
- unless ok
217
- cli.say " > Registering again #{d.name}:#{r_k} "
218
- d.add_result(r_k, true, force: true)
219
- sr = d.result(:stats) and sr.remove!
220
- end
221
- end
222
- end
223
- cli.say
224
- end
225
-
226
- ##
227
- # Perform cds operation with MiGA::Cli +cli+
228
- def check_cds(cli)
229
- cli.say 'Looking for unzipped genes or proteins'
230
- n, k = cli.load_project.dataset_names.size, 0
231
- cli.load_project.each_dataset do |d|
232
- cli.advance('Datasets:', k += 1, n, false)
233
- res = d.result(:cds) or next
234
- changed = false
235
- %i[genes proteins gff3 gff2 tab].each do |f|
236
- file = res.file_path(f) or next
237
- if file !~ /\.gz/
238
- cli.say " > Gzipping #{d.name} #{f} "
239
- run_cmd(['gzip', '-9', file])
240
- changed = true
241
- end
242
- end
243
- if changed
244
- d.add_result(:cds, true, force: true)
245
- sr = d.result(:stats) and sr.remove!
246
- end
247
- end
248
- cli.say
249
- end
250
-
251
- ##
252
- # Perform essential-genes operation with MiGA::Cli +cli+
253
- def check_ess(cli)
254
- cli.say 'Looking for outdated essential genes'
255
- cli.load_project.each_dataset do |d|
256
- res = d.result(:essential_genes)
257
- next if res.nil?
258
-
259
- dir = res.file_path(:collection)
260
- if dir.nil? || outdated_fastaai_ess(res)
261
- cli.say " > Removing #{d.name}:essential_genes"
262
- res.remove!
263
- d.result(:stats)&.remove!
264
- next
265
- end
266
- next if Dir["#{dir}/*.faa"].empty?
267
-
268
- cli.say " > Fixing #{d.name}"
269
- run_cmd <<~CMD
270
- cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
271
- CMD
272
- end
273
- end
274
-
275
- ##
276
- # Check if the essential genes result +res+ has an outdated FastAAI index
277
- def outdated_fastaai_ess(res)
278
- idx1 = res.file_path(:fastaai_index)
279
- idx2 = res.file_path(:fastaai_index_2)
280
- idx2.nil? && !idx1.nil?
281
- end
282
-
283
- ##
284
- # Perform mytaxa-scan operation with MiGA::Cli +cli+
285
- def check_mts(cli)
286
- cli.say 'Looking for unarchived MyTaxa Scan runs'
287
- cli.load_project.each_dataset do |d|
288
- res = d.result(:mytaxa_scan)
289
- next if res.nil?
290
-
291
- dir = res.file_path(:regions)
292
- fix = false
293
- unless dir.nil?
294
- if Dir.exist? dir
295
- run_cmd <<~CMD
296
- cd #{dir.shellescape}/.. \
297
- && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
298
- && rm -r '#{d.name}.reg'
299
- CMD
300
- end
301
- fix = true
302
- end
303
- %i[blast mytaxain wintax gene_ids region_ids].each do |ext|
304
- file = res.file_path(ext)
305
- unless file.nil?
306
- FileUtils.rm(file) if File.exist? file
307
- fix = true
308
- end
309
- end
310
- if fix
311
- cli.say " > Fixing #{d.name}"
312
- d.add_result(:mytaxa_scan, true, force: true)
313
- end
314
- end
315
- end
316
-
317
- ##
318
- # Perform start operation with MiGA::Cli +cli+
319
- def check_start(cli)
320
- cli.say 'Looking for legacy .start files lingering'
321
- cli.load_project.each_dataset do |d|
322
- d.each_result do |r_k, r|
323
- if File.exist? r.path(:start)
324
- cli.say " > Registering again #{d.name}:#{r_k}"
325
- r.save
326
- end
327
- end
328
- end
329
- end
330
-
331
- ##
332
- # Perform taxonomy operation with MiGA::Cli +cli+
333
- def check_tax(cli)
334
- # cli.say 'o Checking for taxonomy/distances consistency'
335
- # TODO: Find 95%ANI clusters with entries from different species
336
- # TODO: Find different 95%ANI clusters with genomes from the same species
337
- # TODO: Find AAI values too high or too low for each LCA rank
338
- end
339
-
340
- ##
341
- # Run command +cmd+ with options +opts+
342
- def run_cmd(cmd, opts = {})
343
- opts = { return: :output, err2out: true, raise: false }.merge(opts)
344
- cmdo = MiGA::MiGA.run_cmd(cmd, opts).chomp
345
- warn(cmdo) unless cmdo.empty?
346
- end
347
67
  end
@@ -7,6 +7,18 @@ end
7
7
  ##
8
8
  # Helper module including download functions for the *_get actions
9
9
  module MiGA::Cli::Action::Download::Base
10
+ def cli_base_flags(opt)
11
+ opt.on(
12
+ '--max INT', Integer,
13
+ 'Maximum number of datasets to download (by default: unlimited)'
14
+ ) { |v| cli[:max_datasets] = v }
15
+ opt.on(
16
+ '-m', '--metadata STRING',
17
+ 'Metadata as key-value pairs separated by = and delimited by comma',
18
+ 'Values are saved as strings except for booleans (true / false) or nil'
19
+ ) { |v| cli[:metadata] = v }
20
+ end
21
+
10
22
  def cli_filters(opt)
11
23
  opt.on(
12
24
  '--exclude PATH',
@@ -17,6 +29,10 @@ module MiGA::Cli::Action::Download::Base
17
29
  '--ignore-until STRING',
18
30
  'Ignores all datasets until a name is found (useful for large reruns)'
19
31
  ) { |v| cli[:ignore_until] = v }
32
+ opt.on(
33
+ '--ignore-removed',
34
+ 'Ignores entries removed from NCBI (by default fails on removed entries)'
35
+ ) { |v| cli[:ignore_removed] = v }
20
36
  cli.opt_flag(
21
37
  opt, 'get-metadata',
22
38
  'Only download and update metadata for existing datasets', :get_md
@@ -49,6 +65,40 @@ module MiGA::Cli::Action::Download::Base
49
65
  ) { |v| cli[:remote_list] = v }
50
66
  end
51
67
 
68
+ def generic_perform
69
+ p, ds = load_tasks
70
+ d, downloaded = download_entries(ds, p)
71
+
72
+ # Finalize
73
+ finalize_tasks(d, downloaded)
74
+ unlink_entries(p, p.dataset_names - d) if cli[:unlink]
75
+ end
76
+
77
+ def load_tasks
78
+ sanitize_cli
79
+ p = cli.load_project
80
+ ds = remote_list
81
+ ds = discard_excluded(ds)
82
+ ds = impose_limit(ds)
83
+ [p, ds]
84
+ end
85
+
86
+ def finalize_tasks(d, downloaded)
87
+ cli.say "Datasets listed: #{d.size}"
88
+ act = cli[:dry] ? 'to download' : 'downloaded'
89
+ cli.say "Datasets #{act}: #{downloaded}"
90
+ unless cli[:remote_list].nil?
91
+ File.open(cli[:remote_list], 'w') do |fh|
92
+ d.each { |i| fh.puts i }
93
+ end
94
+ end
95
+ end
96
+
97
+ def unlink_entries(p, unlink)
98
+ unlink.each { |i| p.unlink_dataset(i).remove! }
99
+ cli.say "Datasets unlinked: #{unlink.size}"
100
+ end
101
+
52
102
  def discard_excluded(ds)
53
103
  unless cli[:exclude].nil?
54
104
  cli.say "Discarding datasets in #{cli[:exclude]}"
@@ -84,7 +134,11 @@ module MiGA::Cli::Action::Download::Base
84
134
 
85
135
  downloaded += 1
86
136
  unless cli[:dry]
87
- save_entry(name, body, p)
137
+ unless save_entry(name, body, p)
138
+ downloaded -= 1
139
+ d.pop
140
+ next
141
+ end
88
142
  p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
89
143
  end
90
144
  end
@@ -93,8 +147,11 @@ module MiGA::Cli::Action::Download::Base
93
147
  [d, downloaded]
94
148
  end
95
149
 
150
+ ##
151
+ # Saves the (generic remote) entry identified by +name+ with +body+ into the
152
+ # project +p+, and returns +true+ on success and +false+ otherwise
96
153
  def save_entry(name, body, p)
97
- cli.say ' Locating remote dataset'
154
+ cli.say " Locating remote dataset: #{name}"
98
155
  body[:md][:metadata_only] = true if cli[:only_md]
99
156
  rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
100
157
  if cli[:get_md]
@@ -105,5 +162,10 @@ module MiGA::Cli::Action::Download::Base
105
162
  rd.save_to(p, name, !cli[:query], body[:md])
106
163
  cli.add_metadata(p.add_dataset(name))
107
164
  end
165
+ true
166
+ rescue MiGA::RemoteDataMissingError => e
167
+ raise(e) unless cli[:ignore_removed]
168
+ cli.say " Removed dataset ignored: #{name}"
169
+ false
108
170
  end
109
171
  end
@@ -18,15 +18,7 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
18
18
  '-T', '--taxon STRING',
19
19
  '(Mandatory) Taxon name in GTDB format (e.g., g__Escherichia)'
20
20
  ) { |v| cli[:taxon] = v }
21
- opt.on(
22
- '--max INT', Integer,
23
- 'Maximum number of datasets to download (by default: unlimited)'
24
- ) { |v| cli[:max_datasets] = v }
25
- opt.on(
26
- '-m', '--metadata STRING',
27
- 'Metadata as key-value pairs separated by = and delimited by comma',
28
- 'Values are saved as strings except for booleans (true / false) or nil'
29
- ) { |v| cli[:metadata] = v }
21
+ cli_base_flags(opt)
30
22
  cli_task_flags(opt)
31
23
  cli_name_modifiers(opt)
32
24
  cli_filters(opt)
@@ -39,27 +31,6 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
39
31
  end
40
32
 
41
33
  def perform
42
- sanitize_cli
43
- p = cli.load_project
44
- ds = remote_list
45
- ds = discard_excluded(ds)
46
- ds = impose_limit(ds)
47
- d, downloaded = download_entries(ds, p)
48
-
49
- # Finalize
50
- cli.say "Datasets listed: #{d.size}"
51
- act = cli[:dry] ? 'to download' : 'downloaded'
52
- cli.say "Datasets #{act}: #{downloaded}"
53
- unless cli[:remote_list].nil?
54
- File.open(cli[:remote_list], 'w') do |fh|
55
- d.each { |i| fh.puts i }
56
- end
57
- end
58
- return unless cli[:unlink]
59
-
60
- unlink = p.dataset_names - d
61
- unlink.each { |i| p.unlink_dataset(i).remove! }
62
- cli.say "Datasets unlinked: #{unlink.size}"
34
+ generic_perform
63
35
  end
64
-
65
36
  end
@@ -20,48 +20,23 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
20
20
  '-T', '--taxon STRING',
21
21
  '(Mandatory) Taxon name (e.g., a species binomial)'
22
22
  ) { |v| cli[:taxon] = v }
23
- opt.on(
24
- '--max INT', Integer,
25
- 'Maximum number of datasets to download (by default: unlimited)'
26
- ) { |v| cli[:max_datasets] = v }
27
- opt.on(
28
- '-m', '--metadata STRING',
29
- 'Metadata as key-value pairs separated by = and delimited by comma',
30
- 'Values are saved as strings except for booleans (true / false) or nil'
31
- ) { |v| cli[:metadata] = v }
23
+ cli_base_flags(opt)
32
24
  cli_task_flags(opt)
33
25
  cli_name_modifiers(opt)
34
26
  cli_filters(opt)
35
27
  cli_save_actions(opt)
36
28
  opt.on(
37
29
  '--api-key STRING',
30
+ '::HIDE::' # For backwards compatibility
31
+ ) { |v| ENV['NCBI_API_KEY'] = v }
32
+ opt.on(
33
+ '--ncbi-api-key STRING',
38
34
  'NCBI API key'
39
35
  ) { |v| ENV['NCBI_API_KEY'] = v }
40
36
  end
41
37
  end
42
38
 
43
39
  def perform
44
- sanitize_cli
45
- p = cli.load_project
46
- ds = remote_list
47
- ds = discard_excluded(ds)
48
- ds = impose_limit(ds)
49
- d, downloaded = download_entries(ds, p)
50
-
51
- # Finalize
52
- cli.say "Datasets listed: #{d.size}"
53
- act = cli[:dry] ? 'to download' : 'downloaded'
54
- cli.say "Datasets #{act}: #{downloaded}"
55
- unless cli[:remote_list].nil?
56
- File.open(cli[:remote_list], 'w') do |fh|
57
- d.each { |i| fh.puts i }
58
- end
59
- end
60
- return unless cli[:unlink]
61
-
62
- unlink = p.dataset_names - d
63
- unlink.each { |i| p.unlink_dataset(i).remove! }
64
- cli.say "Datasets unlinked: #{unlink.size}"
40
+ generic_perform
65
41
  end
66
-
67
42
  end
@@ -43,7 +43,7 @@ module MiGA::Cli::OptHelper
43
43
  '-h', '--help',
44
44
  'Display this screen'
45
45
  ) do
46
- puts opt
46
+ puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '')
47
47
  exit
48
48
  end
49
49
  opt.separator ''
@@ -9,4 +9,14 @@ module MiGA
9
9
  # An error with a system call
10
10
  class SystemCallError < Error
11
11
  end
12
+
13
+ ##
14
+ # An error with remote data
15
+ class RemoteDataError < Error
16
+ end
17
+
18
+ ##
19
+ # An error caused by missing remote data
20
+ class RemoteDataMissingError < RemoteDataError
21
+ end
12
22
  end
@@ -7,18 +7,47 @@ class MiGA::Dataset < MiGA::MiGA
7
7
 
8
8
  # Class-level
9
9
  class << self
10
+ ##
11
+ # Directories containing the results from dataset-specific tasks
10
12
  def RESULT_DIRS
11
13
  @@RESULT_DIRS
12
14
  end
13
15
 
16
+ ##
17
+ # Supported dataset types
14
18
  def KNOWN_TYPES
15
19
  @@KNOWN_TYPES
16
20
  end
17
21
 
22
+ ##
23
+ # Returns an Array of tasks (Symbols) to be executed before project-wide
24
+ # tasks
18
25
  def PREPROCESSING_TASKS
19
26
  @@PREPROCESSING_TASKS
20
27
  end
21
28
 
29
+ ##
30
+ # Tasks to be excluded from query datasets
31
+ def EXCLUDE_NOREF_TASKS
32
+ @@EXCLUDE_NOREF_TASKS
33
+ end
34
+
35
+ ##
36
+ # Tasks to be executed only in datasets that are single-organism. These
37
+ # tasks are ignored for multi-organism datasets or for unknown types
38
+ def ONLY_NONMULTI_TASKS
39
+ @@ONLY_NONMULTI_TASKS
40
+ end
41
+
42
+ ##
43
+ # Tasks to be executed only in datasets that are multi-organism. These
44
+ # tasks are ignored for single-organism datasets or for unknown types
45
+ def ONLY_MULTI_TASKS
46
+ @@ONLY_MULTI_TASKS
47
+ end
48
+
49
+ ##
50
+ # Options supported by datasets
22
51
  def OPTIONS
23
52
  @@OPTIONS
24
53
  end
@@ -69,7 +98,7 @@ module MiGA::Dataset::Base
69
98
  }
70
99
 
71
100
  ##
72
- # Returns an Array of tasks to be executed before project-wide tasks
101
+ # Returns an Array of tasks (Symbols) to be executed before project-wide tasks
73
102
  @@PREPROCESSING_TASKS = [
74
103
  :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
75
104
  :assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
@@ -77,19 +106,19 @@ module MiGA::Dataset::Base
77
106
  ]
78
107
 
79
108
  ##
80
- # Tasks to be excluded from query datasets.
109
+ # Tasks to be excluded from query datasets
81
110
  @@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
82
111
  @@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
83
112
 
84
113
  ##
85
- # Tasks to be executed only in datasets that are not multi-organism. These
86
- # tasks are ignored for multi-organism datasets or for unknown types.
114
+ # Tasks to be executed only in datasets that are single-organism. These
115
+ # tasks are ignored for multi-organism datasets or for unknown types
87
116
  @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances]
88
117
  @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
89
118
 
90
119
  ##
91
120
  # Tasks to be executed only in datasets that are multi-organism. These
92
- # tasks are ignored for single-organism datasets or for unknwon types.
121
+ # tasks are ignored for single-organism datasets or for unknwon types
93
122
  @@ONLY_MULTI_TASKS = [:mytaxa]
94
123
  @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
95
124