miga-base 1.2.15.0 → 1.2.15.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,10 +1,15 @@
1
1
  # @package MiGA
2
2
  # @license Artistic-2.0
3
3
 
4
- require 'miga/cli/action/doctor/base'
5
-
6
4
  class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
5
+ require 'miga/cli/action/doctor/base'
6
+ require 'miga/cli/action/doctor/databases'
7
+ require 'miga/cli/action/doctor/distances'
8
+ require 'miga/cli/action/doctor/operations'
7
9
  include MiGA::Cli::Action::Doctor::Base
10
+ include MiGA::Cli::Action::Doctor::Databases
11
+ include MiGA::Cli::Action::Doctor::Distances
12
+ include MiGA::Cli::Action::Doctor::Operations
8
13
 
9
14
  def parse_cli
10
15
  cli.defaults = { threads: 1 }
@@ -59,289 +64,4 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
59
64
  @@OPERATIONS
60
65
  end
61
66
  end
62
-
63
- ##
64
- # Perform status operation with MiGA::Cli +cli+
65
- def check_status(cli)
66
- cli.say 'Updating metadata status'
67
- p = cli.load_project
68
- n = p.dataset_names.size
69
- (0 .. cli[:threads] - 1).map do |i|
70
- Process.fork do
71
- k = 0
72
- cli.load_project.each_dataset do |d|
73
- k += 1
74
- cli.advance('Datasets:', k, n, false) if i == 0
75
- d.recalculate_status if k % cli[:threads] == i
76
- end
77
- end
78
- end
79
- Process.waitall
80
- cli.say
81
- end
82
-
83
- ##
84
- # Perform databases operation with MiGA::Cli +cli+
85
- def check_db(cli)
86
- cli.say 'Checking integrity of databases'
87
- p = cli.load_project
88
- n = p.dataset_names.size
89
- (0 .. cli[:threads] - 1).map do |i|
90
- Process.fork do
91
- k = 0
92
- p.each_dataset do |d|
93
- k += 1
94
- cli.advance('Datasets:', k, n, false) if i == 0
95
- next unless k % cli[:threads] == i
96
- each_database_file(d) do |db_file, metric, result, _rank|
97
- check_sqlite3_database(db_file, metric) do
98
- cli.say(
99
- " > Removing malformed database from #{d.name}:#{result} "
100
- )
101
- File.unlink(db_file)
102
- r = d.result(result) or next
103
- [r.path(:done), r.path].each do |f|
104
- File.unlink(f) if File.exist?(f)
105
- end
106
- end
107
- end
108
- end
109
- end
110
- end
111
- Process.waitall
112
- cli.say
113
- end
114
-
115
- ##
116
- # Perform bidirectional operation with MiGA::Cli +cli+
117
- def check_bidir(cli)
118
- cli.say 'Checking if reference distances are bidirectional'
119
- project = cli.load_project
120
- ref_ds = project.each_dataset.select(&:ref?)
121
- ref_names = ref_ds.map(&:name)
122
- n = ref_ds.size
123
-
124
- # Read data first (threaded)
125
- tmp = File.join(project.path, 'doctor-bidirectional.tmp')
126
- FileUtils.mkdir_p(tmp)
127
- MiGA::Parallel.process(cli[:threads]) do |thr|
128
- file = File.join(tmp, "#{thr}.json")
129
- fh = File.open(file, 'w')
130
- [:aai, :ani].each do |metric|
131
- fh.puts "# #{metric}"
132
- ref_ds.each_with_index do |ds, idx|
133
- if idx % cli[:threads] == thr
134
- cli.advance('Reading:', idx + 1, n, false) if thr == 0
135
- row = read_bidirectional(ds, metric)
136
- fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
137
- end
138
- end
139
- end
140
- fh.puts '# end'
141
- fh.flush # necessary for large threaded runs
142
- fh.close
143
- if thr == 0
144
- cli.advance('Reading:', n, n, false)
145
- cli.say
146
- end
147
- end
148
-
149
- # Merge pieces per thread
150
- dist = { aai: {}, ani: {} }
151
- cli[:threads].times do |i|
152
- cli.advance('Merging:', i + 1, cli[:threads], false)
153
- file = File.join(tmp, "#{i}.json")
154
- File.open(file, 'r') do |fh|
155
- metric = nil
156
- fh.each do |ln|
157
- qry, row = ln.chomp.split(' ', 2)
158
- if qry == '#'
159
- metric = row.to_sym
160
- else
161
- raise "Unrecognized metric: #{metric}" unless dist[metric]
162
- JSON.parse(row).each do |sbj, val|
163
- dist[metric][qry] ||= {}
164
- if dist[metric][sbj]&.include?(qry)
165
- dist[metric][sbj].delete(qry) # Already bidirectional
166
- else
167
- dist[metric][qry][sbj] = val
168
- end
169
- end
170
- end
171
- end
172
- raise "Incomplete thread dump: #{file}" unless metric == :end
173
- end
174
- end
175
- cli.say
176
- FileUtils.rm_rf(tmp)
177
-
178
- # Write missing values (threaded)
179
- MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
180
- cli.advance('Datasets:', idx + 1, n, false) if thr == 0
181
- save_bidirectional(ds, dist)
182
- end
183
- cli.say
184
- end
185
-
186
- ##
187
- # Perform distances operation with MiGA::Cli +cli+
188
- def check_dist(cli)
189
- p = cli.load_project
190
- %i[ani aai].each do |dist|
191
- res = p.result("#{dist}_distances")
192
- next if res.nil?
193
-
194
- cli.say "Checking #{dist} table for consistent datasets"
195
- notok, fix = check_dist_eval(cli, p, res)
196
- check_dist_fix(cli, p, fix)
197
- check_dist_recompute(cli, res, notok)
198
- end
199
- end
200
-
201
- ##
202
- # Perform files operation with MiGA::Cli +cli+
203
- def check_files(cli)
204
- cli.say 'Looking for outdated files in results'
205
- n, k = cli.load_project.dataset_names.size, 0
206
- cli.load_project.each_dataset do |d|
207
- cli.advance('Datasets:', k += 1, n, false)
208
- d.each_result do |r_k, r|
209
- ok = true
210
- r.each_file do |_f_sym, _f_rel, f_abs|
211
- unless File.exist? f_abs
212
- ok = false
213
- break
214
- end
215
- end
216
- unless ok
217
- cli.say " > Registering again #{d.name}:#{r_k} "
218
- d.add_result(r_k, true, force: true)
219
- sr = d.result(:stats) and sr.remove!
220
- end
221
- end
222
- end
223
- cli.say
224
- end
225
-
226
- ##
227
- # Perform cds operation with MiGA::Cli +cli+
228
- def check_cds(cli)
229
- cli.say 'Looking for unzipped genes or proteins'
230
- n, k = cli.load_project.dataset_names.size, 0
231
- cli.load_project.each_dataset do |d|
232
- cli.advance('Datasets:', k += 1, n, false)
233
- res = d.result(:cds) or next
234
- changed = false
235
- %i[genes proteins gff3 gff2 tab].each do |f|
236
- file = res.file_path(f) or next
237
- if file !~ /\.gz/
238
- cli.say " > Gzipping #{d.name} #{f} "
239
- run_cmd(['gzip', '-9', file])
240
- changed = true
241
- end
242
- end
243
- if changed
244
- d.add_result(:cds, true, force: true)
245
- sr = d.result(:stats) and sr.remove!
246
- end
247
- end
248
- cli.say
249
- end
250
-
251
- ##
252
- # Perform essential-genes operation with MiGA::Cli +cli+
253
- def check_ess(cli)
254
- cli.say 'Looking for outdated essential genes'
255
- cli.load_project.each_dataset do |d|
256
- res = d.result(:essential_genes)
257
- next if res.nil?
258
-
259
- dir = res.file_path(:collection)
260
- if dir.nil? || outdated_fastaai_ess(res)
261
- cli.say " > Removing #{d.name}:essential_genes"
262
- res.remove!
263
- d.result(:stats)&.remove!
264
- next
265
- end
266
- next if Dir["#{dir}/*.faa"].empty?
267
-
268
- cli.say " > Fixing #{d.name}"
269
- run_cmd <<~CMD
270
- cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
271
- CMD
272
- end
273
- end
274
-
275
- ##
276
- # Check if the essential genes result +res+ has an outdated FastAAI index
277
- def outdated_fastaai_ess(res)
278
- idx1 = res.file_path(:fastaai_index)
279
- idx2 = res.file_path(:fastaai_index_2)
280
- idx2.nil? && !idx1.nil?
281
- end
282
-
283
- ##
284
- # Perform mytaxa-scan operation with MiGA::Cli +cli+
285
- def check_mts(cli)
286
- cli.say 'Looking for unarchived MyTaxa Scan runs'
287
- cli.load_project.each_dataset do |d|
288
- res = d.result(:mytaxa_scan)
289
- next if res.nil?
290
-
291
- dir = res.file_path(:regions)
292
- fix = false
293
- unless dir.nil?
294
- if Dir.exist? dir
295
- run_cmd <<~CMD
296
- cd #{dir.shellescape}/.. \
297
- && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
298
- && rm -r '#{d.name}.reg'
299
- CMD
300
- end
301
- fix = true
302
- end
303
- %i[blast mytaxain wintax gene_ids region_ids].each do |ext|
304
- file = res.file_path(ext)
305
- unless file.nil?
306
- FileUtils.rm(file) if File.exist? file
307
- fix = true
308
- end
309
- end
310
- if fix
311
- cli.say " > Fixing #{d.name}"
312
- d.add_result(:mytaxa_scan, true, force: true)
313
- end
314
- end
315
- end
316
-
317
- ##
318
- # Perform start operation with MiGA::Cli +cli+
319
- def check_start(cli)
320
- cli.say 'Looking for legacy .start files lingering'
321
- cli.load_project.each_dataset do |d|
322
- d.each_result do |r_k, r|
323
- if File.exist? r.path(:start)
324
- cli.say " > Registering again #{d.name}:#{r_k}"
325
- r.save
326
- end
327
- end
328
- end
329
- end
330
-
331
- ##
332
- # Perform taxonomy operation with MiGA::Cli +cli+
333
- def check_tax(cli)
334
- # cli.say 'o Checking for taxonomy/distances consistency'
335
- # TODO: Find 95%ANI clusters with entries from different species
336
- # TODO: Find different 95%ANI clusters with genomes from the same species
337
- # TODO: Find AAI values too high or too low for each LCA rank
338
- end
339
-
340
- ##
341
- # Run command +cmd+ with options +opts+
342
- def run_cmd(cmd, opts = {})
343
- opts = { return: :output, err2out: true, raise: false }.merge(opts)
344
- cmdo = MiGA::MiGA.run_cmd(cmd, opts).chomp
345
- warn(cmdo) unless cmdo.empty?
346
- end
347
67
  end
@@ -7,6 +7,18 @@ end
7
7
  ##
8
8
  # Helper module including download functions for the *_get actions
9
9
  module MiGA::Cli::Action::Download::Base
10
+ def cli_base_flags(opt)
11
+ opt.on(
12
+ '--max INT', Integer,
13
+ 'Maximum number of datasets to download (by default: unlimited)'
14
+ ) { |v| cli[:max_datasets] = v }
15
+ opt.on(
16
+ '-m', '--metadata STRING',
17
+ 'Metadata as key-value pairs separated by = and delimited by comma',
18
+ 'Values are saved as strings except for booleans (true / false) or nil'
19
+ ) { |v| cli[:metadata] = v }
20
+ end
21
+
10
22
  def cli_filters(opt)
11
23
  opt.on(
12
24
  '--exclude PATH',
@@ -53,6 +65,40 @@ module MiGA::Cli::Action::Download::Base
53
65
  ) { |v| cli[:remote_list] = v }
54
66
  end
55
67
 
68
+ def generic_perform
69
+ p, ds = load_tasks
70
+ d, downloaded = download_entries(ds, p)
71
+
72
+ # Finalize
73
+ finalize_tasks(d, downloaded)
74
+ unlink_entries(p, p.dataset_names - d) if cli[:unlink]
75
+ end
76
+
77
+ def load_tasks
78
+ sanitize_cli
79
+ p = cli.load_project
80
+ ds = remote_list
81
+ ds = discard_excluded(ds)
82
+ ds = impose_limit(ds)
83
+ [p, ds]
84
+ end
85
+
86
+ def finalize_tasks(d, downloaded)
87
+ cli.say "Datasets listed: #{d.size}"
88
+ act = cli[:dry] ? 'to download' : 'downloaded'
89
+ cli.say "Datasets #{act}: #{downloaded}"
90
+ unless cli[:remote_list].nil?
91
+ File.open(cli[:remote_list], 'w') do |fh|
92
+ d.each { |i| fh.puts i }
93
+ end
94
+ end
95
+ end
96
+
97
+ def unlink_entries(p, unlink)
98
+ unlink.each { |i| p.unlink_dataset(i).remove! }
99
+ cli.say "Datasets unlinked: #{unlink.size}"
100
+ end
101
+
56
102
  def discard_excluded(ds)
57
103
  unless cli[:exclude].nil?
58
104
  cli.say "Discarding datasets in #{cli[:exclude]}"
@@ -105,7 +151,7 @@ module MiGA::Cli::Action::Download::Base
105
151
  # Saves the (generic remote) entry identified by +name+ with +body+ into the
106
152
  # project +p+, and returns +true+ on success and +false+ otherwise
107
153
  def save_entry(name, body, p)
108
- cli.say ' Locating remote dataset'
154
+ cli.say " Locating remote dataset: #{name}"
109
155
  body[:md][:metadata_only] = true if cli[:only_md]
110
156
  rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
111
157
  if cli[:get_md]
@@ -119,6 +165,7 @@ module MiGA::Cli::Action::Download::Base
119
165
  true
120
166
  rescue MiGA::RemoteDataMissingError => e
121
167
  raise(e) unless cli[:ignore_removed]
168
+ cli.say " Removed dataset ignored: #{name}"
122
169
  false
123
170
  end
124
171
  end
@@ -18,15 +18,7 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
18
18
  '-T', '--taxon STRING',
19
19
  '(Mandatory) Taxon name in GTDB format (e.g., g__Escherichia)'
20
20
  ) { |v| cli[:taxon] = v }
21
- opt.on(
22
- '--max INT', Integer,
23
- 'Maximum number of datasets to download (by default: unlimited)'
24
- ) { |v| cli[:max_datasets] = v }
25
- opt.on(
26
- '-m', '--metadata STRING',
27
- 'Metadata as key-value pairs separated by = and delimited by comma',
28
- 'Values are saved as strings except for booleans (true / false) or nil'
29
- ) { |v| cli[:metadata] = v }
21
+ cli_base_flags(opt)
30
22
  cli_task_flags(opt)
31
23
  cli_name_modifiers(opt)
32
24
  cli_filters(opt)
@@ -39,27 +31,6 @@ class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
39
31
  end
40
32
 
41
33
  def perform
42
- sanitize_cli
43
- p = cli.load_project
44
- ds = remote_list
45
- ds = discard_excluded(ds)
46
- ds = impose_limit(ds)
47
- d, downloaded = download_entries(ds, p)
48
-
49
- # Finalize
50
- cli.say "Datasets listed: #{d.size}"
51
- act = cli[:dry] ? 'to download' : 'downloaded'
52
- cli.say "Datasets #{act}: #{downloaded}"
53
- unless cli[:remote_list].nil?
54
- File.open(cli[:remote_list], 'w') do |fh|
55
- d.each { |i| fh.puts i }
56
- end
57
- end
58
- return unless cli[:unlink]
59
-
60
- unlink = p.dataset_names - d
61
- unlink.each { |i| p.unlink_dataset(i).remove! }
62
- cli.say "Datasets unlinked: #{unlink.size}"
34
+ generic_perform
63
35
  end
64
-
65
36
  end
@@ -20,48 +20,23 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
20
20
  '-T', '--taxon STRING',
21
21
  '(Mandatory) Taxon name (e.g., a species binomial)'
22
22
  ) { |v| cli[:taxon] = v }
23
- opt.on(
24
- '--max INT', Integer,
25
- 'Maximum number of datasets to download (by default: unlimited)'
26
- ) { |v| cli[:max_datasets] = v }
27
- opt.on(
28
- '-m', '--metadata STRING',
29
- 'Metadata as key-value pairs separated by = and delimited by comma',
30
- 'Values are saved as strings except for booleans (true / false) or nil'
31
- ) { |v| cli[:metadata] = v }
23
+ cli_base_flags(opt)
32
24
  cli_task_flags(opt)
33
25
  cli_name_modifiers(opt)
34
26
  cli_filters(opt)
35
27
  cli_save_actions(opt)
36
28
  opt.on(
37
29
  '--api-key STRING',
30
+ '::HIDE::' # For backwards compatibility
31
+ ) { |v| ENV['NCBI_API_KEY'] = v }
32
+ opt.on(
33
+ '--ncbi-api-key STRING',
38
34
  'NCBI API key'
39
35
  ) { |v| ENV['NCBI_API_KEY'] = v }
40
36
  end
41
37
  end
42
38
 
43
39
  def perform
44
- sanitize_cli
45
- p = cli.load_project
46
- ds = remote_list
47
- ds = discard_excluded(ds)
48
- ds = impose_limit(ds)
49
- d, downloaded = download_entries(ds, p)
50
-
51
- # Finalize
52
- cli.say "Datasets listed: #{d.size}"
53
- act = cli[:dry] ? 'to download' : 'downloaded'
54
- cli.say "Datasets #{act}: #{downloaded}"
55
- unless cli[:remote_list].nil?
56
- File.open(cli[:remote_list], 'w') do |fh|
57
- d.each { |i| fh.puts i }
58
- end
59
- end
60
- return unless cli[:unlink]
61
-
62
- unlink = p.dataset_names - d
63
- unlink.each { |i| p.unlink_dataset(i).remove! }
64
- cli.say "Datasets unlinked: #{unlink.size}"
40
+ generic_perform
65
41
  end
66
-
67
42
  end
@@ -43,7 +43,7 @@ module MiGA::Cli::OptHelper
43
43
  '-h', '--help',
44
44
  'Display this screen'
45
45
  ) do
46
- puts opt
46
+ puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '')
47
47
  exit
48
48
  end
49
49
  opt.separator ''
@@ -7,18 +7,47 @@ class MiGA::Dataset < MiGA::MiGA
7
7
 
8
8
  # Class-level
9
9
  class << self
10
+ ##
11
+ # Directories containing the results from dataset-specific tasks
10
12
  def RESULT_DIRS
11
13
  @@RESULT_DIRS
12
14
  end
13
15
 
16
+ ##
17
+ # Supported dataset types
14
18
  def KNOWN_TYPES
15
19
  @@KNOWN_TYPES
16
20
  end
17
21
 
22
+ ##
23
+ # Returns an Array of tasks (Symbols) to be executed before project-wide
24
+ # tasks
18
25
  def PREPROCESSING_TASKS
19
26
  @@PREPROCESSING_TASKS
20
27
  end
21
28
 
29
+ ##
30
+ # Tasks to be excluded from query datasets
31
+ def EXCLUDE_NOREF_TASKS
32
+ @@EXCLUDE_NOREF_TASKS
33
+ end
34
+
35
+ ##
36
+ # Tasks to be executed only in datasets that are single-organism. These
37
+ # tasks are ignored for multi-organism datasets or for unknown types
38
+ def ONLY_NONMULTI_TASKS
39
+ @@ONLY_NONMULTI_TASKS
40
+ end
41
+
42
+ ##
43
+ # Tasks to be executed only in datasets that are multi-organism. These
44
+ # tasks are ignored for single-organism datasets or for unknown types
45
+ def ONLY_MULTI_TASKS
46
+ @@ONLY_MULTI_TASKS
47
+ end
48
+
49
+ ##
50
+ # Options supported by datasets
22
51
  def OPTIONS
23
52
  @@OPTIONS
24
53
  end
@@ -69,7 +98,7 @@ module MiGA::Dataset::Base
69
98
  }
70
99
 
71
100
  ##
72
- # Returns an Array of tasks to be executed before project-wide tasks
101
+ # Returns an Array of tasks (Symbols) to be executed before project-wide tasks
73
102
  @@PREPROCESSING_TASKS = [
74
103
  :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
75
104
  :assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
@@ -77,19 +106,19 @@ module MiGA::Dataset::Base
77
106
  ]
78
107
 
79
108
  ##
80
- # Tasks to be excluded from query datasets.
109
+ # Tasks to be excluded from query datasets
81
110
  @@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
82
111
  @@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
83
112
 
84
113
  ##
85
- # Tasks to be executed only in datasets that are not multi-organism. These
86
- # tasks are ignored for multi-organism datasets or for unknown types.
114
+ # Tasks to be executed only in datasets that are single-organism. These
115
+ # tasks are ignored for multi-organism datasets or for unknown types
87
116
  @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances]
88
117
  @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
89
118
 
90
119
  ##
91
120
  # Tasks to be executed only in datasets that are multi-organism. These
92
- # tasks are ignored for single-organism datasets or for unknwon types.
121
+ # tasks are ignored for single-organism datasets or for unknown types
93
122
  @@ONLY_MULTI_TASKS = [:mytaxa]
94
123
  @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
95
124