miga-base 1.2.14.2 → 1.2.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'miga/result'
2
4
  require 'miga/dataset/base'
3
5
  require 'miga/common/with_result'
@@ -13,56 +15,17 @@ module MiGA::Dataset::Result
13
15
  include MiGA::Dataset::Base
14
16
  include MiGA::Common::WithResult
15
17
 
18
+ require 'miga/dataset/result/ignore'
19
+ require 'miga/dataset/result/add'
20
+ include MiGA::Dataset::Result::Ignore
21
+ include MiGA::Dataset::Result::Add
22
+
16
23
  ##
17
24
  # Return the basename for results
18
25
  def result_base
19
26
  name
20
27
  end
21
28
 
22
- ##
23
- # Should I ignore +task+ for this dataset?
24
- def ignore_task?(task)
25
- why_ignore(task) != :execute
26
- end
27
-
28
- ##
29
- # Return a code explaining why a task is ignored.
30
- # The values are symbols:
31
- # - empty: the dataset has no data
32
- # - inactive: the dataset is inactive
33
- # - upstream: the task is upstream from dataset's input
34
- # - force: forced to ignore by metadata
35
- # - project: incompatible project
36
- # - noref: incompatible dataset, only for reference
37
- # - multi: incompatible dataset, only for multi
38
- # - nonmulti: incompatible dataset, only for nonmulti
39
- # - complete: the task is already complete
40
- # - execute: do not ignore, execute the task
41
- def why_ignore(task)
42
- if !get_result(task).nil?
43
- :complete
44
- elsif !active?
45
- :inactive
46
- elsif first_preprocessing.nil?
47
- :empty
48
- elsif @@PREPROCESSING_TASKS.index(task) <
49
- @@PREPROCESSING_TASKS.index(first_preprocessing)
50
- :upstream
51
- elsif !metadata["run_#{task}"].nil?
52
- metadata["run_#{task}"] ? :execute : :force
53
- elsif task == :taxonomy && project.option(:ref_project).nil?
54
- :project
55
- elsif @@_EXCLUDE_NOREF_TASKS_H[task] && !ref?
56
- :noref
57
- elsif @@_ONLY_MULTI_TASKS_H[task] && !multi?
58
- :multi
59
- elsif @@_ONLY_NONMULTI_TASKS_H[task] && !nonmulti?
60
- :nonmulti
61
- else
62
- :execute
63
- end
64
- end
65
-
66
29
  ##
67
30
  # Returns the key symbol of the first registered result (sorted by the
68
31
  # execution order). This typically corresponds to the result used as the
@@ -95,18 +58,18 @@ module MiGA::Dataset::Result
95
58
  # - 2 for a queued result (a task yet to be executed).
96
59
  # It passes +save+ to #add_result
97
60
  def profile_advance(save = false)
98
- first_task = first_preprocessing(save)
99
- return Array.new(@@PREPROCESSING_TASKS.size, 0) if first_task.nil?
61
+ # Determine the start point
62
+ first_task = first_preprocessing(save) or
63
+ return Array.new(self.class.PREPROCESSING_TASKS.size, 0)
100
64
 
101
- adv = []
65
+ # Traverse all tasks and determine the corresponding state
102
66
  state = 0
103
67
  next_task = next_preprocessing(save)
104
- @@PREPROCESSING_TASKS.each do |task|
68
+ self.class.PREPROCESSING_TASKS.map do |task|
105
69
  state = 1 if first_task == task
106
70
  state = 2 if !next_task.nil? && next_task == task
107
- adv << state
71
+ state
108
72
  end
109
- adv
110
73
  end
111
74
 
112
75
  ##
@@ -125,9 +88,9 @@ module MiGA::Dataset::Result
125
88
  def result_status(task)
126
89
  reason = why_ignore(task)
127
90
  case reason
128
- when :upstream; :-
129
- when :execute; :pending
130
- when :complete; :complete
91
+ when :upstream then :-
92
+ when :execute then :pending
93
+ when :complete then :complete
131
94
  else; :"ignore_#{reason}"
132
95
  end
133
96
  end
@@ -136,306 +99,32 @@ module MiGA::Dataset::Result
136
99
  # Clean-up all the stored distances, removing values for datasets no longer in
137
100
  # the project as reference datasets.
138
101
  def cleanup_distances!
139
- r = get_result(:distances)
140
- return if r.nil?
102
+ return if get_result(:distances).nil?
141
103
 
142
104
  require 'miga/sqlite'
143
105
  ref = project.datasets.select(&:ref?).select(&:active?).map(&:name)
144
- %i[haai_db aai_db ani_db].each do |db_type|
145
- db = r.file_path(db_type)
146
- next if db.nil? || !File.size?(db)
147
-
148
- sqlite_db = MiGA::SQLite.new(db)
149
- table = db_type[-6..-4]
150
- val = sqlite_db.run("select seq2 from #{table}")
151
- next if val.empty?
152
-
153
- (val.map(&:first) - ref).each do |extra|
154
- sqlite_db.run("delete from #{table} where seq2=?", extra)
155
- end
106
+ %i[haai aai ani].each do |metric|
107
+ cleanup_distances_by_metric!(ref, metric)
156
108
  end
157
109
  end
158
110
 
159
111
  private
160
112
 
161
113
  ##
162
- # Add result type +:raw_reads+ at +base+ (no +_opts+ supported)
163
- def add_result_raw_reads(base, _opts)
164
- return nil unless result_files_exist?(base, '.1.fastq')
165
-
166
- add_files_to_ds_result(
167
- MiGA::Result.new("#{base}.json"), name,
168
- if result_files_exist?(base, '.2.fastq')
169
- { pair1: '.1.fastq', pair2: '.2.fastq' }
170
- else
171
- { single: '.1.fastq' }
172
- end
173
- )
174
- end
175
-
176
- ##
177
- # Add result type +:trimmed_reads+ at +base+ (no +_opts+ supported)
178
- def add_result_trimmed_reads(base, _opts)
179
- return nil unless result_files_exist?(base, '.1.clipped.fastq')
180
-
181
- add_files_to_ds_result(
182
- MiGA::Result.new("#{base}.json"), name,
183
- if result_files_exist?(base, '.2.clipped.fastq')
184
- { pair1: '.1.clipped.fastq', pair2: '.2.clipped.fastq' }
185
- else
186
- { single: '.1.clipped.fastq' }
187
- end
188
- ).tap do |r|
189
- # Legacy files
190
- r.add_file(:trimming_sumary, "#{name}.1.fastq.trimmed.summary.txt")
191
- r.add_file(:single, "#{name}.1.clipped.single.fastq")
192
- end
193
- end
194
-
195
- ##
196
- # Add result type +:read_quality+ at +base+ (no +_opts+ supported)
197
- def add_result_read_quality(base, _opts)
198
- return nil unless
199
- result_files_exist?(base, %w[.post.1.html]) ||
200
- result_files_exist?(base, %w[.solexaqa .fastqc])
201
-
202
- add_files_to_ds_result(
203
- MiGA::Result.new("#{base}.json"), name,
204
- pre_qc_1: '.pre.1.html', pre_qc_2: '.pre.2.html',
205
- post_qc_1: '.post.1.html', post_qc_2: '.post.2.html',
206
- adapter_detection: '.adapters.txt',
207
- # Legacy files
208
- solexaqa: '.solexaqa', fastqc: '.fastqc'
209
- )
210
- end
211
-
212
- ##
213
- # Add result type +:trimmed_fasta+ at +base+ (no +_opts+ supported)
214
- def add_result_trimmed_fasta(base, _opts)
215
- return nil unless
216
- result_files_exist?(base, '.CoupledReads.fa') ||
217
- result_files_exist?(base, '.SingleReads.fa') ||
218
- result_files_exist?(base, %w[.1.fasta .2.fasta])
219
-
220
- add_files_to_ds_result(
221
- MiGA::Result.new("#{base}.json"), name,
222
- coupled: '.CoupledReads.fa',
223
- single: '.SingleReads.fa',
224
- pair1: '.1.fasta',
225
- pair2: '.2.fasta'
226
- )
227
- end
228
-
229
- ##
230
- # Add result type +:assembly+ at +base+. Hash +opts+ supports
231
- # +is_clean: Boolean+.
232
- def add_result_assembly(base, opts)
233
- return nil unless result_files_exist?(base, '.LargeContigs.fna')
234
-
235
- r = add_files_to_ds_result(
236
- MiGA::Result.new("#{base}.json"), name,
237
- largecontigs: '.LargeContigs.fna',
238
- allcontigs: '.AllContigs.fna',
239
- assembly_data: ''
240
- )
241
- opts[:is_clean] ||= false
242
- r.clean! if opts[:is_clean]
243
- unless r.clean?
244
- MiGA::MiGA.clean_fasta_file(r.file_path(:largecontigs))
245
- r.clean!
246
- end
247
- r
248
- end
249
-
250
- ##
251
- # Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
252
- def add_result_cds(base, opts)
253
- return nil unless result_files_exist?(base, %w[.faa])
254
-
255
- r = add_files_to_ds_result(
256
- MiGA::Result.new("#{base}.json"), name,
257
- proteins: '.faa',
258
- genes: '.fna',
259
- gff2: '.gff2',
260
- gff3: '.gff3',
261
- tab: '.tab'
262
- )
263
- opts[:is_clean] ||= false
264
- r.clean! if opts[:is_clean]
265
- unless r.clean?
266
- MiGA::MiGA.clean_fasta_file(r.file_path(:proteins))
267
- MiGA::MiGA.clean_fasta_file(r.file_path(:genes)) if r.file_path(:genes)
268
- r.clean!
269
- end
270
- r
271
- end
272
-
273
- ##
274
- # Add result type +:essential_genes+ at +base+ (no +_opts+ supported).
275
- def add_result_essential_genes(base, _opts)
276
- return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
114
+ # Cleanup the tables of a specific +metric+ (symbol) removing all values
115
+ # against dataset names not in +ref+ (Array of string)
116
+ def cleanup_distances_by_metric!(ref, metric)
117
+ db_type = :"#{metric}_db"
118
+ db = get_result(:distances).file_path(db_type)
119
+ return if db.nil? || !File.size?(db)
277
120
 
278
- add_files_to_ds_result(
279
- MiGA::Result.new("#{base}.json"), name,
280
- ess_genes: '.ess.faa',
281
- collection: '.ess',
282
- report: '.ess/log',
283
- alignments: '.ess/proteins.aln',
284
- fastaai_index: '.faix.db.gz',
285
- fastaai_index_2: '.faix'
286
- )
287
- end
288
-
289
- ##
290
- # Add result type +:ssu+ at +base+. Hash +opts+ supports +is_clean: Boolean+
291
- def add_result_ssu(base, opts)
292
- return MiGA::Result.new("#{base}.json") if result(:assembly).nil?
293
- return nil unless result_files_exist?(base, '.ssu.fa')
294
-
295
- r = add_files_to_ds_result(
296
- MiGA::Result.new("#{base}.json"), name,
297
- longest_ssu_gene: '.ssu.fa',
298
- ssu_gff: '.ssu.gff', # DEPRECATED
299
- gff: '.gff',
300
- all_ssu_genes: '.ssu.all.fa',
301
- classification: '.rdp.tsv',
302
- trna_list: '.trna.txt'
303
- )
304
- opts[:is_clean] ||= false
305
- r.clean! if opts[:is_clean]
306
- unless r.clean?
307
- MiGA::MiGA.clean_fasta_file(r.file_path(:longest_ssu_gene))
308
- r.clean!
309
- end
310
- r
311
- end
312
-
313
- ##
314
- # Add result type +:mytaxa+ at +base+ (no +_opts+ supported)
315
- def add_result_mytaxa(base, _opts)
316
- if multi?
317
- return nil unless
318
- result_files_exist?(base, '.mytaxa') ||
319
- result_files_exist?(base, '.nomytaxa.txt')
320
-
321
- add_files_to_ds_result(
322
- MiGA::Result.new("#{base}.json"), name,
323
- mytaxa: '.mytaxa',
324
- blast: '.blast',
325
- mytaxain: '.mytaxain',
326
- nomytaxa: '.nomytaxa.txt',
327
- species: '.mytaxa.Species.txt',
328
- genus: '.mytaxa.Genus.txt',
329
- phylum: '.mytaxa.Phylum.txt',
330
- innominate: '.mytaxa.innominate',
331
- kronain: '.mytaxa.krona',
332
- krona: '.html'
333
- )
334
- else
335
- MiGA::Result.new("#{base}.json")
336
- end
337
- end
338
-
339
- ##
340
- # Add result type +:mytaxa_scan+ at +base+ (no +_opts+ supported)
341
- def add_result_mytaxa_scan(base, _opts)
342
- if nonmulti?
343
- return nil unless
344
- result_files_exist?(base, %w[.pdf .mytaxa]) ||
345
- result_files_exist?(base, '.nomytaxa.txt')
346
-
347
- add_files_to_ds_result(
348
- MiGA::Result.new("#{base}.json"), name,
349
- nomytaxa: '.nomytaxa.txt',
350
- mytaxa: '.mytaxa',
351
- report: '.pdf',
352
- regions_archive: '.reg.tar',
353
- # Intermediate / Deprecated:
354
- blast: '.blast',
355
- mytaxain: '.mytaxain',
356
- wintax: '.wintax',
357
- gene_ids: '.wintax.genes',
358
- region_ids: '.wintax.regions',
359
- regions: '.reg'
360
- )
361
- else
362
- MiGA::Result.new("#{base}.json")
363
- end
364
- end
365
-
366
- ##
367
- # Add result type +:distances+ at +base+ (no +_opts+ supported)
368
- def add_result_distances(base, _opts)
369
- if nonmulti?
370
- if ref?
371
- add_result_distances_ref(base)
372
- else
373
- add_result_distances_nonref(base)
374
- end
375
- else
376
- add_result_distances_multi(base)
377
- end
378
- end
379
-
380
- ##
381
- # Add result type +:taxonomy+ at +base+ (no +_opts+ supported)
382
- def add_result_taxonomy(base, _opts)
383
- add_result_distances_nonref(base)
384
- end
385
-
386
- ##
387
- # Add result type +:stats+ at +base+ (no +_opts+ supported)
388
- def add_result_stats(base, _opts)
389
- MiGA::Result.new("#{base}.json")
390
- end
391
-
392
- ##
393
- # Add result type +:distances+ for _multi_ datasets at +base+
394
- def add_result_distances_multi(base)
395
- MiGA::Result.new("#{base}.json")
396
- end
121
+ sqlite_db = MiGA::SQLite.new(db)
122
+ table = db_type[-6..-4]
123
+ val = sqlite_db.run("select seq2 from #{table}")
124
+ return if val.empty?
397
125
 
398
- ##
399
- # Add result type +:distances+ for _nonmulti_ reference datasets at +base+
400
- def add_result_distances_ref(base)
401
- pref = File.dirname(base)
402
- return nil unless File.exist?("#{pref}/01.haai/#{name}.db")
403
-
404
- MiGA::Result.new("#{base}.json").tap do |r|
405
- r.add_files(
406
- haai_db: "01.haai/#{name}.db",
407
- aai_db: "02.aai/#{name}.db",
408
- ani_db: "03.ani/#{name}.db"
409
- )
126
+ (val.map(&:first) - ref).each do |extra|
127
+ sqlite_db.run("delete from #{table} where seq2=?", extra)
410
128
  end
411
129
  end
412
-
413
- ##
414
- # Add result type +:distances+ for _nonmulti_ query datasets at +base+
415
- def add_result_distances_nonref(base)
416
- return nil unless
417
- result_files_exist?(base, %w[.aai-medoids.tsv .aai.db]) ||
418
- result_files_exist?(base, %w[.ani-medoids.tsv .ani.db])
419
-
420
- add_files_to_ds_result(
421
- MiGA::Result.new("#{base}.json"), name,
422
- aai_medoids: '.aai-medoids.tsv',
423
- haai_db: '.haai.db',
424
- aai_db: '.aai.db',
425
- ani_medoids: '.ani-medoids.tsv',
426
- ani_db: '.ani.db',
427
- ref_tree: '.nwk',
428
- ref_tree_pdf: '.nwk.pdf',
429
- intax_test: '.intax.txt'
430
- )
431
- end
432
-
433
- ##
434
- # Add files in +rel_files+ Hash to the result +r+ with dataset name +name+
435
- def add_files_to_ds_result(r, name, rel_files)
436
- files = {}
437
- rel_files.each { |k, v| files[k] = name + v }
438
- r.add_files(files)
439
- r
440
- end
441
130
  end
@@ -43,6 +43,12 @@ class MiGA::RemoteDataset
43
43
  # +format+: String, passed to download
44
44
  def ncbi_asm_rest(opts)
45
45
  url_dir = opts[:obj].ncbi_asm_json_doc['ftppath_genbank']
46
+ if url_dir.nil? || url_dir.empty?
47
+ raise MiGA::RemoteDataMissingError.new(
48
+ "Missing ftppath_genbank in NCBI Assembly JSON"
49
+ )
50
+ end
51
+
46
52
  url = "#{url_dir}/#{File.basename url_dir}_genomic.fna.gz"
47
53
  download(
48
54
  :web, :assembly_gz, url,
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.2, 14, 2].freeze
15
+ VERSION = [1.2, 15, 1].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 1, 16)
23
+ VERSION_DATE = Date.new(2023, 1, 24)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -106,6 +106,12 @@ class RemoteDatasetTest < Test::Unit::TestCase
106
106
  assert { rd.get_metadata[:is_ref_type] }
107
107
  end
108
108
 
109
+ def test_missing_data
110
+ declare_remote_access
111
+ rd = MiGA::RemoteDataset.new('GCA_000484975.1', :assembly, :ncbi)
112
+ assert_raise(MiGA::RemoteDataMissingError) { rd.save_to(project, 'bad') }
113
+ end
114
+
109
115
  # This test is too expensive (too much time to run it!)
110
116
  # def test_net_timeout
111
117
  # declare_remote_access
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.14.2
4
+ version: 1.2.15.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-16 00:00:00.000000000 Z
11
+ date: 2023-01-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -141,6 +141,9 @@ files:
141
141
  - lib/miga/cli/action/derep_wf.rb
142
142
  - lib/miga/cli/action/doctor.rb
143
143
  - lib/miga/cli/action/doctor/base.rb
144
+ - lib/miga/cli/action/doctor/databases.rb
145
+ - lib/miga/cli/action/doctor/distances.rb
146
+ - lib/miga/cli/action/doctor/operations.rb
144
147
  - lib/miga/cli/action/download/base.rb
145
148
  - lib/miga/cli/action/download/gtdb.rb
146
149
  - lib/miga/cli/action/download/ncbi.rb
@@ -196,6 +199,8 @@ files:
196
199
  - lib/miga/dataset/base.rb
197
200
  - lib/miga/dataset/hooks.rb
198
201
  - lib/miga/dataset/result.rb
202
+ - lib/miga/dataset/result/add.rb
203
+ - lib/miga/dataset/result/ignore.rb
199
204
  - lib/miga/dataset/status.rb
200
205
  - lib/miga/json.rb
201
206
  - lib/miga/lair.rb