miga-base 1.2.14.2 → 1.2.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +16 -40
- data/lib/miga/cli/action/doctor/databases.rb +39 -0
- data/lib/miga/cli/action/doctor/distances.rb +144 -0
- data/lib/miga/cli/action/doctor/operations.rb +159 -0
- data/lib/miga/cli/action/doctor.rb +7 -287
- data/lib/miga/cli/action/download/base.rb +64 -2
- data/lib/miga/cli/action/gtdb_get.rb +2 -31
- data/lib/miga/cli/action/ncbi_get.rb +6 -31
- data/lib/miga/cli/opt_helper.rb +1 -1
- data/lib/miga/common/errors.rb +10 -0
- data/lib/miga/dataset/base.rb +34 -5
- data/lib/miga/dataset/result/add.rb +286 -0
- data/lib/miga/dataset/result/ignore.rb +93 -0
- data/lib/miga/dataset/result.rb +31 -342
- data/lib/miga/remote_dataset/download.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/test/remote_dataset_test.rb +6 -0
- metadata +7 -2
@@ -0,0 +1,286 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiGA::Dataset::Result::Add
|
4
|
+
##
# Add result type +:raw_reads+ at +base+ (no +_opts+ supported).
#
# Returns +nil+ unless +result_files_exist?(base, '.1.fastq')+ holds.
# Otherwise the reads are registered as +:pair1+ and +:pair2+ when
# +result_files_exist?(base, '.2.fastq')+ also holds, or as +:single+ when
# it does not, and the value of +add_files_to_ds_result+ is returned.
def add_result_raw_reads(base, _opts)
  return nil unless result_files_exist?(base, '.1.fastq')

  result = MiGA::Result.new("#{base}.json")
  reads =
    if result_files_exist?(base, '.2.fastq')
      { pair1: '.1.fastq', pair2: '.2.fastq' }
    else
      { single: '.1.fastq' }
    end
  add_files_to_ds_result(result, name, reads)
end
|
18
|
+
|
19
|
+
##
# Add result type +:trimmed_reads+ at +base+ (no +_opts+ supported).
#
# Returns +nil+ unless +result_files_exist?(base, '.1.clipped.fastq')+
# holds. Otherwise the clipped reads are registered as +:pair1+ and
# +:pair2+ when the +.2.clipped.fastq+ check also holds, or as +:single+
# when it does not. Two legacy entries are then added with +add_file+.
def add_result_trimmed_reads(base, _opts)
  return nil unless result_files_exist?(base, '.1.clipped.fastq')

  result = MiGA::Result.new("#{base}.json")
  reads =
    if result_files_exist?(base, '.2.clipped.fastq')
      { pair1: '.1.clipped.fastq', pair2: '.2.clipped.fastq' }
    else
      { single: '.1.clipped.fastq' }
    end

  add_files_to_ds_result(result, name, reads).tap do |r|
    # Legacy files.
    # NOTE(review): the +:trimming_sumary+ key looks misspelled, but it is a
    # runtime key and is deliberately left unchanged here.
    r.add_file(:trimming_sumary, "#{name}.1.fastq.trimmed.summary.txt")
    r.add_file(:single, "#{name}.1.clipped.single.fastq")
  end
end
|
37
|
+
|
38
|
+
##
# Add result type +:read_quality+ at +base+ (no +_opts+ supported).
#
# Returns +nil+ unless +result_files_exist?+ holds for either
# +.post.1.html+ alone or for both legacy extensions (+.solexaqa+ and
# +.fastqc+). Otherwise every known report extension is passed to
# +add_files_to_ds_result+ and its value is returned.
def add_result_read_quality(base, _opts)
  markers = [%w[.post.1.html], %w[.solexaqa .fastqc]]
  return nil if markers.none? { |exts| result_files_exist?(base, exts) }

  add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    pre_qc_1: '.pre.1.html', pre_qc_2: '.pre.2.html',
    post_qc_1: '.post.1.html', post_qc_2: '.post.2.html',
    adapter_detection: '.adapters.txt',
    # Legacy files
    solexaqa: '.solexaqa', fastqc: '.fastqc'
  )
end
|
54
|
+
|
55
|
+
##
# Add result type +:trimmed_fasta+ at +base+ (no +_opts+ supported).
#
# Returns +nil+ unless +result_files_exist?+ holds for at least one of:
# +.CoupledReads.fa+, +.SingleReads.fa+, or the pair +.1.fasta+ and
# +.2.fasta+. Otherwise all four extensions are passed to
# +add_files_to_ds_result+ and its value is returned.
def add_result_trimmed_fasta(base, _opts)
  markers = ['.CoupledReads.fa', '.SingleReads.fa', %w[.1.fasta .2.fasta]]
  return nil if markers.none? { |exts| result_files_exist?(base, exts) }

  add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    coupled: '.CoupledReads.fa',
    single: '.SingleReads.fa',
    pair1: '.1.fasta',
    pair2: '.2.fasta'
  )
end
|
71
|
+
|
72
|
+
##
# Add result type +:assembly+ at +base+. Hash +opts+ supports
# +is_clean: Boolean+.
#
# Returns +nil+ unless +result_files_exist?(base, '.LargeContigs.fna')+
# holds. The result is marked clean when +opts[:is_clean]+ is truthy;
# if it is not reported clean afterwards, the +:largecontigs+ file is
# passed to +MiGA::MiGA.clean_fasta_file+ and the result is then marked
# clean. Note that +opts[:is_clean]+ is set to +false+ in the caller's
# Hash when it was missing or falsy.
def add_result_assembly(base, opts)
  return nil unless result_files_exist?(base, '.LargeContigs.fna')

  result = add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    largecontigs: '.LargeContigs.fna',
    allcontigs: '.AllContigs.fna',
    assembly_data: ''
  )
  opts[:is_clean] ||= false
  result.clean! if opts[:is_clean]
  return result if result.clean?

  MiGA::MiGA.clean_fasta_file(result.file_path(:largecontigs))
  result.clean!
  result
end
|
92
|
+
|
93
|
+
##
# Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+.
#
# Returns +nil+ unless +result_files_exist?(base, %w[.faa])+ holds. The
# result is marked clean when +opts[:is_clean]+ is truthy; if it is not
# reported clean afterwards, the +:proteins+ file (and the +:genes+ file
# when +file_path(:genes)+ is truthy) is passed to
# +MiGA::MiGA.clean_fasta_file+ and the result is then marked clean. Note
# that +opts[:is_clean]+ is set to +false+ in the caller's Hash when it was
# missing or falsy.
def add_result_cds(base, opts)
  return nil unless result_files_exist?(base, %w[.faa])

  result = add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    proteins: '.faa',
    genes: '.fna',
    gff2: '.gff2',
    gff3: '.gff3',
    tab: '.tab'
  )
  opts[:is_clean] ||= false
  result.clean! if opts[:is_clean]
  return result if result.clean?

  MiGA::MiGA.clean_fasta_file(result.file_path(:proteins))
  genes = result.file_path(:genes)
  MiGA::MiGA.clean_fasta_file(genes) if genes
  result.clean!
  result
end
|
115
|
+
|
116
|
+
##
# Add result type +:essential_genes+ at +base+ (no +_opts+ supported).
#
# Returns +nil+ unless +result_files_exist?+ holds for all of +.ess.faa+,
# +.ess+ and +.ess/log+. Otherwise those entries, plus the alignments and
# the two FastAAI index extensions, are passed to
# +add_files_to_ds_result+ and its value is returned.
def add_result_essential_genes(base, _opts)
  return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])

  files = {
    ess_genes: '.ess.faa',
    collection: '.ess',
    report: '.ess/log',
    alignments: '.ess/proteins.aln',
    fastaai_index: '.faix.db.gz',
    fastaai_index_2: '.faix'
  }
  add_files_to_ds_result(MiGA::Result.new("#{base}.json"), name, files)
end
|
131
|
+
|
132
|
+
##
# Add result type +:ssu+ at +base+. Hash +opts+ supports +is_clean: Boolean+.
#
# When +result(:assembly)+ is +nil+, a MiGA::Result for "#{base}.json" is
# returned without registering any file. Otherwise returns +nil+ unless
# +result_files_exist?(base, '.ssu.fa')+ holds. The result is marked clean
# when +opts[:is_clean]+ is truthy; if it is not reported clean afterwards,
# the +:longest_ssu_gene+ file is passed to +MiGA::MiGA.clean_fasta_file+
# and the result is then marked clean. Note that +opts[:is_clean]+ is set to
# +false+ in the caller's Hash when it was missing or falsy.
def add_result_ssu(base, opts)
  return MiGA::Result.new("#{base}.json") if result(:assembly).nil?
  return nil unless result_files_exist?(base, '.ssu.fa')

  ssu = add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    longest_ssu_gene: '.ssu.fa',
    ssu_gff: '.ssu.gff', # DEPRECATED
    gff: '.gff',
    all_ssu_genes: '.ssu.all.fa',
    classification: '.rdp.tsv',
    trna_list: '.trna.txt'
  )
  opts[:is_clean] ||= false
  ssu.clean! if opts[:is_clean]
  return ssu if ssu.clean?

  MiGA::MiGA.clean_fasta_file(ssu.file_path(:longest_ssu_gene))
  ssu.clean!
  ssu
end
|
155
|
+
|
156
|
+
##
# Add result type +:mytaxa+ at +base+ (no +_opts+ supported).
#
# When +multi?+ is falsy, a MiGA::Result for "#{base}.json" is returned
# without registering any file. Otherwise returns +nil+ unless
# +result_files_exist?+ holds for +.mytaxa+ or for +.nomytaxa.txt+, and
# else passes all known extensions to +add_files_to_ds_result+.
def add_result_mytaxa(base, _opts)
  return MiGA::Result.new("#{base}.json") unless multi?

  markers = %w[.mytaxa .nomytaxa.txt]
  return nil if markers.none? { |ext| result_files_exist?(base, ext) }

  add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    mytaxa: '.mytaxa',
    blast: '.blast',
    mytaxain: '.mytaxain',
    nomytaxa: '.nomytaxa.txt',
    species: '.mytaxa.Species.txt',
    genus: '.mytaxa.Genus.txt',
    phylum: '.mytaxa.Phylum.txt',
    innominate: '.mytaxa.innominate',
    kronain: '.mytaxa.krona',
    krona: '.html'
  )
end
|
181
|
+
|
182
|
+
##
# Add result type +:mytaxa_scan+ at +base+ (no +_opts+ supported).
#
# When +nonmulti?+ is falsy, a MiGA::Result for "#{base}.json" is returned
# without registering any file. Otherwise returns +nil+ unless
# +result_files_exist?+ holds for both +.pdf+ and +.mytaxa+, or for
# +.nomytaxa.txt+, and else passes all known extensions to
# +add_files_to_ds_result+.
def add_result_mytaxa_scan(base, _opts)
  return MiGA::Result.new("#{base}.json") unless nonmulti?

  markers = [%w[.pdf .mytaxa], '.nomytaxa.txt']
  return nil if markers.none? { |exts| result_files_exist?(base, exts) }

  add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    nomytaxa: '.nomytaxa.txt',
    mytaxa: '.mytaxa',
    report: '.pdf',
    regions_archive: '.reg.tar',
    # Intermediate / Deprecated:
    blast: '.blast',
    mytaxain: '.mytaxain',
    wintax: '.wintax',
    gene_ids: '.wintax.genes',
    region_ids: '.wintax.regions',
    regions: '.reg'
  )
end
|
208
|
+
|
209
|
+
##
# Add result type +:distances+ at +base+ (no +_opts+ supported).
#
# Dispatches to +add_result_distances_multi+ when +nonmulti?+ is falsy, to
# +add_result_distances_ref+ when the dataset is nonmulti and +ref?+ is
# truthy, and to +add_result_distances_nonref+ otherwise.
def add_result_distances(base, _opts)
  return add_result_distances_multi(base) unless nonmulti?

  ref? ? add_result_distances_ref(base) : add_result_distances_nonref(base)
end
|
222
|
+
|
223
|
+
##
# Add result type +:taxonomy+ at +base+ (no +_opts+ supported).
#
# Delegates to +add_result_distances_nonref+ and returns its value.
def add_result_taxonomy(base, _opts)
  add_result_distances_nonref(base)
end
|
228
|
+
|
229
|
+
##
# Add result type +:stats+ at +base+ (no +_opts+ supported).
#
# Returns a MiGA::Result for "#{base}.json"; no files are registered here.
def add_result_stats(base, _opts)
  MiGA::Result.new("#{base}.json")
end
|
234
|
+
|
235
|
+
private
|
236
|
+
|
237
|
+
##
# Add result type +:distances+ for _multi_ datasets at +base+.
#
# Returns a MiGA::Result for "#{base}.json"; no files are registered here.
def add_result_distances_multi(base)
  MiGA::Result.new("#{base}.json")
end
|
242
|
+
|
243
|
+
##
# Add result type +:distances+ for _nonmulti_ reference datasets at +base+.
#
# Returns +nil+ unless the file "01.haai/#{name}.db" exists in the directory
# that contains +base+. Otherwise returns a MiGA::Result for "#{base}.json"
# after registering the hAAI, AAI and ANI database paths (given relative to
# that directory) with +add_files+.
def add_result_distances_ref(base)
  dir = File.dirname(base)
  return nil unless File.exist?("#{dir}/01.haai/#{name}.db")

  result = MiGA::Result.new("#{base}.json")
  result.add_files(
    haai_db: "01.haai/#{name}.db",
    aai_db: "02.aai/#{name}.db",
    ani_db: "03.ani/#{name}.db"
  )
  result
end
|
257
|
+
|
258
|
+
##
# Add result type +:distances+ for _nonmulti_ query datasets at +base+.
#
# Returns +nil+ unless +result_files_exist?+ holds for the AAI pair
# (+.aai-medoids.tsv+ and +.aai.db+) or for the ANI pair
# (+.ani-medoids.tsv+ and +.ani.db+). Otherwise all known extensions are
# passed to +add_files_to_ds_result+ and its value is returned.
def add_result_distances_nonref(base)
  markers = [%w[.aai-medoids.tsv .aai.db], %w[.ani-medoids.tsv .ani.db]]
  return nil if markers.none? { |exts| result_files_exist?(base, exts) }

  add_files_to_ds_result(
    MiGA::Result.new("#{base}.json"), name,
    aai_medoids: '.aai-medoids.tsv',
    haai_db: '.haai.db',
    aai_db: '.aai.db',
    ani_medoids: '.ani-medoids.tsv',
    ani_db: '.ani.db',
    ref_tree: '.nwk',
    ref_tree_pdf: '.nwk.pdf',
    intax_test: '.intax.txt'
  )
end
|
277
|
+
|
278
|
+
##
# Add files in +rel_files+ Hash to the result +r+ with dataset name +name+.
#
# Each value of +rel_files+ is a suffix that gets +name+ prepended; the
# resulting Hash (same keys) is handed to +r.add_files+ in a single call.
# Returns +r+.
def add_files_to_ds_result(r, name, rel_files)
  files = rel_files.each_with_object({}) do |(key, suffix), acc|
    acc[key] = name + suffix
  end
  r.add_files(files)
  r
end
|
286
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiGA::Dataset::Result::Ignore
|
4
|
+
##
# Should I ignore +task+ for this dataset?
#
# True whenever +why_ignore(task)+ returns anything other than +:execute+.
def ignore_task?(task)
  why_ignore(task) != :execute
end
|
9
|
+
|
10
|
+
##
# Returns an array of symbols indicating all the possible reasons why a
# given task might be ignored, in the order in which they are listed here:
# - empty: the dataset has no data
# - inactive: the dataset is inactive
# - upstream: the task is upstream from dataset's input
# - force: forced to ignore by metadata
# - project: incompatible project
# - noref: incompatible dataset, only for reference
# - multi: incompatible dataset, only for multi
# - nonmulti: incompatible dataset, only for nonmulti
# - complete: the task is already complete
def ignore_reasons
  %i[empty inactive upstream force project noref multi nonmulti complete]
end
|
25
|
+
|
26
|
+
##
# Return a code explaining why a task is ignored (see +ignore_reasons+) or
# the symbol +:execute+ (do not ignore, execute the task).
#
# Reasons are tested in the order given by +ignore_reasons+, each through
# its +ignore_<reason>?+ predicate, and the first one that applies wins.
def why_ignore(task)
  reason = ignore_reasons.find { |i| send(:"ignore_#{i}?", task) }
  reason || :execute
end
|
38
|
+
|
39
|
+
##
# Ignore +task+ because it's already done.
#
# True when +get_result(task)+ returns anything other than +nil+.
def ignore_complete?(task)
  !get_result(task).nil?
end
|
44
|
+
|
45
|
+
##
# Ignore any task because the dataset is inactive (+_task+ is ignored).
#
# Simply the negation of +active?+.
def ignore_inactive?(_task)
  !active?
end
|
50
|
+
|
51
|
+
##
# Ignore any task because the dataset is empty (+_task+ is ignored).
#
# The dataset is treated as empty when +first_preprocessing+ returns +nil+.
def ignore_empty?(_task)
  first_preprocessing.nil?
end
|
56
|
+
|
57
|
+
##
# Ignore +task+ because it's upstream from the entry point.
#
# True when +task+ appears in +self.class.PREPROCESSING_TASKS+ before the
# task returned by +first_preprocessing+. Returns +false+ (rather than
# raising on a +nil+ index) when +task+ is not in that list, or when
# +first_preprocessing+ is +nil+ or not in the list, since no relative
# order can be established in those cases.
def ignore_upstream?(task)
  tasks = self.class.PREPROCESSING_TASKS
  task_pos = tasks.index(task)
  entry_pos = tasks.index(first_preprocessing)
  return false if task_pos.nil? || entry_pos.nil?

  task_pos < entry_pos
end
|
63
|
+
|
64
|
+
##
# Ignore +task+ because the metadata says so.
#
# True only when the metadata entry "run_#{task}" is present (non-nil) and
# falsy; a missing entry or a truthy one does not force the task to be
# ignored.
def ignore_force?(task)
  flag = metadata["run_#{task}"]
  !flag.nil? && !flag
end
|
69
|
+
|
70
|
+
##
# Ignore +task+ because the project is not compatible.
#
# Only applies to +:taxonomy+, which is ignored when the project's
# +:ref_project+ option is +nil+. The project is not consulted for any
# other task.
def ignore_project?(task)
  return false unless task == :taxonomy

  project.option(:ref_project).nil?
end
|
75
|
+
|
76
|
+
##
# Ignore +task+ because it's not a reference dataset.
#
# True when +task+ is listed in +self.class.EXCLUDE_NOREF_TASKS+ and +ref?+
# is falsy. +ref?+ is not consulted for tasks outside that list.
def ignore_noref?(task)
  return false unless self.class.EXCLUDE_NOREF_TASKS.include?(task)

  !ref?
end
|
81
|
+
|
82
|
+
##
# Ignore +task+ because it's not a multi dataset.
#
# True when +task+ is listed in +self.class.ONLY_MULTI_TASKS+ and +multi?+
# is falsy. +multi?+ is not consulted for tasks outside that list.
def ignore_multi?(task)
  return false unless self.class.ONLY_MULTI_TASKS.include?(task)

  !multi?
end
|
87
|
+
|
88
|
+
##
# Ignore +task+ because it's not a nonmulti dataset.
#
# True when +task+ is listed in +self.class.ONLY_NONMULTI_TASKS+ and
# +nonmulti?+ is falsy. +nonmulti?+ is not consulted for tasks outside that
# list.
def ignore_nonmulti?(task)
  return false unless self.class.ONLY_NONMULTI_TASKS.include?(task)

  !nonmulti?
end
|
93
|
+
end
|