miga-base 1.2.14.2 → 1.2.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,286 @@
1
+ # frozen_string_literal: true
2
+
3
+ module MiGA::Dataset::Result::Add
4
+ ##
5
+ # Add result type +:raw_reads+ at +base+ (no +_opts+ supported)
6
def add_result_raw_reads(base, _opts)
  # Nothing to register unless the first (or only) FastQ file is present.
  return unless result_files_exist?(base, '.1.fastq')

  result = MiGA::Result.new("#{base}.json")
  ds_name = name
  # A second FastQ file switches the registration from :single to a pair.
  reads =
    if result_files_exist?(base, '.2.fastq')
      { pair1: '.1.fastq', pair2: '.2.fastq' }
    else
      { single: '.1.fastq' }
    end
  add_files_to_ds_result(result, ds_name, reads)
end
18
+
19
+ ##
20
+ # Add result type +:trimmed_reads+ at +base+ (no +_opts+ supported)
21
def add_result_trimmed_reads(base, _opts)
  # Nothing to register unless the first (or only) clipped file is present.
  return unless result_files_exist?(base, '.1.clipped.fastq')

  result = MiGA::Result.new("#{base}.json")
  ds_name = name
  reads =
    if result_files_exist?(base, '.2.clipped.fastq')
      { pair1: '.1.clipped.fastq', pair2: '.2.clipped.fastq' }
    else
      { single: '.1.clipped.fastq' }
    end
  registered = add_files_to_ds_result(result, ds_name, reads)

  # Legacy files, registered one by one after the current ones
  registered.add_file(:trimming_sumary, "#{name}.1.fastq.trimmed.summary.txt")
  registered.add_file(:single, "#{name}.1.clipped.single.fastq")
  registered
end
37
+
38
+ ##
39
+ # Add result type +:read_quality+ at +base+ (no +_opts+ supported)
40
def add_result_read_quality(base, _opts)
  # Either the post-trimming report or the complete legacy pair of outputs
  # must be present; the legacy check only runs when the first one fails.
  has_report = result_files_exist?(base, %w[.post.1.html])
  return unless has_report || result_files_exist?(base, %w[.solexaqa .fastqc])

  result = MiGA::Result.new("#{base}.json")
  qc_files = {
    pre_qc_1: '.pre.1.html',
    pre_qc_2: '.pre.2.html',
    post_qc_1: '.post.1.html',
    post_qc_2: '.post.2.html',
    adapter_detection: '.adapters.txt',
    # Legacy files
    solexaqa: '.solexaqa',
    fastqc: '.fastqc'
  }
  add_files_to_ds_result(result, name, qc_files)
end
54
+
55
+ ##
56
+ # Add result type +:trimmed_fasta+ at +base+ (no +_opts+ supported)
57
def add_result_trimmed_fasta(base, _opts)
  # Any one of these layouts is enough; they are probed in this order and
  # probing stops at the first one found.
  layouts = ['.CoupledReads.fa', '.SingleReads.fa', %w[.1.fasta .2.fasta]]
  return unless layouts.any? { |ext| result_files_exist?(base, ext) }

  result = MiGA::Result.new("#{base}.json")
  fasta_files = {
    coupled: '.CoupledReads.fa',
    single: '.SingleReads.fa',
    pair1: '.1.fasta',
    pair2: '.2.fasta'
  }
  add_files_to_ds_result(result, name, fasta_files)
end
71
+
72
+ ##
73
+ # Add result type +:assembly+ at +base+. Hash +opts+ supports
74
+ # +is_clean: Boolean+.
75
def add_result_assembly(base, opts)
  # The large-contigs FastA is the only mandatory file.
  return unless result_files_exist?(base, '.LargeContigs.fna')

  result = MiGA::Result.new("#{base}.json")
  contig_files = {
    largecontigs: '.LargeContigs.fna',
    allcontigs: '.AllContigs.fna',
    assembly_data: ''
  }
  result = add_files_to_ds_result(result, name, contig_files)

  # The default is written back into +opts+, so the caller's Hash gains the key.
  opts[:is_clean] ||= false
  result.clean! if opts[:is_clean]
  return result if result.clean?

  # Not yet flagged as clean: clean the FastA, then flag the result.
  MiGA::MiGA.clean_fasta_file(result.file_path(:largecontigs))
  result.clean!
  result
end
92
+
93
+ ##
94
+ # Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
95
def add_result_cds(base, opts)
  # The protein FastA is the only mandatory file.
  return unless result_files_exist?(base, %w[.faa])

  result = MiGA::Result.new("#{base}.json")
  gene_files = {
    proteins: '.faa',
    genes: '.fna',
    gff2: '.gff2',
    gff3: '.gff3',
    tab: '.tab'
  }
  result = add_files_to_ds_result(result, name, gene_files)

  # The default is written back into +opts+, so the caller's Hash gains the key.
  opts[:is_clean] ||= false
  result.clean! if opts[:is_clean]
  return result if result.clean?

  # Not yet flagged as clean: clean the sequences, then flag the result.
  MiGA::MiGA.clean_fasta_file(result.file_path(:proteins))
  if result.file_path(:genes)
    # The genes file is only cleaned when the result reports a path for it.
    MiGA::MiGA.clean_fasta_file(result.file_path(:genes))
  end
  result.clean!
  result
end
115
+
116
+ ##
117
+ # Add result type +:essential_genes+ at +base+ (no +_opts+ supported).
118
def add_result_essential_genes(base, _opts)
  # All three of these must be present for the result to be registered.
  required = %w[.ess.faa .ess .ess/log]
  return unless result_files_exist?(base, required)

  result = MiGA::Result.new("#{base}.json")
  ess_files = {
    ess_genes: '.ess.faa',
    collection: '.ess',
    report: '.ess/log',
    alignments: '.ess/proteins.aln',
    fastaai_index: '.faix.db.gz',
    fastaai_index_2: '.faix'
  }
  add_files_to_ds_result(result, name, ess_files)
end
131
+
132
+ ##
133
+ # Add result type +:ssu+ at +base+. Hash +opts+ supports +is_clean: Boolean+
134
def add_result_ssu(base, opts)
  # Without an assembly result, return a Result with no files registered.
  json_path = "#{base}.json"
  return MiGA::Result.new(json_path) if result(:assembly).nil?
  return unless result_files_exist?(base, '.ssu.fa')

  ssu = MiGA::Result.new("#{base}.json")
  ssu_files = {
    longest_ssu_gene: '.ssu.fa',
    ssu_gff: '.ssu.gff', # DEPRECATED
    gff: '.gff',
    all_ssu_genes: '.ssu.all.fa',
    classification: '.rdp.tsv',
    trna_list: '.trna.txt'
  }
  ssu = add_files_to_ds_result(ssu, name, ssu_files)

  # The default is written back into +opts+, so the caller's Hash gains the key.
  opts[:is_clean] ||= false
  ssu.clean! if opts[:is_clean]
  return ssu if ssu.clean?

  # Not yet flagged as clean: clean the FastA, then flag the result.
  MiGA::MiGA.clean_fasta_file(ssu.file_path(:longest_ssu_gene))
  ssu.clean!
  ssu
end
155
+
156
+ ##
157
+ # Add result type +:mytaxa+ at +base+ (no +_opts+ supported)
158
def add_result_mytaxa(base, _opts)
  # Datasets that are not multi get a Result with no files registered.
  return MiGA::Result.new("#{base}.json") unless multi?

  # Either the MyTaxa output or the "no MyTaxa" marker must be present.
  has_output = result_files_exist?(base, '.mytaxa')
  return unless has_output || result_files_exist?(base, '.nomytaxa.txt')

  result = MiGA::Result.new("#{base}.json")
  mytaxa_files = {
    mytaxa: '.mytaxa',
    blast: '.blast',
    mytaxain: '.mytaxain',
    nomytaxa: '.nomytaxa.txt',
    species: '.mytaxa.Species.txt',
    genus: '.mytaxa.Genus.txt',
    phylum: '.mytaxa.Phylum.txt',
    innominate: '.mytaxa.innominate',
    kronain: '.mytaxa.krona',
    krona: '.html'
  }
  add_files_to_ds_result(result, name, mytaxa_files)
end
181
+
182
+ ##
183
+ # Add result type +:mytaxa_scan+ at +base+ (no +_opts+ supported)
184
def add_result_mytaxa_scan(base, _opts)
  # Datasets that are not nonmulti get a Result with no files registered.
  return MiGA::Result.new("#{base}.json") unless nonmulti?

  # Either the report plus MyTaxa output, or the "no MyTaxa" marker.
  has_report = result_files_exist?(base, %w[.pdf .mytaxa])
  return unless has_report || result_files_exist?(base, '.nomytaxa.txt')

  result = MiGA::Result.new("#{base}.json")
  scan_files = {
    nomytaxa: '.nomytaxa.txt',
    mytaxa: '.mytaxa',
    report: '.pdf',
    regions_archive: '.reg.tar',
    # Intermediate / Deprecated:
    blast: '.blast',
    mytaxain: '.mytaxain',
    wintax: '.wintax',
    gene_ids: '.wintax.genes',
    region_ids: '.wintax.regions',
    regions: '.reg'
  }
  add_files_to_ds_result(result, name, scan_files)
end
208
+
209
+ ##
210
+ # Add result type +:distances+ at +base+ (no +_opts+ supported)
211
def add_result_distances(base, _opts)
  # Dispatch on dataset kind; +ref?+ is only consulted for nonmulti datasets.
  return add_result_distances_multi(base) unless nonmulti?
  return add_result_distances_ref(base) if ref?

  add_result_distances_nonref(base)
end
222
+
223
+ ##
224
+ # Add result type +:taxonomy+ at +base+ (no +_opts+ supported)
225
def add_result_taxonomy(base, _opts)
  # Taxonomy results are registered exactly as non-reference distances are.
  taxonomy = add_result_distances_nonref(base)
  taxonomy
end
228
+
229
+ ##
230
+ # Add result type +:stats+ at +base+ (no +_opts+ supported)
231
def add_result_stats(base, _opts)
  # No files are registered here; the result is just the JSON at +base+.
  json_path = "#{base}.json"
  MiGA::Result.new(json_path)
end
234
+
235
+ private
236
+
237
+ ##
238
+ # Add result type +:distances+ for _multi_ datasets at +base+
239
def add_result_distances_multi(base)
  # No files are registered here; the result is just the JSON at +base+.
  json_path = "#{base}.json"
  MiGA::Result.new(json_path)
end
242
+
243
+ ##
244
+ # Add result type +:distances+ for _nonmulti_ reference datasets at +base+
245
def add_result_distances_ref(base)
  # Only the 01.haai database, next to +base+, is required to exist.
  required_db = "#{File.dirname(base)}/01.haai/#{name}.db"
  return unless File.exist?(required_db)

  result = MiGA::Result.new("#{base}.json")
  # These paths are passed as written, without the directory of +base+.
  db_files = {
    haai_db: "01.haai/#{name}.db",
    aai_db: "02.aai/#{name}.db",
    ani_db: "03.ani/#{name}.db"
  }
  result.add_files(db_files)
  result
end
257
+
258
+ ##
259
+ # Add result type +:distances+ for _nonmulti_ query datasets at +base+
260
def add_result_distances_nonref(base)
  # A complete AAI pair or a complete ANI pair is required; the ANI pair is
  # only probed when the AAI pair is incomplete.
  required_pairs = [
    %w[.aai-medoids.tsv .aai.db],
    %w[.ani-medoids.tsv .ani.db]
  ]
  return unless required_pairs.any? { |pair| result_files_exist?(base, pair) }

  result = MiGA::Result.new("#{base}.json")
  distance_files = {
    aai_medoids: '.aai-medoids.tsv',
    haai_db: '.haai.db',
    aai_db: '.aai.db',
    ani_medoids: '.ani-medoids.tsv',
    ani_db: '.ani.db',
    ref_tree: '.nwk',
    ref_tree_pdf: '.nwk.pdf',
    intax_test: '.intax.txt'
  }
  add_files_to_ds_result(result, name, distance_files)
end
277
+
278
+ ##
279
+ # Add files in +rel_files+ Hash to the result +r+ with dataset name +name+
280
def add_files_to_ds_result(r, name, rel_files)
  # Prefix every suffix in +rel_files+ with the dataset name, keeping keys.
  prefixed = rel_files.each_with_object({}) do |(key, suffix), acc|
    acc[key] = name + suffix
  end
  r.add_files(prefixed)
  # Hand back the same result object so calls can be chained.
  r
end
286
+ end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ module MiGA::Dataset::Result::Ignore
4
+ ##
5
+ # Should I ignore +task+ for this dataset?
6
def ignore_task?(task)
  # Any verdict other than :execute means the task is to be ignored.
  verdict = why_ignore(task)
  verdict != :execute
end
9
+
10
+ ##
11
+ # Returns an array of symbols indicating all the possible reasons why a
12
+ # given task might be ignored:
13
+ # - empty: the dataset has no data
14
+ # - inactive: the dataset is inactive
15
+ # - upstream: the task is upstream from dataset's input
16
+ # - force: forced to ignore by metadata
17
+ # - project: incompatible project
18
+ # - noref: incompatible dataset, only for reference
19
+ # - multi: incompatible dataset, only for multi
20
+ # - nonmulti: incompatible dataset, only for nonmulti
21
+ # - complete: the task is already complete
22
def ignore_reasons
  # The order matters to callers that stop at the first matching reason.
  [
    :empty, :inactive, :upstream, :force, :project,
    :noref, :multi, :nonmulti, :complete
  ]
end
25
+
26
+ ##
27
+ # Return a code explaining why a task is ignored (see +ignore_reasons+) or
28
+ # the symbol +:execute+ (do not ignore, execute the task)
29
def why_ignore(task)
  # Probe each reason in order through its +ignore_<reason>?+ predicate and
  # stop at the first one that applies.
  reason = ignore_reasons.find { |i| send(:"ignore_#{i}?", task) }

  # No reason applies: the task should be executed.
  reason || :execute
end
38
+
39
+ ##
40
+ # Ignore +task+ because it's already done
41
def ignore_complete?(task)
  # The task is treated as done when +get_result+ returns anything but nil.
  existing = get_result(task)
  !existing.nil?
end
44
+
45
+ ##
46
+ # Ignore any task because the dataset is inactive (+_task+ is ignored)
47
def ignore_inactive?(_task)
  # The answer depends only on +active?+; the task itself is not consulted.
  return false if active?

  true
end
50
+
51
+ ##
52
+ # Ignore any task because the dataset is empty (+_task+ is ignored)
53
def ignore_empty?(_task)
  # The dataset is treated as empty when +first_preprocessing+ is nil; the
  # task itself is not consulted.
  entry_point = first_preprocessing
  entry_point.nil?
end
56
+
57
+ ##
58
+ # Ignore +task+ because it's upstream from the entry point
59
def ignore_upstream?(task)
  # Locate both the task and the dataset's entry point in the ordered list of
  # preprocessing tasks; +index+ returns nil for anything not in that list.
  tasks = self.class.PREPROCESSING_TASKS
  task_pos = tasks.index(task)
  entry_pos = tasks.index(first_preprocessing)

  # A task outside the list, or a dataset with no entry point, cannot be
  # upstream. Without this guard the comparison below raises on nil.
  return false if task_pos.nil? || entry_pos.nil?

  # Upstream means strictly earlier in the list than the entry point.
  task_pos < entry_pos
end
63
+
64
+ ##
65
+ # Ignore +task+ because the metadata says so
66
def ignore_force?(task)
  # Only a value that is present (non-nil) and falsy under "run_<task>"
  # forces the task to be ignored; a missing entry does not.
  run_flag = metadata["run_#{task}"]
  !run_flag.nil? && !run_flag
end
69
+
70
+ ##
71
+ # Ignore +task+ because the project is not compatible
72
def ignore_project?(task)
  # Only :taxonomy is affected; the project is not consulted for other tasks.
  return false unless task == :taxonomy

  # Ignored when the project's :ref_project option is nil.
  project.option(:ref_project).nil?
end
75
+
76
+ ##
77
+ # Ignore +task+ because it's not a reference dataset
78
def ignore_noref?(task)
  # Tasks outside EXCLUDE_NOREF_TASKS are never ignored for this reason, and
  # +ref?+ is not consulted for them.
  ref_only = self.class.EXCLUDE_NOREF_TASKS
  return false unless ref_only.include?(task)

  !ref?
end
81
+
82
+ ##
83
+ # Ignore +task+ because it's not a multi dataset
84
def ignore_multi?(task)
  # Tasks outside ONLY_MULTI_TASKS are never ignored for this reason, and
  # +multi?+ is not consulted for them.
  multi_only = self.class.ONLY_MULTI_TASKS
  return false unless multi_only.include?(task)

  !multi?
end
87
+
88
+ ##
89
+ # Ignore +task+ because it's not a nonmulti dataset
90
def ignore_nonmulti?(task)
  # Tasks outside ONLY_NONMULTI_TASKS are never ignored for this reason, and
  # +nonmulti?+ is not consulted for them.
  nonmulti_only = self.class.ONLY_NONMULTI_TASKS
  return false unless nonmulti_only.include?(task)

  !nonmulti?
end
93
+ end