miga-base 0.3.1.3 → 0.3.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,73 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ class MiGA::Dataset < MiGA::MiGA
5
+
6
+ # Class-level
7
+ class << self
8
+ def RESULT_DIRS ; @@RESULT_DIRS ; end
9
+ def KNOWN_TYPES ; @@KNOWN_TYPES ; end
10
+ def PREPROCESSING_TASKS ; @@PREPROCESSING_TASKS ; end
11
+ end
12
+
13
+ end
14
+
15
+ module MiGA::Dataset::Base
16
+
17
+ ##
18
+ # Directories containing the results from dataset-specific tasks.
19
+ @@RESULT_DIRS = {
20
+ # Preprocessing
21
+ raw_reads: "01.raw_reads", trimmed_reads: "02.trimmed_reads",
22
+ read_quality: "03.read_quality", trimmed_fasta: "04.trimmed_fasta",
23
+ assembly: "05.assembly", cds: "06.cds",
24
+ # Annotation
25
+ essential_genes: "07.annotation/01.function/01.essential",
26
+ ssu: "07.annotation/01.function/02.ssu",
27
+ mytaxa: "07.annotation/02.taxonomy/01.mytaxa",
28
+ mytaxa_scan: "07.annotation/03.qa/02.mytaxa_scan",
29
+ # Distances (for single-species datasets)
30
+ distances: "09.distances", taxonomy: "09.distances/05.taxonomy",
31
+ # General statistics
32
+ stats: "90.stats"
33
+ }
34
+
35
+ ##
36
+ # Supported dataset types.
37
+ @@KNOWN_TYPES = {
38
+ genome: {description: "The genome from an isolate.", multi: false},
39
+ scgenome: {description: "A Single-cell Genome Amplification (SGA).",
40
+ multi: false},
41
+ popgenome: {description: "A population genome (including " +
42
+ "metagenomic bins).", :multi=>false},
43
+ metagenome: {description: "A metagenome (excluding viromes).",
44
+ multi: true},
45
+ virome: {description: "A viral metagenome.", multi: true}
46
+ }
47
+
48
+ ##
49
+ # Array of tasks (in execution order) to be run before project-wide tasks.
50
+ @@PREPROCESSING_TASKS = [:raw_reads, :trimmed_reads, :read_quality,
51
+ :trimmed_fasta, :assembly, :cds, :essential_genes, :ssu, :mytaxa,
52
+ :mytaxa_scan, :distances, :taxonomy, :stats]
53
+
54
+ ##
55
+ # Tasks to be excluded from query datasets.
56
+ @@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
57
+ @@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map{ |i| [i,true] }]
58
+
59
+ ##
60
+ # Tasks to be executed only in datasets that are not multi-organism. These
61
+ # tasks are ignored for multi-organism datasets or for unknown types.
62
+ @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :distances, :taxonomy]
63
+ @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map{ |i| [i,true] }]
64
+
65
+ ##
66
+ # Tasks to be executed only in datasets that are multi-organism. These
67
+ # tasks are ignored for single-organism datasets or for unknown types.
68
+ @@ONLY_MULTI_TASKS = [:mytaxa]
69
+ @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map{ |i| [i,true] }]
70
+
71
+
72
+ end
73
+
@@ -1,9 +1,113 @@
1
1
 
2
2
  require "sqlite3"
3
+ require "miga/result"
4
+ require "miga/dataset/base"
3
5
 
4
6
  ##
5
7
  # Helper module including specific functions to add dataset results.
6
- module MiGA::DatasetResult
8
+ module MiGA::Dataset::Result
9
+
10
+ include MiGA::Dataset::Base
11
+
12
+ ##
13
+ # Get the result MiGA::Result in this dataset identified by the symbol +k+.
14
+ def result(k)
15
+ return nil if @@RESULT_DIRS[k.to_sym].nil?
16
+ MiGA::Result.load(
17
+ "#{project.path}/data/#{@@RESULT_DIRS[k.to_sym]}/#{name}.json" )
18
+ end
19
+
20
+ ##
21
+ # Get all the results (Array of MiGA::Result) in this dataset.
22
+ def results ; @@RESULT_DIRS.keys.map{ |k| result k }.compact ; end
23
+
24
+ ##
25
+ # For each result executes the 2-ary +blk+ block: key symbol and MiGA::Result.
26
+ def each_result(&blk)
27
+ @@RESULT_DIRS.keys.each do |k|
28
+ blk.call(k, result(k)) unless result(k).nil?
29
+ end
30
+ end
31
+
32
+ ##
33
+ # Look for the result with symbol key +result_type+ and register it in the
34
+ # dataset. If +save+ is false, it doesn't register the result, but it still
35
+ # returns a result if the expected files are complete. The +opts+ hash
36
+ # controls result creation (if necessary). Supported values include:
37
+ # - +is_clean+: A Boolean indicating if the input files are clean.
38
+ # - +force+: A Boolean indicating if the result must be re-indexed. If true, it
39
+ # implies save=true.
40
+ # Returns MiGA::Result or nil.
41
+ def add_result(result_type, save=true, opts={})
42
+ dir = @@RESULT_DIRS[result_type]
43
+ return nil if dir.nil?
44
+ base = File.expand_path("data/#{dir}/#{name}", project.path)
45
+ unless opts[:force]
46
+ r_pre = MiGA::Result.load("#{base}.json")
47
+ return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
48
+ end
49
+ r = File.exist?("#{base}.done") ?
50
+ self.send("add_result_#{result_type}", base, opts) : nil
51
+ r.save unless r.nil?
52
+ r
53
+ end
54
+
55
+ ##
56
+ # Gets the MiGA::Result of type +result_type+ for this dataset. This is
57
+ # equivalent to +add_result(result_type, false)+.
58
+ def get_result(result_type) ; add_result(result_type, false) ; end
59
+
60
+ ##
61
+ # Returns the key symbol of the first registered result (sorted by the
62
+ # execution order). This typically corresponds to the result used as the
63
+ # initial input. Passes +save+ to #add_result.
64
+ def first_preprocessing(save=false)
65
+ @@PREPROCESSING_TASKS.find do |t|
66
+ not ignore_task?(t) and not add_result(t, save).nil?
67
+ end
68
+ end
69
+
70
+ ##
71
+ # Returns the key symbol of the next task that needs to be executed. Passes
72
+ # +save+ to #add_result.
73
+ def next_preprocessing(save=false)
74
+ after_first = false
75
+ first = first_preprocessing(save)
76
+ return nil if first.nil?
77
+ @@PREPROCESSING_TASKS.each do |t|
78
+ next if ignore_task? t
79
+ return t if after_first and add_result(t, save).nil?
80
+ after_first = (after_first or (t==first))
81
+ end
82
+ nil
83
+ end
84
+
85
+ ##
86
+ # Are all the dataset-specific tasks done? Passes +save+ to #add_result.
87
+ def done_preprocessing?(save=false)
88
+ !first_preprocessing(save).nil? and next_preprocessing(save).nil?
89
+ end
90
+
91
+ ##
92
+ # Returns an array indicating the stage of each task (sorted by execution
93
+ # order). The values are integers:
94
+ # - 0 for an undefined result (a task before the initial input).
95
+ # - 1 for a registered result (a completed task).
96
+ # - 2 for a queued result (a task yet to be executed).
97
+ # It passes +save+ to #add_result
98
+ def profile_advance(save=false)
99
+ first_task = first_preprocessing(save)
100
+ return Array.new(@@PREPROCESSING_TASKS.size, 0) if first_task.nil?
101
+ adv = []
102
+ state = 0
103
+ next_task = next_preprocessing(save)
104
+ @@PREPROCESSING_TASKS.each do |task|
105
+ state = 1 if first_task==task
106
+ state = 2 if !next_task.nil? and next_task==task
107
+ adv << state
108
+ end
109
+ adv
110
+ end
7
111
 
8
112
  ##
9
113
  # Clean-up all the stored distances, removing values for datasets no longer in
data/lib/miga/project.rb CHANGED
@@ -2,102 +2,18 @@
2
2
  # @license Artistic-2.0
3
3
 
4
4
  require "miga/dataset"
5
- require "miga/project_result"
5
+ require "miga/project/result"
6
+ require "miga/project/dataset"
7
+ require "miga/project/plugins"
6
8
 
7
9
  ##
8
10
  # MiGA representation of a project.
9
11
  class MiGA::Project < MiGA::MiGA
10
12
 
11
- include MiGA::ProjectResult
13
+ include MiGA::Project::Result
14
+ include MiGA::Project::Dataset
15
+ include MiGA::Project::Plugins
12
16
 
13
- # Class-level
14
-
15
- ##
16
- # Top-level folders inside a project.
17
- @@FOLDERS = %w[data metadata daemon]
18
-
19
- ##
20
- # Folders for results.
21
- @@DATA_FOLDERS = %w[
22
- 01.raw_reads 02.trimmed_reads 03.read_quality 04.trimmed_fasta
23
- 05.assembly 06.cds
24
- 07.annotation 07.annotation/01.function 07.annotation/02.taxonomy
25
- 07.annotation/01.function/01.essential
26
- 07.annotation/01.function/02.ssu
27
- 07.annotation/02.taxonomy/01.mytaxa
28
- 07.annotation/03.qa 07.annotation/03.qa/01.checkm
29
- 07.annotation/03.qa/02.mytaxa_scan
30
- 08.mapping 08.mapping/01.read-ctg 08.mapping/02.read-gene
31
- 09.distances 09.distances/01.haai 09.distances/02.aai
32
- 09.distances/03.ani 09.distances/04.ssu 09.distances/05.taxonomy
33
- 10.clades 10.clades/01.find 10.clades/02.ani 10.clades/03.ogs
34
- 10.clades/04.phylogeny 10.clades/04.phylogeny/01.essential
35
- 10.clades/04.phylogeny/02.core 10.clades/05.metadata
36
- 90.stats
37
- ]
38
-
39
- ##
40
- # Directories containing the results from project-wide tasks.
41
- def self.RESULT_DIRS ; @@RESULT_DIRS ; end
42
- @@RESULT_DIRS = {
43
- project_stats: "90.stats",
44
- # Distances
45
- haai_distances: "09.distances/01.haai",
46
- aai_distances: "09.distances/02.aai",
47
- ani_distances: "09.distances/03.ani",
48
- #ssu_distances: "09.distances/04.ssu",
49
- # Clade identification
50
- clade_finding: "10.clades/01.find",
51
- # Clade analysis
52
- subclades: "10.clades/02.ani",
53
- ogs: "10.clades/03.ogs"
54
- #ess_phylogeny: "10.clades/04.phylogeny/01.essential",
55
- #core_phylogeny: "10.clades/04.phylogeny/02.core",
56
- #clade_metadata: "10.clades/05.metadata"
57
- }
58
-
59
- ##
60
- # Supported types of projects.
61
- def self.KNOWN_TYPES ; @@KNOWN_TYPES ; end
62
- @@KNOWN_TYPES = {
63
- mixed: {
64
- description: "Mixed collection of genomes, metagenomes, and viromes.",
65
- single: true, multi: true},
66
- genomes: {description: "Collection of genomes.",
67
- single: true, multi: false},
68
- clade: {description: "Collection of closely-related genomes (ANI >= 90%).",
69
- single: true, multi: false},
70
- metagenomes: {description: "Collection of metagenomes and/or viromes.",
71
- single: false, multi: true}
72
- }
73
-
74
- ##
75
- # Project-wide distance estimations.
76
- def self.DISTANCE_TASKS ; @@DISTANCE_TASKS ; end
77
- @@DISTANCE_TASKS = [:project_stats,
78
- :haai_distances, :aai_distances, :ani_distances, :clade_finding]
79
-
80
- ##
81
- # Project-wide tasks for :clade projects.
82
- def self.INCLADE_TASKS ; @@INCLADE_TASKS ; end
83
- @@INCLADE_TASKS = [:subclades, :ogs]
84
-
85
- ##
86
- # Does the project at +path+ exist?
87
- def self.exist?(path)
88
- Dir.exist?(path) and File.exist?("#{path}/miga.project.json")
89
- end
90
-
91
- ##
92
- # Load the project at +path+. Returns MiGA::Project if project exists, nil
93
- # otherwise.
94
- def self.load(path)
95
- return nil unless Project.exist? path
96
- Project.new path
97
- end
98
-
99
- # Instance-level
100
-
101
17
  ##
102
18
  # Absolute path to the project folder.
103
19
  attr_reader :path
@@ -167,229 +83,4 @@ class MiGA::Project < MiGA::MiGA
167
83
  # Is this a project for multi-organism datasets?
168
84
  def is_multi? ; @@KNOWN_TYPES[type][:multi] ; end
169
85
 
170
- ##
171
- # Returns Array of MiGA::Dataset.
172
- def datasets
173
- metadata[:datasets].map{ |name| dataset(name) }
174
- end
175
-
176
- ##
177
- # Returns Array of String (without evaluating dataset objects).
178
- def dataset_names
179
- metadata[:datasets]
180
- end
181
-
182
- ##
183
- # Returns MiGA::Dataset.
184
- def dataset(name)
185
- name = name.miga_name
186
- return nil unless MiGA::Dataset.exist?(self, name)
187
- @datasets ||= {}
188
- @datasets[name] ||= MiGA::Dataset.new(self, name)
189
- @datasets[name]
190
- end
191
-
192
- ##
193
- # Iterate through datasets, with one or two variables passed to +blk+.
194
- # If one, the dataset MiGA::Dataset object is passed. If two, the name and
195
- # the dataset object are passed.
196
- def each_dataset(&blk)
197
- metadata[:datasets].each do |name|
198
- if blk.arity == 1
199
- blk.call(dataset(name))
200
- else
201
- blk.call(name, dataset(name))
202
- end
203
- end
204
- end
205
-
206
- ##
207
- # Add dataset identified by +name+ and return MiGA::Dataset.
208
- def add_dataset(name)
209
- unless metadata[:datasets].include? name
210
- MiGA::Dataset.new(self, name)
211
- @metadata[:datasets] << name
212
- save
213
- end
214
- dataset(name)
215
- end
216
-
217
- ##
218
- # Unlink dataset identified by +name+ and return MiGA::Dataset.
219
- def unlink_dataset(name)
220
- d = dataset(name)
221
- return nil if d.nil?
222
- self.metadata[:datasets].delete(name)
223
- save
224
- d
225
- end
226
-
227
- ##
228
- # Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any method
229
- # supported by File#generic_transfer.
230
- def import_dataset(ds, method=:hardlink)
231
- raise "Impossible to import dataset, it already exists: #{ds.name}." if
232
- MiGA::Dataset.exist?(self, ds.name)
233
- # Import dataset results
234
- ds.each_result do |task, result|
235
- # import result files
236
- result.each_file do |file|
237
- File.generic_transfer("#{result.dir}/#{file}",
238
- "#{path}/data/#{MiGA::Dataset.RESULT_DIRS[task]}/#{file}", method)
239
- end
240
- # import result metadata
241
- %w(json start done).each do |suffix|
242
- if File.exist? "#{result.dir}/#{ds.name}.#{suffix}"
243
- File.generic_transfer("#{result.dir}/#{ds.name}.#{suffix}",
244
- "#{path}/data/#{MiGA::Dataset.RESULT_DIRS[task]}/" +
245
- "#{ds.name}.#{suffix}", method)
246
- end
247
- end
248
- end
249
- # Import dataset metadata
250
- File.generic_transfer("#{ds.project.path}/metadata/#{ds.name}.json",
251
- "#{self.path}/metadata/#{ds.name}.json", method)
252
- # Save dataset
253
- self.add_dataset(ds.name)
254
- end
255
-
256
- ##
257
- # Get result identified by Symbol +name+, returns MiGA::Result.
258
- def result(name)
259
- dir = @@RESULT_DIRS[name.to_sym]
260
- return nil if dir.nil?
261
- MiGA::Result.load("#{path}/data/#{dir}/miga-project.json")
262
- end
263
-
264
- ##
265
- # Get all results, an Array of MiGA::Result.
266
- def results
267
- @@RESULT_DIRS.keys.map{ |k| result(k) }.reject{ |r| r.nil? }
268
- end
269
-
270
- ##
271
- # Add the result identified by Symbol +name+, and return MiGA::Result. Save
272
- # the result if +save+. The +opts+ hash controls result creation (if necessary).
273
- # Supported values include:
274
- # - +force+: A Boolean indicating if the result must be re-indexed. If true, it
275
- # implies save=true.
276
- def add_result(name, save=true, opts={})
277
- return nil if @@RESULT_DIRS[name].nil?
278
- base = "#{path}/data/#{@@RESULT_DIRS[name]}/miga-project"
279
- unless opts[:force]
280
- r_pre = MiGA::Result.load("#{base}.json")
281
- return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
282
- end
283
- r = result_files_exist?(base, ".done") ?
284
- send("add_result_#{name}", base) : nil
285
- r.save unless r.nil?
286
- r
287
- end
288
-
289
- ##
290
- # Get the next distances task, saving intermediate results if +save+. Returns
291
- # a Symbol.
292
- def next_distances(save=true) ; next_task(@@DISTANCE_TASKS, save) ; end
293
-
294
- ##
295
- # Get the next inclade task, saving intermediate results if +save+. Returns a
296
- # Symbol.
297
- def next_inclade(save=true) ; next_task(@@INCLADE_TASKS, save) ; end
298
-
299
- ##
300
- # Get the next task from +tasks+, saving intermediate results if +save+.
301
- # Returns a Symbol.
302
- def next_task(tasks=@@DISTANCE_TASKS+@@INCLADE_TASKS, save=true)
303
- tasks.find do |t|
304
- if metadata["run_#{t}"]==false or
305
- (!is_clade? and @@INCLADE_TASKS.include?(t) and
306
- metadata["run_#{t}"]!=true)
307
- false
308
- else
309
- add_result(t, save).nil?
310
- end
311
- end
312
- end
313
-
314
- ##
315
- # Find all datasets with (potential) result files but are yet unregistered.
316
- def unregistered_datasets
317
- datasets = []
318
- MiGA::Dataset.RESULT_DIRS.values.each do |dir|
319
- dir_p = "#{path}/data/#{dir}"
320
- next unless Dir.exist? dir_p
321
- Dir.entries(dir_p).each do |file|
322
- next unless
323
- file =~ %r{
324
- \.(fa(a|sta|stqc?)?|fna|solexaqa|gff[23]?|done|ess)(\.gz)?$
325
- }x
326
- m = /([^\.]+)/.match(file)
327
- datasets << m[1] unless m.nil? or m[1] == "miga-project"
328
- end
329
- end
330
- datasets.uniq - metadata[:datasets]
331
- end
332
-
333
- ##
334
- # Are all the datasets in the project preprocessed? Save intermediate results
335
- # if +save+.
336
- def done_preprocessing?(save=true)
337
- datasets.map{|ds| (not ds.is_ref?) or ds.done_preprocessing?(save) }.all?
338
- end
339
-
340
- ##
341
- # Returns a two-dimensional matrix (Array of Array) where the first index
342
- # corresponds to the dataset, the second index corresponds to the dataset
343
- # task, and the value corresponds to:
344
- # - 0: Before execution.
345
- # - 1: Done (or not required).
346
- # - 2: To do.
347
- def profile_datasets_advance
348
- advance = []
349
- self.each_dataset_profile_advance do |ds_adv|
350
- advance << ds_adv
351
- end
352
- advance
353
- end
354
-
355
- ##
356
- # Call +blk+ passing the result of MiGA::Dataset#profile_advance for each
357
- # registered dataset.
358
- def each_dataset_profile_advance(&blk)
359
- each_dataset { |ds| blk.call(ds.profile_advance) }
360
- end
361
-
362
- ##
363
- # Installs the plugin in the specified path.
364
- def install_plugin(path)
365
- abs_path = File.absolute_path(path)
366
- raise "Plugin already installed in project: #{abs_path}." unless
367
- metadata[:plugins].nil? or not metadata[:plugins].include?(abs_path)
368
- raise "Malformed MiGA plugin: #{abs_path}." unless
369
- File.exist?(File.expand_path("miga-plugin.json", abs_path))
370
- self.metadata[:plugins] ||= []
371
- self.metadata[:plugins] << abs_path
372
- save
373
- end
374
-
375
- ##
376
- # Uninstall the plugin in the specified path.
377
- def uninstall_plugin(path)
378
- abs_path = File.absolute_path(path)
379
- raise "Plugin not currently installed: #{abs_path}." if
380
- metadata[:plugins].nil? or not metadata[:plugins].include?(abs_path)
381
- self.metadata[:plugins].delete(abs_path)
382
- save
383
- end
384
-
385
- ##
386
- # List plugins installed in the project.
387
- def plugins ; metadata[:plugins] ||= [] ; end
388
-
389
- ##
390
- # Loads the plugins installed in the project.
391
- def load_plugins
392
- plugins.each { |pl| require File.expand_path("lib-plugin.rb", pl) }
393
- end
394
-
395
86
  end