miga-base 0.3.1.3 → 0.3.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ class MiGA::Dataset < MiGA::MiGA
5
+
6
+ # Class-level
7
+ class << self
8
+ def RESULT_DIRS ; @@RESULT_DIRS ; end
9
+ def KNOWN_TYPES ; @@KNOWN_TYPES ; end
10
+ def PREPROCESSING_TASKS ; @@PREPROCESSING_TASKS ; end
11
+ end
12
+
13
+ end
14
+
15
+ module MiGA::Dataset::Base
16
+
17
+ ##
18
+ # Directories containing the results from dataset-specific tasks.
19
+ @@RESULT_DIRS = {
20
+ # Preprocessing
21
+ raw_reads: "01.raw_reads", trimmed_reads: "02.trimmed_reads",
22
+ read_quality: "03.read_quality", trimmed_fasta: "04.trimmed_fasta",
23
+ assembly: "05.assembly", cds: "06.cds",
24
+ # Annotation
25
+ essential_genes: "07.annotation/01.function/01.essential",
26
+ ssu: "07.annotation/01.function/02.ssu",
27
+ mytaxa: "07.annotation/02.taxonomy/01.mytaxa",
28
+ mytaxa_scan: "07.annotation/03.qa/02.mytaxa_scan",
29
+ # Distances (for single-species datasets)
30
+ distances: "09.distances", taxonomy: "09.distances/05.taxonomy",
31
+ # General statistics
32
+ stats: "90.stats"
33
+ }
34
+
35
+ ##
36
+ # Supported dataset types.
37
+ @@KNOWN_TYPES = {
38
+ genome: {description: "The genome from an isolate.", multi: false},
39
+ scgenome: {description: "A Single-cell Genome Amplification (SGA).",
40
+ multi: false},
41
+ popgenome: {description: "A population genome (including " +
42
+ "metagenomic bins).", :multi=>false},
43
+ metagenome: {description: "A metagenome (excluding viromes).",
44
+ multi: true},
45
+ virome: {description: "A viral metagenome.", multi: true}
46
+ }
47
+
48
+ ##
49
+ # Array of tasks to be executed before project-wide tasks, in execution order.
50
+ @@PREPROCESSING_TASKS = [:raw_reads, :trimmed_reads, :read_quality,
51
+ :trimmed_fasta, :assembly, :cds, :essential_genes, :ssu, :mytaxa,
52
+ :mytaxa_scan, :distances, :taxonomy, :stats]
53
+
54
+ ##
55
+ # Tasks to be excluded from query datasets.
56
+ @@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
57
+ @@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map{ |i| [i,true] }]
58
+
59
+ ##
60
+ # Tasks to be executed only in datasets that are not multi-organism. These
61
+ # tasks are ignored for multi-organism datasets or for unknown types.
62
+ @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :distances, :taxonomy]
63
+ @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map{ |i| [i,true] }]
64
+
65
+ ##
66
+ # Tasks to be executed only in datasets that are multi-organism. These
67
+ # tasks are ignored for single-organism datasets or for unknown types.
68
+ @@ONLY_MULTI_TASKS = [:mytaxa]
69
+ @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map{ |i| [i,true] }]
70
+
71
+
72
+ end
73
+
@@ -1,9 +1,113 @@
1
1
 
2
2
  require "sqlite3"
3
+ require "miga/result"
4
+ require "miga/dataset/base"
3
5
 
4
6
  ##
5
7
  # Helper module including specific functions to add dataset results.
6
- module MiGA::DatasetResult
8
+ module MiGA::Dataset::Result
9
+
10
+ include MiGA::Dataset::Base
11
+
12
+ ##
13
+ # Get the result MiGA::Result in this dataset identified by the symbol +k+.
14
+ def result(k)
15
+ return nil if @@RESULT_DIRS[k.to_sym].nil?
16
+ MiGA::Result.load(
17
+ "#{project.path}/data/#{@@RESULT_DIRS[k.to_sym]}/#{name}.json" )
18
+ end
19
+
20
+ ##
21
+ # Get all the results (Array of MiGA::Result) in this dataset.
22
+ def results ; @@RESULT_DIRS.keys.map{ |k| result k }.compact ; end
23
+
24
+ ##
25
+ # For each existing result, calls the two-argument block +blk+ with the key symbol and the MiGA::Result.
26
+ def each_result(&blk)
27
+ @@RESULT_DIRS.keys.each do |k|
28
+ blk.call(k, result(k)) unless result(k).nil?
29
+ end
30
+ end
31
+
32
+ ##
33
+ # Look for the result with symbol key +result_type+ and register it in the
34
+ # dataset. If +save+ is false, it doesn't register the result, but it still
35
+ # returns a result if the expected files are complete. The +opts+ hash
36
+ # controls result creation (if necessary). Supported values include:
37
+ # - +is_clean+: A Boolean indicating if the input files are clean.
38
+ # - +force+: A Boolean indicating if the result must be re-indexed. If true, it
39
+ # implies save=true.
40
+ # Returns MiGA::Result or nil.
41
+ def add_result(result_type, save=true, opts={})
42
+ dir = @@RESULT_DIRS[result_type]
43
+ return nil if dir.nil?
44
+ base = File.expand_path("data/#{dir}/#{name}", project.path)
45
+ unless opts[:force]
46
+ r_pre = MiGA::Result.load("#{base}.json")
47
+ return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
48
+ end
49
+ r = File.exist?("#{base}.done") ?
50
+ self.send("add_result_#{result_type}", base, opts) : nil
51
+ r.save unless r.nil?
52
+ r
53
+ end
54
+
55
+ ##
56
+ # Gets a result as MiGA::Result for the datasets with +result_type+. This is
57
+ # equivalent to +add_result(result_type, false)+.
58
+ def get_result(result_type) ; add_result(result_type, false) ; end
59
+
60
+ ##
61
+ # Returns the key symbol of the first registered result (sorted by the
62
+ # execution order). This typically corresponds to the result used as the
63
+ # initial input. Passes +save+ to #add_result.
64
+ def first_preprocessing(save=false)
65
+ @@PREPROCESSING_TASKS.find do |t|
66
+ not ignore_task?(t) and not add_result(t, save).nil?
67
+ end
68
+ end
69
+
70
+ ##
71
+ # Returns the key symbol of the next task that needs to be executed. Passes
72
+ # +save+ to #add_result.
73
+ def next_preprocessing(save=false)
74
+ after_first = false
75
+ first = first_preprocessing(save)
76
+ return nil if first.nil?
77
+ @@PREPROCESSING_TASKS.each do |t|
78
+ next if ignore_task? t
79
+ return t if after_first and add_result(t, save).nil?
80
+ after_first = (after_first or (t==first))
81
+ end
82
+ nil
83
+ end
84
+
85
+ ##
86
+ # Are all the dataset-specific tasks done? Passes +save+ to #add_result.
87
+ def done_preprocessing?(save=false)
88
+ !first_preprocessing(save).nil? and next_preprocessing(save).nil?
89
+ end
90
+
91
+ ##
92
+ # Returns an array indicating the stage of each task (sorted by execution
93
+ # order). The values are integers:
94
+ # - 0 for an undefined result (a task before the initial input).
95
+ # - 1 for a registered result (a completed task).
96
+ # - 2 for a queued result (a task yet to be executed).
97
+ # It passes +save+ to #add_result
98
+ def profile_advance(save=false)
99
+ first_task = first_preprocessing(save)
100
+ return Array.new(@@PREPROCESSING_TASKS.size, 0) if first_task.nil?
101
+ adv = []
102
+ state = 0
103
+ next_task = next_preprocessing(save)
104
+ @@PREPROCESSING_TASKS.each do |task|
105
+ state = 1 if first_task==task
106
+ state = 2 if !next_task.nil? and next_task==task
107
+ adv << state
108
+ end
109
+ adv
110
+ end
7
111
 
8
112
  ##
9
113
  # Clean-up all the stored distances, removing values for datasets no longer in
data/lib/miga/project.rb CHANGED
@@ -2,102 +2,18 @@
2
2
  # @license Artistic-2.0
3
3
 
4
4
  require "miga/dataset"
5
- require "miga/project_result"
5
+ require "miga/project/result"
6
+ require "miga/project/dataset"
7
+ require "miga/project/plugins"
6
8
 
7
9
  ##
8
10
  # MiGA representation of a project.
9
11
  class MiGA::Project < MiGA::MiGA
10
12
 
11
- include MiGA::ProjectResult
13
+ include MiGA::Project::Result
14
+ include MiGA::Project::Dataset
15
+ include MiGA::Project::Plugins
12
16
 
13
- # Class-level
14
-
15
- ##
16
- # Top-level folders inside a project.
17
- @@FOLDERS = %w[data metadata daemon]
18
-
19
- ##
20
- # Folders for results.
21
- @@DATA_FOLDERS = %w[
22
- 01.raw_reads 02.trimmed_reads 03.read_quality 04.trimmed_fasta
23
- 05.assembly 06.cds
24
- 07.annotation 07.annotation/01.function 07.annotation/02.taxonomy
25
- 07.annotation/01.function/01.essential
26
- 07.annotation/01.function/02.ssu
27
- 07.annotation/02.taxonomy/01.mytaxa
28
- 07.annotation/03.qa 07.annotation/03.qa/01.checkm
29
- 07.annotation/03.qa/02.mytaxa_scan
30
- 08.mapping 08.mapping/01.read-ctg 08.mapping/02.read-gene
31
- 09.distances 09.distances/01.haai 09.distances/02.aai
32
- 09.distances/03.ani 09.distances/04.ssu 09.distances/05.taxonomy
33
- 10.clades 10.clades/01.find 10.clades/02.ani 10.clades/03.ogs
34
- 10.clades/04.phylogeny 10.clades/04.phylogeny/01.essential
35
- 10.clades/04.phylogeny/02.core 10.clades/05.metadata
36
- 90.stats
37
- ]
38
-
39
- ##
40
- # Directories containing the results from project-wide tasks.
41
- def self.RESULT_DIRS ; @@RESULT_DIRS ; end
42
- @@RESULT_DIRS = {
43
- project_stats: "90.stats",
44
- # Distances
45
- haai_distances: "09.distances/01.haai",
46
- aai_distances: "09.distances/02.aai",
47
- ani_distances: "09.distances/03.ani",
48
- #ssu_distances: "09.distances/04.ssu",
49
- # Clade identification
50
- clade_finding: "10.clades/01.find",
51
- # Clade analysis
52
- subclades: "10.clades/02.ani",
53
- ogs: "10.clades/03.ogs"
54
- #ess_phylogeny: "10.clades/04.phylogeny/01.essential",
55
- #core_phylogeny: "10.clades/04.phylogeny/02.core",
56
- #clade_metadata: "10.clades/05.metadata"
57
- }
58
-
59
- ##
60
- # Supported types of projects.
61
- def self.KNOWN_TYPES ; @@KNOWN_TYPES ; end
62
- @@KNOWN_TYPES = {
63
- mixed: {
64
- description: "Mixed collection of genomes, metagenomes, and viromes.",
65
- single: true, multi: true},
66
- genomes: {description: "Collection of genomes.",
67
- single: true, multi: false},
68
- clade: {description: "Collection of closely-related genomes (ANI >= 90%).",
69
- single: true, multi: false},
70
- metagenomes: {description: "Collection of metagenomes and/or viromes.",
71
- single: false, multi: true}
72
- }
73
-
74
- ##
75
- # Project-wide distance estimations.
76
- def self.DISTANCE_TASKS ; @@DISTANCE_TASKS ; end
77
- @@DISTANCE_TASKS = [:project_stats,
78
- :haai_distances, :aai_distances, :ani_distances, :clade_finding]
79
-
80
- ##
81
- # Project-wide tasks for :clade projects.
82
- def self.INCLADE_TASKS ; @@INCLADE_TASKS ; end
83
- @@INCLADE_TASKS = [:subclades, :ogs]
84
-
85
- ##
86
- # Does the project at +path+ exist?
87
- def self.exist?(path)
88
- Dir.exist?(path) and File.exist?("#{path}/miga.project.json")
89
- end
90
-
91
- ##
92
- # Load the project at +path+. Returns MiGA::Project if project exists, nil
93
- # otherwise.
94
- def self.load(path)
95
- return nil unless Project.exist? path
96
- Project.new path
97
- end
98
-
99
- # Instance-level
100
-
101
17
  ##
102
18
  # Absolute path to the project folder.
103
19
  attr_reader :path
@@ -167,229 +83,4 @@ class MiGA::Project < MiGA::MiGA
167
83
  # Is this a project for multi-organism datasets?
168
84
  def is_multi? ; @@KNOWN_TYPES[type][:multi] ; end
169
85
 
170
- ##
171
- # Returns Array of MiGA::Dataset.
172
- def datasets
173
- metadata[:datasets].map{ |name| dataset(name) }
174
- end
175
-
176
- ##
177
- # Returns Array of String (without evaluating dataset objects).
178
- def dataset_names
179
- metadata[:datasets]
180
- end
181
-
182
- ##
183
- # Returns MiGA::Dataset.
184
- def dataset(name)
185
- name = name.miga_name
186
- return nil unless MiGA::Dataset.exist?(self, name)
187
- @datasets ||= {}
188
- @datasets[name] ||= MiGA::Dataset.new(self, name)
189
- @datasets[name]
190
- end
191
-
192
- ##
193
- # Iterate through datasets, with one or two variables passed to +blk+.
194
- # If one, the dataset MiGA::Dataset object is passed. If two, the name and
195
- # the dataset object are passed.
196
- def each_dataset(&blk)
197
- metadata[:datasets].each do |name|
198
- if blk.arity == 1
199
- blk.call(dataset(name))
200
- else
201
- blk.call(name, dataset(name))
202
- end
203
- end
204
- end
205
-
206
- ##
207
- # Add dataset identified by +name+ and return MiGA::Dataset.
208
- def add_dataset(name)
209
- unless metadata[:datasets].include? name
210
- MiGA::Dataset.new(self, name)
211
- @metadata[:datasets] << name
212
- save
213
- end
214
- dataset(name)
215
- end
216
-
217
- ##
218
- # Unlink dataset identified by +name+ and return MiGA::Dataset.
219
- def unlink_dataset(name)
220
- d = dataset(name)
221
- return nil if d.nil?
222
- self.metadata[:datasets].delete(name)
223
- save
224
- d
225
- end
226
-
227
- ##
228
- # Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any method
229
- # supported by File#generic_transfer.
230
- def import_dataset(ds, method=:hardlink)
231
- raise "Impossible to import dataset, it already exists: #{ds.name}." if
232
- MiGA::Dataset.exist?(self, ds.name)
233
- # Import dataset results
234
- ds.each_result do |task, result|
235
- # import result files
236
- result.each_file do |file|
237
- File.generic_transfer("#{result.dir}/#{file}",
238
- "#{path}/data/#{MiGA::Dataset.RESULT_DIRS[task]}/#{file}", method)
239
- end
240
- # import result metadata
241
- %w(json start done).each do |suffix|
242
- if File.exist? "#{result.dir}/#{ds.name}.#{suffix}"
243
- File.generic_transfer("#{result.dir}/#{ds.name}.#{suffix}",
244
- "#{path}/data/#{MiGA::Dataset.RESULT_DIRS[task]}/" +
245
- "#{ds.name}.#{suffix}", method)
246
- end
247
- end
248
- end
249
- # Import dataset metadata
250
- File.generic_transfer("#{ds.project.path}/metadata/#{ds.name}.json",
251
- "#{self.path}/metadata/#{ds.name}.json", method)
252
- # Save dataset
253
- self.add_dataset(ds.name)
254
- end
255
-
256
- ##
257
- # Get result identified by Symbol +name+, returns MiGA::Result.
258
- def result(name)
259
- dir = @@RESULT_DIRS[name.to_sym]
260
- return nil if dir.nil?
261
- MiGA::Result.load("#{path}/data/#{dir}/miga-project.json")
262
- end
263
-
264
- ##
265
- # Get all results, an Array of MiGA::Result.
266
- def results
267
- @@RESULT_DIRS.keys.map{ |k| result(k) }.reject{ |r| r.nil? }
268
- end
269
-
270
- ##
271
- # Add the result identified by Symbol +name+, and return MiGA::Result. Save
272
- # the result if +save+. The +opts+ hash controls result creation (if necessary).
273
- # Supported values include:
274
- # - +force+: A Boolean indicating if the result must be re-indexed. If true, it
275
- # implies save=true.
276
- def add_result(name, save=true, opts={})
277
- return nil if @@RESULT_DIRS[name].nil?
278
- base = "#{path}/data/#{@@RESULT_DIRS[name]}/miga-project"
279
- unless opts[:force]
280
- r_pre = MiGA::Result.load("#{base}.json")
281
- return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
282
- end
283
- r = result_files_exist?(base, ".done") ?
284
- send("add_result_#{name}", base) : nil
285
- r.save unless r.nil?
286
- r
287
- end
288
-
289
- ##
290
- # Get the next distances task, saving intermediate results if +save+. Returns
291
- # a Symbol.
292
- def next_distances(save=true) ; next_task(@@DISTANCE_TASKS, save) ; end
293
-
294
- ##
295
- # Get the next inclade task, saving intermediate results if +save+. Returns a
296
- # Symbol.
297
- def next_inclade(save=true) ; next_task(@@INCLADE_TASKS, save) ; end
298
-
299
- ##
300
- # Get the next task from +tasks+, saving intermediate results if +save+.
301
- # Returns a Symbol.
302
- def next_task(tasks=@@DISTANCE_TASKS+@@INCLADE_TASKS, save=true)
303
- tasks.find do |t|
304
- if metadata["run_#{t}"]==false or
305
- (!is_clade? and @@INCLADE_TASKS.include?(t) and
306
- metadata["run_#{t}"]!=true)
307
- false
308
- else
309
- add_result(t, save).nil?
310
- end
311
- end
312
- end
313
-
314
- ##
315
- # Find all datasets with (potential) result files but are yet unregistered.
316
- def unregistered_datasets
317
- datasets = []
318
- MiGA::Dataset.RESULT_DIRS.values.each do |dir|
319
- dir_p = "#{path}/data/#{dir}"
320
- next unless Dir.exist? dir_p
321
- Dir.entries(dir_p).each do |file|
322
- next unless
323
- file =~ %r{
324
- \.(fa(a|sta|stqc?)?|fna|solexaqa|gff[23]?|done|ess)(\.gz)?$
325
- }x
326
- m = /([^\.]+)/.match(file)
327
- datasets << m[1] unless m.nil? or m[1] == "miga-project"
328
- end
329
- end
330
- datasets.uniq - metadata[:datasets]
331
- end
332
-
333
- ##
334
- # Are all the datasets in the project preprocessed? Save intermediate results
335
- # if +save+.
336
- def done_preprocessing?(save=true)
337
- datasets.map{|ds| (not ds.is_ref?) or ds.done_preprocessing?(save) }.all?
338
- end
339
-
340
- ##
341
- # Returns a two-dimensional matrix (Array of Array) where the first index
342
- # corresponds to the dataset, the second index corresponds to the dataset
343
- # task, and the value corresponds to:
344
- # - 0: Before execution.
345
- # - 1: Done (or not required).
346
- # - 2: To do.
347
- def profile_datasets_advance
348
- advance = []
349
- self.each_dataset_profile_advance do |ds_adv|
350
- advance << ds_adv
351
- end
352
- advance
353
- end
354
-
355
- ##
356
- # Call +blk+ passing the result of MiGA::Dataset#profile_advance for each
357
- # registered dataset.
358
- def each_dataset_profile_advance(&blk)
359
- each_dataset { |ds| blk.call(ds.profile_advance) }
360
- end
361
-
362
- ##
363
- # Installs the plugin in the specified path.
364
- def install_plugin(path)
365
- abs_path = File.absolute_path(path)
366
- raise "Plugin already installed in project: #{abs_path}." unless
367
- metadata[:plugins].nil? or not metadata[:plugins].include?(abs_path)
368
- raise "Malformed MiGA plugin: #{abs_path}." unless
369
- File.exist?(File.expand_path("miga-plugin.json", abs_path))
370
- self.metadata[:plugins] ||= []
371
- self.metadata[:plugins] << abs_path
372
- save
373
- end
374
-
375
- ##
376
- # Uninstall the plugin in the specified path.
377
- def uninstall_plugin(path)
378
- abs_path = File.absolute_path(path)
379
- raise "Plugin not currently installed: #{abs_path}." if
380
- metadata[:plugins].nil? or not metadata[:plugins].include?(abs_path)
381
- self.metadata[:plugins].delete(abs_path)
382
- save
383
- end
384
-
385
- ##
386
- # List plugins installed in the project.
387
- def plugins ; metadata[:plugins] ||= [] ; end
388
-
389
- ##
390
- # Loads the plugins installed in the project.
391
- def load_plugins
392
- plugins.each { |pl| require File.expand_path("lib-plugin.rb", pl) }
393
- end
394
-
395
86
  end