miga-base 0.7.9.0 → 0.7.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -0
  3. data/lib/miga/cli/action/browse.rb +213 -0
  4. data/lib/miga/cli/action/browse/about.html +31 -0
  5. data/lib/miga/cli/action/browse/dataset.html +5 -0
  6. data/lib/miga/cli/action/browse/dataset_menu_item.html +3 -0
  7. data/lib/miga/cli/action/browse/datasets.html +4 -0
  8. data/lib/miga/cli/action/browse/favicon-32.png +0 -0
  9. data/lib/miga/cli/action/browse/index.html +8 -0
  10. data/lib/miga/cli/action/browse/layout.html +57 -0
  11. data/lib/miga/cli/action/browse/redirect.html +11 -0
  12. data/lib/miga/cli/action/browse/style.css +97 -0
  13. data/lib/miga/cli/action/classify_wf.rb +3 -1
  14. data/lib/miga/cli/action/derep_wf.rb +4 -0
  15. data/lib/miga/cli/action/edit.rb +9 -6
  16. data/lib/miga/cli/action/quality_wf.rb +4 -1
  17. data/lib/miga/cli/action/stats.rb +1 -1
  18. data/lib/miga/cli/action/wf.rb +11 -3
  19. data/lib/miga/cli/base.rb +27 -26
  20. data/lib/miga/common/format.rb +26 -6
  21. data/lib/miga/daemon.rb +6 -4
  22. data/lib/miga/dataset.rb +5 -1
  23. data/lib/miga/dataset/base.rb +3 -3
  24. data/lib/miga/dataset/hooks.rb +4 -4
  25. data/lib/miga/dataset/result.rb +18 -14
  26. data/lib/miga/lair.rb +1 -1
  27. data/lib/miga/project/dataset.rb +3 -5
  28. data/lib/miga/project/hooks.rb +4 -3
  29. data/lib/miga/remote_dataset/download.rb +2 -1
  30. data/lib/miga/result.rb +3 -1
  31. data/lib/miga/result/stats.rb +55 -23
  32. data/lib/miga/version.rb +2 -2
  33. data/scripts/cds.bash +0 -1
  34. data/scripts/distances.bash +6 -1
  35. data/test/daemon_test.rb +1 -1
  36. data/test/dataset_test.rb +3 -1
  37. data/test/project_test.rb +1 -1
  38. data/test/remote_dataset_test.rb +1 -1
  39. metadata +16 -6
@@ -42,7 +42,9 @@ class MiGA::Cli::Action::ClassifyWf < MiGA::Cli::Action
42
42
  '--no-summaries',
43
43
  'Do not generate intermediate step summaries'
44
44
  ) { |v| cli[:summaries] = v }
45
- opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
45
+ opts_for_wf(
46
+ opt, 'Input genome assemblies (nucleotides, FastA)', qual: false
47
+ )
46
48
  end
47
49
  end
48
50
 
@@ -19,6 +19,10 @@ class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
19
19
  'Use Average Amino Acid Identity (AAI) as genome similarity metric',
20
20
  'By default: Use Average Nucleotide Identity (ANI)'
21
21
  ) { cli[:metric] = :aai }
22
+ opt.on(
23
+ '--ani',
24
+ 'Use Average Nucleotide Identity (ANI) as similarity metric (default)'
25
+ ) { cli[:metric] = :ani }
22
26
  opt.on(
23
27
  '--threshold FLOAT', Float,
24
28
  "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
@@ -17,18 +17,21 @@ class MiGA::Cli::Action::Edit < MiGA::Cli::Action
17
17
  'Activate dataset; requires -D'
18
18
  ) { |v| cli[:activate] = v }
19
19
  opt.on(
20
- '--inactivate',
21
- 'Inactivate dataset; requires -D'
22
- ) { |v| cli[:activate] = !v }
20
+ '--inactivate [reason]',
21
+ 'Inactivate dataset; requires -D',
22
+ 'The argument is optional: reason to inactivate dataset'
23
+ ) { |v| cli[:activate] = false ; cli[:reason] = v }
23
24
  end
24
25
  end
25
26
 
26
27
  def perform
27
28
  obj = cli.load_project_or_dataset
28
29
  unless cli[:activate].nil?
29
- cli.ensure_par({ dataset: '-D' },
30
- '%<name>s is mandatory with --[in-]activate: please provide %<flag>s')
31
- cli[:activate] ? obj.activate! : obj.inactivate!
30
+ cli.ensure_par(
31
+ { dataset: '-D' },
32
+ '%<name>s is mandatory with --[in-]activate: please provide %<flag>s'
33
+ )
34
+ cli[:activate] ? obj.activate! : obj.inactivate!(cli[:reason])
32
35
  end
33
36
  cli.add_metadata(obj)
34
37
  obj.save
@@ -15,7 +15,10 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
15
15
  '-m', '--mytaxa-scan',
16
16
  'Perform MyTaxa scan analysis'
17
17
  ) { |v| cli[:mytaxa] = v }
18
- opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
18
+ opts_for_wf(
19
+ opt, 'Input genome assemblies (nucleotides, FastA)',
20
+ qual: false
21
+ )
19
22
  end
20
23
  end
21
24
 
@@ -38,7 +38,7 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
38
38
  end
39
39
  if cli[:key].nil?
40
40
  r[:stats].each do |k, v|
41
- k_n = k == :g_c_content ? 'G+C content' : k.to_s.unmiga_name.capitalize
41
+ k_n = k.to_s.unmiga_name.sub(/^./, &:upcase)
42
42
  cli.puts "#{k_n}: #{v.is_a?(Array) ? v.join(' ') : v}"
43
43
  end
44
44
  else
@@ -15,7 +15,7 @@ module MiGA::Cli::Action::Wf
15
15
 
16
16
  def opts_for_wf(opt, files_desc, params = {})
17
17
  {
18
- multi: false, cleanup: true, project_type: false, ncbi: true
18
+ multi: false, cleanup: true, project_type: false, ncbi: true, qual: true
19
19
  }.each { |k, v| params[k] = v if params[k].nil? }
20
20
  opt.on(
21
21
  '-o', '--out_dir PATH',
@@ -40,6 +40,13 @@ module MiGA::Cli::Action::Wf
40
40
  'Only download complete genomes, not drafts'
41
41
  ) { |v| cli[:ncbi_draft] = v }
42
42
  end
43
+ if params[:qual]
44
+ opt.on(
45
+ '--min-qual FLOAT', Float,
46
+ 'Minimum genome quality to include in analysis',
47
+ 'By default: 50.0'
48
+ ) { |v| cli[:min_qual] = v }
49
+ end
43
50
  if params[:cleanup]
44
51
  opt.on(
45
52
  '-c', '--clean',
@@ -125,7 +132,7 @@ module MiGA::Cli::Action::Wf
125
132
  ]) unless MiGA::Project.exist? cli[:outdir]
126
133
  # Define project metadata
127
134
  p = cli.load_project(:outdir, '-o')
128
- [:haai_p, :aai_p, :ani_p, :ess_coll].each { |i| p_metadata[i] = cli[i] }
135
+ %i[haai_p aai_p ani_p ess_coll min_qual].each { |i| p_metadata[i] = cli[i] }
129
136
  p_metadata[:type] = cli[:project_type]
130
137
  transfer_metadata(p, p_metadata)
131
138
  # Download datasets
@@ -159,9 +166,10 @@ module MiGA::Cli::Action::Wf
159
166
  '-P', cli[:outdir],
160
167
  '-r', r,
161
168
  '-o', File.expand_path("#{r}.tsv", cli[:outdir]),
162
- '--tab'
169
+ '--tab', '--ref', '--active'
163
170
  ])
164
171
  end
172
+ call_cli(['browse', '-P', cli[:outdir]])
165
173
  end
166
174
 
167
175
  def cleanup
@@ -11,39 +11,40 @@ module MiGA::Cli::Base
11
11
  preproc_wf: 'Preprocess input genomes or metagenomes',
12
12
  index_wf: 'Generate distance indexing of input genomes',
13
13
  # Projects
14
- new: 'Creates an empty MiGA project',
15
- about: 'Displays information about a MiGA project',
16
- doctor: 'Performs consistency checks on a MiGA project',
17
- get_db: 'Downloads a pre-indexed database',
14
+ new: 'Create an empty MiGA project',
15
+ about: 'Display information about a MiGA project',
16
+ doctor: 'Perform consistency checks on a MiGA project',
17
+ get_db: 'Download a pre-indexed database',
18
+ browse: 'Explore a project locally using a web browser',
18
19
  # Datasets
19
- add: 'Creates a dataset in a MiGA project',
20
- get: 'Downloads a dataset from public databases into a MiGA project',
21
- ncbi_get: 'Downloads all genomes in a taxon from NCBI into a MiGA project',
22
- rm: 'Removes a dataset from an MiGA project',
23
- find: 'Finds unregistered datasets based on result files',
20
+ add: 'Create a dataset in a MiGA project',
21
+ get: 'Download a dataset from public databases into a MiGA project',
22
+ ncbi_get: 'Download all genomes in a taxon from NCBI into a MiGA project',
23
+ rm: 'Remove a dataset from an MiGA project',
24
+ find: 'Find unregistered datasets based on result files',
24
25
  ln: 'Link datasets (including results) from one project to another',
25
- ls: 'Lists all registered datasets in an MiGA project',
26
- archive: 'Generates a tar-ball with all files from select datasets',
26
+ ls: 'List all registered datasets in an MiGA project',
27
+ archive: 'Generate a tar-ball with all files from select datasets',
27
28
  # Results
28
- add_result: 'Registers a result',
29
- stats: 'Extracts statistics for the given result',
30
- files: 'Lists registered files from the results of a dataset or project',
31
- run: 'Executes locally one step analysis producing the given result',
32
- summary: 'Generates a summary table for the statistics of all datasets',
33
- next_step: 'Returns the next task to run in a dataset or project',
29
+ add_result: 'Register a result',
30
+ stats: 'Extract statistics for the given result',
31
+ files: 'List registered files from the results of a dataset or project',
32
+ run: 'Execute locally one step analysis producing the given result',
33
+ summary: 'Generate a summary table for the statistics of all datasets',
34
+ next_step: 'Return the next task to run in a dataset or project',
34
35
  # Objects (Datasets or Projects)
35
- edit: 'Edits the metadata of a dataset or project',
36
+ edit: 'Edit the metadata of a dataset or project',
36
37
  # System
37
38
  init: 'Initialize MiGA to process new projects',
38
- daemon: 'Controls the daemon of a MiGA project',
39
- lair: 'Controls groups of daemons for several MiGA projects',
40
- date: 'Returns the current date in standard MiGA format',
41
- console: 'Opens an IRB console with MiGA',
39
+ daemon: 'Control the daemon of a MiGA project',
40
+ lair: 'Control groups of daemons for several MiGA projects',
41
+ date: 'Return the current date in standard MiGA format',
42
+ console: 'Open an IRB console with MiGA',
42
43
  # Taxonomy
43
- tax_set: 'Registers taxonomic information for datasets',
44
- tax_test: 'Returns test of taxonomic distributions for query datasets',
45
- tax_index: 'Creates a taxonomy-indexed list of the datasets',
46
- tax_dist: 'Estimates distributions of distance by taxonomy',
44
+ tax_set: 'Register taxonomic information for datasets',
45
+ tax_test: 'Return test of taxonomic distributions for query datasets',
46
+ tax_index: 'Create a taxonomy-indexed list of the datasets',
47
+ tax_dist: 'Estimate distributions of distance by taxonomy',
47
48
  }
48
49
 
49
50
  @@TASK_ALIAS = {
@@ -68,15 +68,20 @@ module MiGA::Common::Format
68
68
  # a FastA or FastQ file (supports gzipped files). The +format+ must be a
69
69
  # Symbol, one of +:fasta+ or +:fastq+. Additional estimations can be
70
70
  # controlled via the +opts+ Hash. Supported options include:
71
- # - +:n50+: If true, it also returns the N50 and the median (in bp)
72
- # - +:gc+: If true, it also returns the G+C content (in %)
73
- # - +:x+: If true, it also returns the undetermined bases content (in %)
71
+ # - +:n50+: Include the N50 and the median (in bp)
72
+ # - +:gc+: Include the G+C content (in %)
73
+ # - +:x+: Include the undetermined bases content (in %)
74
+ # - +:skew+: Include G-C and A-T sequence skew (in %; forces gc: true).
75
+ # See definition used here in DOI:10.1177/117693430700300006
74
76
  def seqs_length(file, format, opts = {})
77
+ opts[:gc] = true if opts[:skew]
75
78
  fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
76
79
  l = []
77
80
  gc = 0
78
81
  xn = 0
79
- i = 0 # <- Zlib::GzipReader doesn't set `$.`
82
+ t = 0
83
+ c = 0
84
+ i = 0 # <- Zlib::GzipReader doesn't set `$.`
80
85
  fh.each_line do |ln|
81
86
  i += 1
82
87
  if (format == :fasta and ln =~ /^>/) or
@@ -86,6 +91,10 @@ module MiGA::Common::Format
86
91
  l[l.size - 1] += ln.chomp.size
87
92
  gc += ln.scan(/[GCgc]/).count if opts[:gc]
88
93
  xn += ln.scan(/[XNxn]/).count if opts[:x]
94
+ if opts[:skew]
95
+ t += ln.scan(/[Tt]/).count
96
+ c += ln.scan(/[Cc]/).count
97
+ end
89
98
  end
90
99
  end
91
100
  fh.close
@@ -97,6 +106,12 @@ module MiGA::Common::Format
97
106
  o[:sd] = Math.sqrt o[:var]
98
107
  o[:gc] = 100.0 * gc / o[:tot] if opts[:gc]
99
108
  o[:x] = 100.0 * xn / o[:tot] if opts[:x]
109
+ if opts[:skew]
110
+ at = o[:tot] - gc
111
+ o[:at_skew] = 100.0 * (2 * t - at) / at
112
+ o[:gc_skew] = 100.0 * (2 * c - gc) / gc
113
+ end
114
+
100
115
  if opts[:n50]
101
116
  l.sort!
102
117
  thr = o[:tot] / 2
@@ -132,9 +147,14 @@ class String
132
147
  end
133
148
 
134
149
  ##
135
- # Replace underscores by spaces or dots (depending on context).
150
+ # Replace underscores by spaces or other symbols depending on context
136
151
  def unmiga_name
137
- gsub(/_(str|sp|subsp|pv)__/, '_\\1._').tr('_', ' ')
152
+ gsub(/_(str|sp|subsp|pv)__/, '_\\1._')
153
+ .gsub(/g_c_(content)/, 'G+C \\1')
154
+ .gsub(/g_c_(skew)/, 'G-C \\1')
155
+ .gsub(/a_t_(skew)/, 'A-T \\1')
156
+ .gsub(/x_content/, &:capitalize)
157
+ .tr('_', ' ')
138
158
  end
139
159
 
140
160
  ##
@@ -72,6 +72,7 @@ class MiGA::Daemon < MiGA::MiGA
72
72
  say '-----------------------------------'
73
73
  say 'MiGA:%s launched' % project.name
74
74
  say '-----------------------------------'
75
+ recalculate_status!
75
76
  load_status
76
77
  say 'Configuration options:'
77
78
  say @runopts.to_s
@@ -99,6 +100,7 @@ class MiGA::Daemon < MiGA::MiGA
99
100
  end
100
101
 
101
102
  def recalculate_status!
103
+ say 'Recalculating status for all datasets'
102
104
  project.each_dataset(&:recalculate_status)
103
105
  end
104
106
 
@@ -158,8 +160,8 @@ class MiGA::Daemon < MiGA::MiGA
158
160
  end
159
161
 
160
162
  ##
161
- # Traverse datasets, and returns boolean indicating if at any datasets
162
- # are incomplete
163
+ # Traverse datasets, and returns boolean indicating if at any reference
164
+ # datasets are incomplete
163
165
  def check_datasets
164
166
  l_say(2, 'Checking datasets')
165
167
  o = false
@@ -167,7 +169,7 @@ class MiGA::Daemon < MiGA::MiGA
167
169
  next unless ds.status == :incomplete
168
170
  next if ds.next_preprocessing(false).nil?
169
171
 
170
- o = true
172
+ o = true if ds.ref?
171
173
  queue_job(:d, ds)
172
174
  end
173
175
  o
@@ -183,7 +185,7 @@ class MiGA::Daemon < MiGA::MiGA
183
185
  return if project.dataset_names.empty?
184
186
 
185
187
  # Double-check if all datasets are ready
186
- return unless project.done_preprocessing?(false)
188
+ return unless project.done_preprocessing?
187
189
 
188
190
  # Queue project-level job
189
191
  to_run = project.next_task(nil, false)
@@ -97,7 +97,10 @@ class MiGA::Dataset < MiGA::MiGA
97
97
 
98
98
  ##
99
99
  # Inactivate a dataset. This halts automated processing by the daemon
100
- def inactivate!
100
+ #
101
+ # If given, the +reason+ string is saved as a metadata +:warn+ entry
102
+ def inactivate!(reason = nil)
103
+ metadata[:warn] = "Inactive: #{reason}" unless reason.nil?
101
104
  metadata[:inactive] = true
102
105
  metadata.save
103
106
  pull_hook :on_inactivate
@@ -107,6 +110,7 @@ class MiGA::Dataset < MiGA::MiGA
107
110
  # Activate a dataset. This removes the +:inactive+ flag
108
111
  def activate!
109
112
  metadata[:inactive] = nil
113
+ metadata[:warn] = nil if metadata[:warn] && metadata[:warn] =~ /^Inactive: /
110
114
  metadata.save
111
115
  pull_hook :on_activate
112
116
  end
@@ -35,8 +35,8 @@ module MiGA::Dataset::Base
35
35
  mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
36
36
  mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',
37
37
  # Distances (for single-species datasets)
38
- distances: '09.distances',
39
38
  taxonomy: '09.distances/05.taxonomy',
39
+ distances: '09.distances',
40
40
  # General statistics
41
41
  stats: '90.stats'
42
42
  }
@@ -66,7 +66,7 @@ module MiGA::Dataset::Base
66
66
  @@PREPROCESSING_TASKS = [
67
67
  :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
68
68
  :assembly, :cds, :essential_genes, :ssu, :mytaxa, :mytaxa_scan,
69
- :distances, :taxonomy, :stats
69
+ :taxonomy, :distances, :stats
70
70
  ]
71
71
 
72
72
  ##
@@ -77,7 +77,7 @@ module MiGA::Dataset::Base
77
77
  ##
78
78
  # Tasks to be executed only in datasets that are not multi-organism. These
79
79
  # tasks are ignored for multi-organism datasets or for unknown types.
80
- @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :distances, :taxonomy]
80
+ @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances]
81
81
  @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
82
82
 
83
83
  ##
@@ -52,15 +52,15 @@ module MiGA::Dataset::Hooks
52
52
  end
53
53
 
54
54
  ##
55
- # Run +cmd+ in the command-line with {{variables}}: dataset, project, miga,
56
- # object (as defined for the event, if any)
55
+ # Run +cmd+ in the command-line with {{variables}}:
56
+ # dataset, project, project_name, miga, object (if defined for the event)
57
57
  # - +hook_args+: +[cmd]+
58
58
  # - +event_args+: +[object (optional)]+
59
59
  def hook_run_cmd(hook_args, event_args)
60
60
  Process.wait(
61
61
  spawn hook_args.first.miga_variables(
62
- dataset: name, project: project.path, miga: MiGA::MiGA.root_path,
63
- object: event_args.first
62
+ dataset: name, project: project.path, project_name: project.name,
63
+ miga: MiGA::MiGA.root_path, object: event_args.first
64
64
  )
65
65
  )
66
66
  end
@@ -26,15 +26,24 @@ module MiGA::Dataset::Result
26
26
  # The values are symbols:
27
27
  # - empty: the dataset has no data
28
28
  # - inactive: the dataset is inactive
29
+ # - upstream: the task is upstream from dataset's input
29
30
  # - force: forced to ignore by metadata
30
31
  # - project: incompatible project
31
32
  # - noref: incompatible dataset, only for reference
32
33
  # - multi: incompatible dataset, only for multi
33
34
  # - nonmulti: incompatible dataset, only for nonmulti
35
+ # - complete: the task is already complete
34
36
  # - execute: do not ignore, execute the task
35
37
  def why_ignore(task)
36
- if !active?
38
+ if !get_result(task).nil?
39
+ :complete
40
+ elsif !active?
37
41
  :inactive
42
+ elsif first_preprocessing.nil?
43
+ :empty
44
+ elsif @@PREPROCESSING_TASKS.index(task) <
45
+ @@PREPROCESSING_TASKS.index(first_preprocessing)
46
+ :upstream
38
47
  elsif !metadata["run_#{task}"].nil?
39
48
  metadata["run_#{task}"] ? :execute : :force
40
49
  elsif task == :taxonomy && project.metadata[:ref_project].nil?
@@ -56,7 +65,7 @@ module MiGA::Dataset::Result
56
65
  # initial input. Passes +save+ to #add_result.
57
66
  def first_preprocessing(save = false)
58
67
  @first_processing ||= @@PREPROCESSING_TASKS.find do |t|
59
- !ignore_task?(t) && !add_result(t, save).nil?
68
+ !add_result(t, save).nil?
60
69
  end
61
70
  end
62
71
 
@@ -70,7 +79,7 @@ module MiGA::Dataset::Result
70
79
  false
71
80
  elsif add_result(t, save).nil?
72
81
  if (metadata["_try_#{t}"] || 0) > (project.metadata[:max_try] || 10)
73
- inactivate!
82
+ inactivate! "Too many errors in step #{t}"
74
83
  false
75
84
  else
76
85
  true
@@ -121,17 +130,12 @@ module MiGA::Dataset::Result
121
130
  # - complete: a task with registered results
122
131
  # - pending: a task queued to be performed
123
132
  def result_status(task)
124
- if first_preprocessing.nil?
125
- :ignore_empty
126
- elsif !get_result(task).nil?
127
- :complete
128
- elsif @@PREPROCESSING_TASKS.index(task) <
129
- @@PREPROCESSING_TASKS.index(first_preprocessing)
130
- :-
131
- elsif ignore_task?(task)
132
- :"ignore_#{why_ignore task}"
133
- else
134
- :pending
133
+ reason = why_ignore(task)
134
+ case reason
135
+ when :upstream; :-
136
+ when :execute; :pending
137
+ when :complete; :complete
138
+ else; :"ignore_#{reason}"
135
139
  end
136
140
  end
137
141
 
@@ -111,7 +111,7 @@ class MiGA::Lair < MiGA::MiGA
111
111
 
112
112
  yield(project)
113
113
  elsif Dir.exist? f
114
- each_project(f) { |project| yield(project) }
114
+ each_project(f) { |p| yield(p) }
115
115
  end
116
116
  end
117
117
  end
@@ -134,12 +134,10 @@ module MiGA::Project::Dataset
134
134
  ##
135
135
  # Are all the datasets in the project preprocessed? Save intermediate results
136
136
  # if +save+ (until the first incomplete dataset is reached).
137
- def done_preprocessing?(save = true)
138
- dataset_names.each do |dn|
139
- ds = dataset(dn)
140
- return false if ds.is_ref? and not ds.done_preprocessing?(save)
137
+ def done_preprocessing?(save = false)
138
+ !each_dataset.any? do |d|
139
+ d.ref? && d.active? && !d.done_preprocessing?(save)
141
140
  end
142
- true
143
141
  end
144
142
 
145
143
  ##